Merge branch 'for-linus' of git://git390.marist.edu/pub/scm/linux-2.6
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 4 Aug 2011 16:35:51 +0000 (06:35 -1000)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 4 Aug 2011 16:35:51 +0000 (06:35 -1000)
* 'for-linus' of git://git390.marist.edu/pub/scm/linux-2.6:
  [S390] signal: use set_restore_sigmask() helper
  [S390] smp: remove pointless comments in startup_secondary()
  [S390] qdio: Use kstrtoul_from_user
  [S390] sclp_async: Use kstrtoul_from_user
  [S390] exec: remove redundant set_fs(USER_DS)
  [S390] cpu hotplug: on cpu start wait until being marked active
  [S390] signal: convert to use set_current_blocked()
  [S390] asm offsets: fix coding style
  [S390] Add support for IBM zEnterprise 114
  [S390] dasd: check if raw track access is supported
  [S390] Use diagnose 308 for system reset
  [S390] Export store_status() function
  [S390] dasd: use vmalloc for statistics input buffer
  [S390] Add PSW restart shutdown trigger
  [S390] missing return in page_table_alloc_pgste
  [S390] qdio: 2nd stage retry on SIGA-W busy conditions

193 files changed:
Documentation/acpi/apei/einj.txt
Documentation/devicetree/bindings/gpio/gpio_keys.txt
Documentation/devicetree/bindings/input/fsl-mma8450.txt [new file with mode: 0644]
Documentation/fault-injection/fault-injection.txt
Documentation/feature-removal-schedule.txt
Documentation/frv/booting.txt
Documentation/ioctl/ioctl-number.txt
Documentation/kernel-parameters.txt
Documentation/m68k/kernel-options.txt
MAINTAINERS
arch/Kconfig
arch/alpha/Kconfig
arch/arm/kernel/process.c
arch/avr32/Kconfig
arch/cris/arch-v10/drivers/sync_serial.c
arch/cris/arch-v10/kernel/irq.c
arch/cris/include/asm/thread_info.h
arch/frv/Kconfig
arch/ia64/Kconfig
arch/m68k/Kconfig
arch/parisc/Kconfig
arch/powerpc/Kconfig
arch/s390/Kconfig
arch/sh/Kconfig
arch/sh/kernel/idle.c
arch/sparc/Kconfig
arch/tile/Kconfig
arch/tile/include/asm/Kbuild
arch/tile/include/asm/bug.h [deleted file]
arch/tile/include/asm/bugs.h [deleted file]
arch/tile/include/asm/cputime.h [deleted file]
arch/tile/include/asm/device.h [deleted file]
arch/tile/include/asm/div64.h [deleted file]
arch/tile/include/asm/emergency-restart.h [deleted file]
arch/tile/include/asm/errno.h [deleted file]
arch/tile/include/asm/fb.h [deleted file]
arch/tile/include/asm/fcntl.h [deleted file]
arch/tile/include/asm/fixmap.h
arch/tile/include/asm/ioctl.h [deleted file]
arch/tile/include/asm/ioctls.h [deleted file]
arch/tile/include/asm/ipc.h [deleted file]
arch/tile/include/asm/ipcbuf.h [deleted file]
arch/tile/include/asm/irq_regs.h [deleted file]
arch/tile/include/asm/kdebug.h [deleted file]
arch/tile/include/asm/local.h [deleted file]
arch/tile/include/asm/module.h [deleted file]
arch/tile/include/asm/msgbuf.h [deleted file]
arch/tile/include/asm/mutex.h [deleted file]
arch/tile/include/asm/param.h [deleted file]
arch/tile/include/asm/parport.h [deleted file]
arch/tile/include/asm/poll.h [deleted file]
arch/tile/include/asm/posix_types.h [deleted file]
arch/tile/include/asm/resource.h [deleted file]
arch/tile/include/asm/scatterlist.h [deleted file]
arch/tile/include/asm/sembuf.h [deleted file]
arch/tile/include/asm/serial.h [deleted file]
arch/tile/include/asm/shmbuf.h [deleted file]
arch/tile/include/asm/shmparam.h [deleted file]
arch/tile/include/asm/socket.h [deleted file]
arch/tile/include/asm/sockios.h [deleted file]
arch/tile/include/asm/statfs.h [deleted file]
arch/tile/include/asm/termbits.h [deleted file]
arch/tile/include/asm/termios.h [deleted file]
arch/tile/include/asm/types.h [deleted file]
arch/tile/include/asm/ucontext.h [deleted file]
arch/tile/include/asm/xor.h [deleted file]
arch/tile/include/hv/drv_srom_intf.h [new file with mode: 0644]
arch/tile/kernel/time.c
arch/tile/mm/init.c
arch/x86/Kconfig
arch/x86/include/asm/io.h
arch/x86/include/asm/processor.h
arch/x86/kernel/acpi/cstate.c
arch/x86/kernel/process.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/platform/mrst/Makefile
arch/x86/platform/mrst/pmu.c [new file with mode: 0644]
arch/x86/platform/mrst/pmu.h [new file with mode: 0644]
arch/x86/xen/setup.c
block/blk-core.c
block/blk-timeout.c
drivers/acpi/acpica/acglobal.h
drivers/acpi/acpica/aclocal.h
drivers/acpi/acpica/acpredef.h
drivers/acpi/acpica/nspredef.c
drivers/acpi/acpica/nsrepair2.c
drivers/acpi/acpica/tbinstal.c
drivers/acpi/apei/Kconfig
drivers/acpi/apei/apei-base.c
drivers/acpi/apei/apei-internal.h
drivers/acpi/apei/einj.c
drivers/acpi/apei/erst-dbg.c
drivers/acpi/apei/erst.c
drivers/acpi/apei/ghes.c
drivers/acpi/apei/hest.c
drivers/acpi/battery.c
drivers/acpi/bus.c
drivers/acpi/dock.c
drivers/acpi/ec_sys.c
drivers/acpi/fan.c
drivers/acpi/osl.c
drivers/acpi/pci_irq.c
drivers/acpi/pci_root.c
drivers/acpi/processor_thermal.c
drivers/acpi/sbs.c
drivers/acpi/sleep.c
drivers/acpi/sysfs.c
drivers/acpi/thermal.c
drivers/acpi/video.c
drivers/ata/libata-acpi.c
drivers/char/Kconfig
drivers/char/Makefile
drivers/char/ramoops.c
drivers/char/tile-srom.c [new file with mode: 0644]
drivers/char/tpm/tpm_tis.c
drivers/cpuidle/cpuidle.c
drivers/cpuidle/cpuidle.h
drivers/cpuidle/driver.c
drivers/cpuidle/governor.c
drivers/eisa/pci_eisa.c
drivers/firmware/efivars.c
drivers/input/keyboard/gpio_keys.c
drivers/input/keyboard/lm8323.c
drivers/input/keyboard/tegra-kbc.c
drivers/input/misc/kxtj9.c
drivers/input/misc/mma8450.c
drivers/input/mouse/hgpk.c
drivers/input/touchscreen/ad7879.c
drivers/of/base.c
drivers/of/fdt.c
drivers/pci/hotplug/acpiphp_glue.c
drivers/rtc/rtc-omap.c
drivers/target/iscsi/Kconfig
drivers/target/iscsi/iscsi_target.c
drivers/target/iscsi/iscsi_target_configfs.c
drivers/target/iscsi/iscsi_target_nego.c
drivers/target/target_core_transport.c
drivers/target/tcm_fc/tcm_fc.h
drivers/target/tcm_fc/tfc_cmd.c
drivers/target/tcm_fc/tfc_io.c
drivers/thermal/Kconfig
drivers/thermal/thermal_sys.c
drivers/video/backlight/Kconfig
drivers/video/backlight/aat2870_bl.c
fs/Kconfig
fs/dcache.c
fs/ext4/super.c
fs/stack.c
include/acpi/acpi_drivers.h
include/acpi/acpixf.h
include/acpi/apei.h
include/acpi/processor.h
include/linux/acpi.h
include/linux/bitmap.h
include/linux/cpuidle.h
include/linux/fault-inject.h
include/linux/genalloc.h
include/linux/gfp.h
include/linux/idr.h
include/linux/llist.h [new file with mode: 0644]
include/linux/memcontrol.h
include/linux/mfd/aat2870.h
include/linux/mm.h
include/linux/of.h
include/linux/of_fdt.h
include/linux/radix-tree.h
include/linux/shmem_fs.h
include/linux/swapops.h
include/linux/thermal.h
init/main.c
ipc/shm.c
kernel/kmod.c
kernel/taskstats.c
lib/Kconfig
lib/Makefile
lib/bitmap.c
lib/fault-inject.c
lib/genalloc.c
lib/idr.c
lib/llist.c [new file with mode: 0644]
lib/radix-tree.c
mm/failslab.c
mm/filemap.c
mm/memcontrol.c
mm/memory-failure.c
mm/mincore.c
mm/page_alloc.c
mm/shmem.c
mm/swapfile.c
mm/truncate.c
tools/power/x86/turbostat/turbostat.c
tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c

index dfab718..5cc699b 100644 (file)
@@ -48,12 +48,19 @@ directory apei/einj. The following files are provided.
 - param1
   This file is used to set the first error parameter value. Effect of
   parameter depends on error_type specified. For memory error, this is
-  physical memory address.
+  physical memory address.  Only available if param_extension module
+  parameter is specified.
 
 - param2
   This file is used to set the second error parameter value. Effect of
   parameter depends on error_type specified. For memory error, this is
-  physical memory address mask.
+  physical memory address mask.  Only available if param_extension
+  module parameter is specified.
+
+Support for error injection parameters is a BIOS-version-specific
+extension; that is, it only works with some BIOS versions.  If you want
+to use it, please make sure your BIOS has the proper support and
+specify "param_extension=y" as a module parameter.
 
 For more information about EINJ, please refer to ACPI specification
 version 4.0, section 17.5.
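
For context, the param_extension knob referenced above is an ordinary
module parameter.  A minimal sketch of how einj.c would declare it
(exact type and permissions are assumed here, not quoted from this diff):

    /* sketch only: declare the BIOS-extension opt-in knob */
    static int param_extension;
    module_param(param_extension, int, 0);
    MODULE_PARM_DESC(param_extension, "Enable BIOS-specific error parameter extension");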
index 7190c99..5c2c021 100644 (file)
@@ -10,7 +10,7 @@ Optional properties:
 Each button (key) is represented as a sub-node of "gpio-keys":
 Subnode properties:
 
-       - gpios: OF devcie-tree gpio specificatin.
+       - gpios: OF device-tree gpio specification.
        - label: Descriptive name of the key.
        - linux,code: Keycode to emit.
 
diff --git a/Documentation/devicetree/bindings/input/fsl-mma8450.txt b/Documentation/devicetree/bindings/input/fsl-mma8450.txt
new file mode 100644 (file)
index 0000000..a00c94c
--- /dev/null
@@ -0,0 +1,11 @@
+* Freescale MMA8450 3-Axis Accelerometer
+
+Required properties:
+- compatible : "fsl,mma8450".
+
+Example:
+
+accelerometer: mma8450@1c {
+       compatible = "fsl,mma8450";
+       reg = <0x1c>;
+};
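
A driver consuming this binding matches on the compatible string.  A
minimal sketch of the match table a consumer such as
drivers/input/misc/mma8450.c would carry (table name hypothetical, not
taken from this diff):

    #include <linux/mod_devicetable.h>

    /* sketch only: bind the driver to "fsl,mma8450" nodes */
    static const struct of_device_id mma8450_dt_ids[] = {
            { .compatible = "fsl,mma8450", },
            { /* sentinel */ }
    };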
index 7be15e4..82a5d25 100644 (file)
@@ -143,8 +143,7 @@ o provide a way to configure fault attributes
   failslab, fail_page_alloc, and fail_make_request use this way.
   Helper functions:
 
-       init_fault_attr_dentries(entries, attr, name);
-       void cleanup_fault_attr_dentries(entries);
+       fault_create_debugfs_attr(name, parent, attr);
 
 - module parameters
 
index ea0bace..43f4809 100644 (file)
@@ -296,15 +296,6 @@ Who:       Ravikiran Thirumalai <kiran@scalex86.org>
 
 ---------------------------
 
-What:  CONFIG_THERMAL_HWMON
-When:  January 2009
-Why:   This option was introduced just to allow older lm-sensors userspace
-       to keep working over the upgrade to 2.6.26. At the scheduled time of
-       removal fixed lm-sensors (2.x or 3.x) should be readily available.
-Who:   Rene Herman <rene.herman@gmail.com>
-
----------------------------
-
 What:  Code that is now under CONFIG_WIRELESS_EXT_SYSFS
        (in net/core/net-sysfs.c)
 When:  After the only user (hal) has seen a release with the patches
index ace200b..37c4d84 100644 (file)
@@ -106,13 +106,20 @@ separated by spaces:
       To use the first on-chip serial port at baud rate 115200, no parity, 8
       bits, and no flow control.
 
-  (*) root=/dev/<xxxx>
+  (*) root=<xxxx>
 
-      This specifies the device upon which the root filesystem resides. For
-      example:
+      This specifies the device upon which the root filesystem resides. It
+      may be specified by major and minor number, device path, or even
+      partition uuid, if supported.  For example:
 
        /dev/nfs        NFS root filesystem
        /dev/mtdblock3  Fourth RedBoot partition on the System Flash
+       PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF/PARTNROFF=1
+               first partition after the partition with the given UUID
+       253:0           Device with major 253 and minor 0
+
+      Authoritative information can be found in
+      "Documentation/kernel-parameters.txt".
 
   (*) rw
 
index 72ba8d5..845a191 100644 (file)
@@ -292,6 +292,7 @@ Code  Seq#(hex)     Include File            Comments
                                        <mailto:buk@buks.ipn.de>
 0xA0   all     linux/sdp/sdp.h         Industrial Device Project
                                        <mailto:kenji@bitgate.com>
+0xA2   00-0F   arch/tile/include/asm/hardwall.h
 0xA3   80-8F   Port ACL                in development:
                                        <mailto:tlewis@mindspring.com>
 0xA3   90-9F   linux/dtlk.h
index 26a8374..e279b72 100644 (file)
@@ -163,6 +163,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
                        See also Documentation/power/pm.txt, pci=noacpi
 
+       acpi_rsdp=      [ACPI,EFI,KEXEC]
+                       Pass the RSDP address to the kernel, mostly used
+                       on machines running EFI runtime service to boot the
+                       second kernel for kdump.
+
        acpi_apic_instance=     [ACPI, IOAPIC]
                        Format: <int>
                        2: use 2nd APIC table, if available
@@ -546,6 +551,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        /proc/<pid>/coredump_filter.
                        See also Documentation/filesystems/proc.txt.
 
+       cpuidle.off=1   [CPU_IDLE]
+                       disable the cpuidle sub-system
+
        cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver
                        Format:
                        <first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
@@ -2240,6 +2248,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
        ro              [KNL] Mount root device read-only on boot
 
        root=           [KNL] Root filesystem
+                       See name_to_dev_t comment in init/do_mounts.c.
 
        rootdelay=      [KNL] Delay (in seconds) to pause before attempting to
                        mount the root filesystem
index c93bed6..97d45f2 100644 (file)
@@ -129,6 +129,20 @@ decimal 11 is the major of SCSI CD-ROMs, and the minor 0 stands for
 the first of these. You can find out all valid major numbers by
 looking into include/linux/major.h.
 
+In addition to major and minor numbers, if the device containing your
+root partition uses a partition table format with unique partition
+identifiers, then you may use them.  For instance,
+"root=PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF".  It is also
+possible to reference another partition on the same device using a
+known partition UUID as the starting point.  For example,
+if partition 5 of the device has the UUID of
+00112233-4455-6677-8899-AABBCCDDEEFF then partition 3 may be found as
+follows:
+  PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF/PARTNROFF=-2
+
+Authoritative information can be found in
+"Documentation/kernel-parameters.txt".
+
 
 2.2) ro, rw
 -----------
index 0d2fcda..07cfd8d 100644 (file)
@@ -3367,6 +3367,12 @@ F:       drivers/net/ixgb/
 F:     drivers/net/ixgbe/
 F:     drivers/net/ixgbevf/
 
+INTEL MRST PMU DRIVER
+M:     Len Brown <len.brown@intel.com>
+L:     linux-pm@lists.linux-foundation.org
+S:     Supported
+F:     arch/x86/platform/mrst/pmu.*
+
 INTEL PRO/WIRELESS 2100 NETWORK CONNECTION SUPPORT
 L:     linux-wireless@vger.kernel.org
 S:     Orphan
@@ -6319,6 +6325,7 @@ F:        include/linux/sysv_fs.h
 TARGET SUBSYSTEM
 M:     Nicholas A. Bellinger <nab@linux-iscsi.org>
 L:     linux-scsi@vger.kernel.org
+L:     target-devel@vger.kernel.org
 L:     http://groups.google.com/group/linux-iscsi-target-dev
 W:     http://www.linux-iscsi.org
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/nab/lio-core-2.6.git master
index 26b0e23..4b0669c 100644 (file)
@@ -178,4 +178,7 @@ config HAVE_ARCH_MUTEX_CPU_RELAX
 config HAVE_RCU_TABLE_FREE
        bool
 
+config ARCH_HAVE_NMI_SAFE_CMPXCHG
+       bool
+
 source "kernel/gcov/Kconfig"
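
This symbol lets lock-less code, such as the new lib/llist.c added by
this merge, assert that cmpxchg() is usable from NMI context.  A
minimal sketch of the cmpxchg-based push that such code builds on
(simplified pattern, not copied from llist.c):

    struct sketch_node { struct sketch_node *next; };

    /* push new onto a lock-less singly linked list */
    static void sketch_push(struct sketch_node **head, struct sketch_node *new)
    {
            struct sketch_node *first;

            do {
                    new->next = first = ACCESS_ONCE(*head);
            } while (cmpxchg(head, first, new) != first);
    }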
index ca2da8d..60cde53 100644 (file)
@@ -14,6 +14,7 @@ config ALPHA
        select AUTO_IRQ_AFFINITY if SMP
        select GENERIC_IRQ_SHOW
        select ARCH_WANT_OPTIONAL_GPIOLIB
+       select ARCH_HAVE_NMI_SAFE_CMPXCHG
        help
          The Alpha is a 64-bit general-purpose processor designed and
          marketed by the Digital Equipment Corporation of blessed memory,
index 5e1e541..d7ee0d4 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/uaccess.h>
 #include <linux/random.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/cpuidle.h>
 
 #include <asm/cacheflush.h>
 #include <asm/leds.h>
@@ -196,7 +197,8 @@ void cpu_idle(void)
                                cpu_relax();
                        } else {
                                stop_critical_timings();
-                               pm_idle();
+                               if (cpuidle_idle_call())
+                                       pm_idle();
                                start_critical_timings();
                                /*
                                 * This will eventually be removed - pm_idle
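
The fallback convention assumed by this pattern: cpuidle_idle_call()
returns 0 when a registered cpuidle driver handled the idle entry, and
an error when cpuidle is disabled or absent, in which case the
architecture falls back to pm_idle().  A minimal sketch of the stub
side (per include/linux/cpuidle.h in this series):

    #ifdef CONFIG_CPU_IDLE
    extern int cpuidle_idle_call(void);
    #else
    /* no cpuidle: always fall back to the arch pm_idle() */
    static inline int cpuidle_idle_call(void) { return -ENODEV; }
    #endif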
index e9d689b..197e96f 100644 (file)
@@ -10,6 +10,7 @@ config AVR32
        select GENERIC_IRQ_PROBE
        select HARDIRQS_SW_RESEND
        select GENERIC_IRQ_SHOW
+       select ARCH_HAVE_NMI_SAFE_CMPXCHG
        help
          AVR32 is a high-performance 32-bit RISC microprocessor core,
          designed for cost-sensitive embedded applications, with particular
index 8502653..466af40 100644 (file)
@@ -158,7 +158,7 @@ static int sync_serial_open(struct inode *inode, struct file *file);
 static int sync_serial_release(struct inode *inode, struct file *file);
 static unsigned int sync_serial_poll(struct file *filp, poll_table *wait);
 
-static int sync_serial_ioctl(struct file *file,
+static long sync_serial_ioctl(struct file *file,
        unsigned int cmd, unsigned long arg);
 static ssize_t sync_serial_write(struct file *file, const char *buf,
        size_t count, loff_t *ppos);
@@ -625,11 +625,11 @@ static int sync_serial_open(struct inode *inode, struct file *file)
                        *R_IRQ_MASK1_SET = 1 << port->data_avail_bit;
                DEBUG(printk(KERN_DEBUG "sser%d rec started\n", dev));
        }
-       ret = 0;
+       err = 0;
        
 out:
        mutex_unlock(&sync_serial_mutex);
-       return ret;
+       return err;
 }
 
 static int sync_serial_release(struct inode *inode, struct file *file)
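
The int-to-long return type change matches the prototype of the
->unlocked_ioctl hook this handler is wired to.  A minimal sketch of
that wiring (fops excerpt only; assumed rather than shown in this hunk):

    static const struct file_operations sync_serial_fops_sketch = {
            /* prototype: long (*)(struct file *, unsigned int, unsigned long) */
            .unlocked_ioctl = sync_serial_ioctl,
    };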
index 907cfb5..ba0e596 100644 (file)
@@ -20,6 +20,9 @@
 #define crisv10_mask_irq(irq_nr) (*R_VECT_MASK_CLR = 1 << (irq_nr));
 #define crisv10_unmask_irq(irq_nr) (*R_VECT_MASK_SET = 1 << (irq_nr));
 
+extern void kgdb_init(void);
+extern void breakpoint(void);
+
 /* don't use set_int_vector, it bypasses the linux interrupt handlers. it is
  * global just so that the kernel gdb can use it.
  */
index 29b74a1..332f19c 100644 (file)
@@ -11,8 +11,6 @@
 
 #ifdef __KERNEL__
 
-#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
-
 #ifndef __ASSEMBLY__
 #include <asm/types.h>
 #include <asm/processor.h>
@@ -67,8 +65,10 @@ struct thread_info {
 
 #define init_thread_info       (init_thread_union.thread_info)
 
+#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
 /* thread information allocation */
-#define alloc_thread_info(tsk, node) ((struct thread_info *) __get_free_pages(GFP_KERNEL,1))
+#define alloc_thread_info_node(tsk, node)      \
+       ((struct thread_info *) __get_free_pages(GFP_KERNEL, 1))
 #define free_thread_info(ti) free_pages((unsigned long) (ti), 1)
 
 #endif /* !__ASSEMBLY__ */
index cb884e4..bad27a6 100644 (file)
@@ -7,6 +7,7 @@ config FRV
        select HAVE_PERF_EVENTS
        select HAVE_GENERIC_HARDIRQS
        select GENERIC_IRQ_SHOW
+       select ARCH_HAVE_NMI_SAFE_CMPXCHG
 
 config ZONE_DMA
        bool
index 64c7ab7..1248547 100644 (file)
@@ -28,6 +28,7 @@ config IA64
        select IRQ_PER_CPU
        select GENERIC_IRQ_SHOW
        select ARCH_WANT_OPTIONAL_GPIOLIB
+       select ARCH_HAVE_NMI_SAFE_CMPXCHG
        default y
        help
          The Itanium Processor Family is Intel's 64-bit successor to
index 284cd37..9e8ee9d 100644 (file)
@@ -6,6 +6,7 @@ config M68K
        select GENERIC_ATOMIC64 if MMU
        select HAVE_GENERIC_HARDIRQS if !MMU
        select GENERIC_IRQ_SHOW if !MMU
+       select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS
 
 config RWSEM_GENERIC_SPINLOCK
        bool
index 65adc86..e077b0b 100644 (file)
@@ -15,6 +15,7 @@ config PARISC
        select HAVE_GENERIC_HARDIRQS
        select GENERIC_IRQ_PROBE
        select IRQ_PER_CPU
+       select ARCH_HAVE_NMI_SAFE_CMPXCHG
 
        help
          The PA-RISC microprocessor is designed by Hewlett-Packard and used
index 374c475..6926b61 100644 (file)
@@ -136,6 +136,7 @@ config PPC
        select HAVE_SYSCALL_TRACEPOINTS
        select HAVE_BPF_JIT if (PPC64 && NET)
        select HAVE_ARCH_JUMP_LABEL
+       select ARCH_HAVE_NMI_SAFE_CMPXCHG
 
 config EARLY_PRINTK
        bool
index 17dbb43..ed5cb5a 100644 (file)
@@ -81,6 +81,7 @@ config S390
        select INIT_ALL_POSSIBLE
        select HAVE_IRQ_WORK
        select HAVE_PERF_EVENTS
+       select ARCH_HAVE_NMI_SAFE_CMPXCHG
        select HAVE_KERNEL_GZIP
        select HAVE_KERNEL_BZIP2
        select HAVE_KERNEL_LZMA
index 748ff19..ff9177c 100644 (file)
@@ -11,6 +11,7 @@ config SUPERH
        select HAVE_DMA_ATTRS
        select HAVE_IRQ_WORK
        select HAVE_PERF_EVENTS
+       select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A)
        select PERF_USE_VMALLOC
        select HAVE_KERNEL_GZIP
        select HAVE_KERNEL_BZIP2
index 84db0d6..3c45de1 100644 (file)
 #include <linux/thread_info.h>
 #include <linux/irqflags.h>
 #include <linux/smp.h>
+#include <linux/cpuidle.h>
 #include <asm/pgalloc.h>
 #include <asm/system.h>
 #include <linux/atomic.h>
 #include <asm/smp.h>
 
-void (*pm_idle)(void) = NULL;
+static void (*pm_idle)(void);
 
 static int hlt_counter;
 
@@ -100,7 +101,8 @@ void cpu_idle(void)
                        local_irq_disable();
                        /* Don't trace irqs off for idle */
                        stop_critical_timings();
-                       pm_idle();
+                       if (cpuidle_idle_call())
+                               pm_idle();
                        /*
                         * Sanity check to ensure that pm_idle() returns
                         * with IRQs enabled
index 1074ddd..42c67be 100644 (file)
@@ -54,6 +54,7 @@ config SPARC64
        select HAVE_PERF_EVENTS
        select PERF_USE_VMALLOC
        select IRQ_PREFLOW_FASTEOI
+       select ARCH_HAVE_NMI_SAFE_CMPXCHG
 
 config ARCH_DEFCONFIG
        string
index 0249b8b..b30f71a 100644 (file)
@@ -12,6 +12,7 @@ config TILE
        select GENERIC_PENDING_IRQ if SMP
        select GENERIC_IRQ_SHOW
        select SYS_HYPERVISOR
+       select ARCH_HAVE_NMI_SAFE_CMPXCHG if !M386
 
 # FIXME: investigate whether we need/want these options.
 #      select HAVE_IOREMAP_PROT
index 849ab2f..aec60dc 100644 (file)
@@ -2,3 +2,41 @@ include include/asm-generic/Kbuild.asm
 
 header-y += ucontext.h
 header-y += hardwall.h
+
+generic-y += bug.h
+generic-y += bugs.h
+generic-y += cputime.h
+generic-y += device.h
+generic-y += div64.h
+generic-y += emergency-restart.h
+generic-y += errno.h
+generic-y += fb.h
+generic-y += fcntl.h
+generic-y += ioctl.h
+generic-y += ioctls.h
+generic-y += ipc.h
+generic-y += ipcbuf.h
+generic-y += irq_regs.h
+generic-y += kdebug.h
+generic-y += local.h
+generic-y += module.h
+generic-y += msgbuf.h
+generic-y += mutex.h
+generic-y += param.h
+generic-y += parport.h
+generic-y += poll.h
+generic-y += posix_types.h
+generic-y += resource.h
+generic-y += scatterlist.h
+generic-y += sembuf.h
+generic-y += serial.h
+generic-y += shmbuf.h
+generic-y += shmparam.h
+generic-y += socket.h
+generic-y += sockios.h
+generic-y += statfs.h
+generic-y += termbits.h
+generic-y += termios.h
+generic-y += types.h
+generic-y += ucontext.h
+generic-y += xor.h
diff --git a/arch/tile/include/asm/bug.h b/arch/tile/include/asm/bug.h
deleted file mode 100644 (file)
index b12fd89..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/bug.h>
diff --git a/arch/tile/include/asm/bugs.h b/arch/tile/include/asm/bugs.h
deleted file mode 100644 (file)
index 61791e1..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/bugs.h>
diff --git a/arch/tile/include/asm/cputime.h b/arch/tile/include/asm/cputime.h
deleted file mode 100644 (file)
index 6d68ad7..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/cputime.h>
diff --git a/arch/tile/include/asm/device.h b/arch/tile/include/asm/device.h
deleted file mode 100644 (file)
index f0a4c25..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/device.h>
diff --git a/arch/tile/include/asm/div64.h b/arch/tile/include/asm/div64.h
deleted file mode 100644 (file)
index 6cd978c..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/div64.h>
diff --git a/arch/tile/include/asm/emergency-restart.h b/arch/tile/include/asm/emergency-restart.h
deleted file mode 100644 (file)
index 3711bd9..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/emergency-restart.h>
diff --git a/arch/tile/include/asm/errno.h b/arch/tile/include/asm/errno.h
deleted file mode 100644 (file)
index 4c82b50..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/errno.h>
diff --git a/arch/tile/include/asm/fb.h b/arch/tile/include/asm/fb.h
deleted file mode 100644 (file)
index 3a4988e..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/fb.h>
diff --git a/arch/tile/include/asm/fcntl.h b/arch/tile/include/asm/fcntl.h
deleted file mode 100644 (file)
index 46ab12d..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/fcntl.h>
index 51537ff..c66f793 100644 (file)
@@ -75,12 +75,6 @@ extern void __set_fixmap(enum fixed_addresses idx,
 
 #define set_fixmap(idx, phys) \
                __set_fixmap(idx, phys, PAGE_KERNEL)
-/*
- * Some hardware wants to get fixmapped without caching.
- */
-#define set_fixmap_nocache(idx, phys) \
-               __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
-
 #define clear_fixmap(idx) \
                __set_fixmap(idx, 0, __pgprot(0))
 
diff --git a/arch/tile/include/asm/ioctl.h b/arch/tile/include/asm/ioctl.h
deleted file mode 100644 (file)
index b279fe0..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ioctl.h>
diff --git a/arch/tile/include/asm/ioctls.h b/arch/tile/include/asm/ioctls.h
deleted file mode 100644 (file)
index ec34c76..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ioctls.h>
diff --git a/arch/tile/include/asm/ipc.h b/arch/tile/include/asm/ipc.h
deleted file mode 100644 (file)
index a46e3d9..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ipc.h>
diff --git a/arch/tile/include/asm/ipcbuf.h b/arch/tile/include/asm/ipcbuf.h
deleted file mode 100644 (file)
index 84c7e51..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ipcbuf.h>
diff --git a/arch/tile/include/asm/irq_regs.h b/arch/tile/include/asm/irq_regs.h
deleted file mode 100644 (file)
index 3dd9c0b..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/irq_regs.h>
diff --git a/arch/tile/include/asm/kdebug.h b/arch/tile/include/asm/kdebug.h
deleted file mode 100644 (file)
index 6ece1b0..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/kdebug.h>
diff --git a/arch/tile/include/asm/local.h b/arch/tile/include/asm/local.h
deleted file mode 100644 (file)
index c11c530..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/local.h>
diff --git a/arch/tile/include/asm/module.h b/arch/tile/include/asm/module.h
deleted file mode 100644 (file)
index 1e4b79f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/module.h>
diff --git a/arch/tile/include/asm/msgbuf.h b/arch/tile/include/asm/msgbuf.h
deleted file mode 100644 (file)
index 809134c..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/msgbuf.h>
diff --git a/arch/tile/include/asm/mutex.h b/arch/tile/include/asm/mutex.h
deleted file mode 100644 (file)
index ff6101a..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/mutex-dec.h>
diff --git a/arch/tile/include/asm/param.h b/arch/tile/include/asm/param.h
deleted file mode 100644 (file)
index 965d454..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/param.h>
diff --git a/arch/tile/include/asm/parport.h b/arch/tile/include/asm/parport.h
deleted file mode 100644 (file)
index cf252af..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/parport.h>
diff --git a/arch/tile/include/asm/poll.h b/arch/tile/include/asm/poll.h
deleted file mode 100644 (file)
index c98509d..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/poll.h>
diff --git a/arch/tile/include/asm/posix_types.h b/arch/tile/include/asm/posix_types.h
deleted file mode 100644 (file)
index 22cae62..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/posix_types.h>
diff --git a/arch/tile/include/asm/resource.h b/arch/tile/include/asm/resource.h
deleted file mode 100644 (file)
index 04bc4db..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/resource.h>
diff --git a/arch/tile/include/asm/scatterlist.h b/arch/tile/include/asm/scatterlist.h
deleted file mode 100644 (file)
index 35d786f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/scatterlist.h>
diff --git a/arch/tile/include/asm/sembuf.h b/arch/tile/include/asm/sembuf.h
deleted file mode 100644 (file)
index 7673b83..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/sembuf.h>
diff --git a/arch/tile/include/asm/serial.h b/arch/tile/include/asm/serial.h
deleted file mode 100644 (file)
index a0cb0ca..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/serial.h>
diff --git a/arch/tile/include/asm/shmbuf.h b/arch/tile/include/asm/shmbuf.h
deleted file mode 100644 (file)
index 83c05fc..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/shmbuf.h>
diff --git a/arch/tile/include/asm/shmparam.h b/arch/tile/include/asm/shmparam.h
deleted file mode 100644 (file)
index 93f30de..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/shmparam.h>
diff --git a/arch/tile/include/asm/socket.h b/arch/tile/include/asm/socket.h
deleted file mode 100644 (file)
index 6b71384..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/socket.h>
diff --git a/arch/tile/include/asm/sockios.h b/arch/tile/include/asm/sockios.h
deleted file mode 100644 (file)
index def6d47..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/sockios.h>
diff --git a/arch/tile/include/asm/statfs.h b/arch/tile/include/asm/statfs.h
deleted file mode 100644 (file)
index 0b91fe1..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/statfs.h>
diff --git a/arch/tile/include/asm/termbits.h b/arch/tile/include/asm/termbits.h
deleted file mode 100644 (file)
index 3935b10..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/termbits.h>
diff --git a/arch/tile/include/asm/termios.h b/arch/tile/include/asm/termios.h
deleted file mode 100644 (file)
index 280d78a..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/termios.h>
diff --git a/arch/tile/include/asm/types.h b/arch/tile/include/asm/types.h
deleted file mode 100644 (file)
index b9e79bc..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/types.h>
diff --git a/arch/tile/include/asm/ucontext.h b/arch/tile/include/asm/ucontext.h
deleted file mode 100644 (file)
index 9bc07b9..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ucontext.h>
diff --git a/arch/tile/include/asm/xor.h b/arch/tile/include/asm/xor.h
deleted file mode 100644 (file)
index c82eb12..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/xor.h>
diff --git a/arch/tile/include/hv/drv_srom_intf.h b/arch/tile/include/hv/drv_srom_intf.h
new file mode 100644 (file)
index 0000000..6395faa
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * @file drv_srom_intf.h
+ * Interface definitions for the SPI Flash ROM driver.
+ */
+
+#ifndef _SYS_HV_INCLUDE_DRV_SROM_INTF_H
+#define _SYS_HV_INCLUDE_DRV_SROM_INTF_H
+
+/** Read this offset to get the total device size. */
+#define SROM_TOTAL_SIZE_OFF   0xF0000000
+
+/** Read this offset to get the device sector size. */
+#define SROM_SECTOR_SIZE_OFF  0xF0000004
+
+/** Read this offset to get the device page size. */
+#define SROM_PAGE_SIZE_OFF    0xF0000008
+
+/** Write this offset to flush any pending writes. */
+#define SROM_FLUSH_OFF        0xF1000000
+
+/** Write this offset, plus the byte offset of the start of a sector, to
+ *  erase a sector.  Any write data is ignored, but there must be at least
+ *  one byte of write data.  Only applies when the driver is in MTD mode.
+ */
+#define SROM_ERASE_OFF        0xF2000000
+
+#endif /* _SYS_HV_INCLUDE_DRV_SROM_INTF_H */
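
These pseudo-offsets are consumed through the hypervisor device API.  A
hypothetical sketch of querying the device size (the hv_dev_pread()
call and the open device handle are assumptions, not part of this
header):

    /* hypothetical: read 4 bytes at the total-size pseudo-offset */
    static u32 srom_read_total_size(int hv_devhdl)
    {
            u32 size = 0;

            hv_dev_pread(hv_devhdl, 0, (HV_VirtAddr)&size, sizeof(size),
                         SROM_TOTAL_SIZE_OFF);
            return size;
    }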
index c4be58c..f6f50f2 100644 (file)
@@ -78,7 +78,6 @@ static struct clocksource cycle_counter_cs = {
        .rating = 300,
        .read = clocksource_get_cycles,
        .mask = CLOCKSOURCE_MASK(64),
-       .shift = 22,   /* typical value, e.g. x86 tsc uses this */
        .flags = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
@@ -91,8 +90,6 @@ void __init setup_clock(void)
        cycles_per_sec = hv_sysconf(HV_SYSCONF_CPU_SPEED);
        sched_clock_mult =
                clocksource_hz2mult(cycles_per_sec, SCHED_CLOCK_SHIFT);
-       cycle_counter_cs.mult =
-               clocksource_hz2mult(cycles_per_sec, cycle_counter_cs.shift);
 }
 
 void __init calibrate_delay(void)
@@ -107,7 +104,7 @@ void __init calibrate_delay(void)
 void __init time_init(void)
 {
        /* Initialize and register the clock source. */
-       clocksource_register(&cycle_counter_cs);
+       clocksource_register_hz(&cycle_counter_cs, cycles_per_sec);
 
        /* Start up the tile-timer interrupt source on the boot cpu. */
        setup_tile_timer();
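
clocksource_register_hz() lets the timekeeping core derive mult/shift
from the given frequency, replacing the hard-coded shift of 22 removed
above.  A minimal sketch of the helper's shape (it wraps the
scale-based registration in the clocksource API of this era):

    static inline int sketch_register_hz(struct clocksource *cs, u32 hz)
    {
            /* scale = 1: freq is specified in Hz */
            return __clocksource_register_scale(cs, 1, hz);
    }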
index 4e10c40..7309988 100644 (file)
@@ -836,8 +836,7 @@ void __init mem_init(void)
 #endif
 
 #ifdef CONFIG_FLATMEM
-       if (!mem_map)
-               BUG();
+       BUG_ON(!mem_map);
 #endif
 
 #ifdef CONFIG_HIGHMEM
index 7cf916f..6a47bb2 100644 (file)
@@ -72,6 +72,7 @@ config X86
        select USE_GENERIC_SMP_HELPERS if SMP
        select HAVE_BPF_JIT if (X86_64 && NET)
        select CLKEVT_I8253
+       select ARCH_HAVE_NMI_SAFE_CMPXCHG
 
 config INSTRUCTION_DECODER
        def_bool (KPROBES || PERF_EVENTS)
index d02804d..d8e8eef 100644 (file)
@@ -40,8 +40,6 @@
 #include <linux/compiler.h>
 #include <asm/page.h>
 
-#include <xen/xen.h>
-
 #define build_mmio_read(name, size, type, reg, barrier) \
 static inline type name(const volatile void __iomem *addr) \
 { type ret; asm volatile("mov" size " %1,%0":reg (ret) \
@@ -334,6 +332,7 @@ extern void fixup_early_ioremap(void);
 extern bool is_early_ioremap_ptep(pte_t *ptep);
 
 #ifdef CONFIG_XEN
+#include <xen/xen.h>
 struct bio_vec;
 
 extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
index 2193715..0d1171c 100644 (file)
@@ -751,8 +751,6 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
                     :: "a" (eax), "c" (ecx));
 }
 
-extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
-
 extern void select_idle_routine(const struct cpuinfo_x86 *c);
 extern void init_amd_e400_c1e_mask(void);
 
index 5812404..f50e7fb 100644 (file)
@@ -149,6 +149,29 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
 }
 EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
 
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ */
+void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
+{
+       if (!need_resched()) {
+               if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
+                       clflush((void *)&current_thread_info()->flags);
+
+               __monitor((void *)&current_thread_info()->flags, 0, 0);
+               smp_mb();
+               if (!need_resched())
+                       __mwait(ax, cx);
+       }
+}
+
 void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
 {
        unsigned int cpu = smp_processor_id();
index e1ba8cb..e7e3b01 100644 (file)
@@ -438,29 +438,6 @@ void cpu_idle_wait(void)
 }
 EXPORT_SYMBOL_GPL(cpu_idle_wait);
 
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
-{
-       if (!need_resched()) {
-               if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
-                       clflush((void *)&current_thread_info()->flags);
-
-               __monitor((void *)&current_thread_info()->flags, 0, 0);
-               smp_mb();
-               if (!need_resched())
-                       __mwait(ax, cx);
-       }
-}
-
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 static void mwait_idle(void)
 {
index a3d0dc5..7a3b651 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/kdebug.h>
+#include <linux/cpuidle.h>
 
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -109,7 +110,8 @@ void cpu_idle(void)
                        local_irq_disable();
                        /* Don't trace irqs off for idle */
                        stop_critical_timings();
-                       pm_idle();
+                       if (cpuidle_idle_call())
+                               pm_idle();
                        start_critical_timings();
                }
                tick_nohz_restart_sched_tick();
index ca6f7ab..f693e44 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/ftrace.h>
+#include <linux/cpuidle.h>
 
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -136,7 +137,8 @@ void cpu_idle(void)
                        enter_idle();
                        /* Don't trace irqs off for idle */
                        stop_critical_timings();
-                       pm_idle();
+                       if (cpuidle_idle_call())
+                               pm_idle();
                        start_critical_timings();
 
                        /* In many cases the interrupt that ended idle
index f61ccdd..1ea3877 100644 (file)
@@ -1,3 +1,4 @@
 obj-$(CONFIG_X86_MRST)         += mrst.o
 obj-$(CONFIG_X86_MRST)         += vrtc.o
 obj-$(CONFIG_EARLY_PRINTK_MRST)        += early_printk_mrst.o
+obj-$(CONFIG_X86_MRST)         += pmu.o
diff --git a/arch/x86/platform/mrst/pmu.c b/arch/x86/platform/mrst/pmu.c
new file mode 100644 (file)
index 0000000..9281da7
--- /dev/null
@@ -0,0 +1,817 @@
+/*
+ * mrst/pmu.c - driver for MRST Power Management Unit
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/cpuidle.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/seq_file.h>
+#include <linux/sfi.h>
+#include <asm/intel_scu_ipc.h>
+#include "pmu.h"
+
+#define IPCMSG_FW_REVISION     0xF4
+
+struct mrst_device {
+       u16 pci_dev_num;        /* DEBUG only */
+       u16 lss;
+       u16 latest_request;
+       unsigned int pci_state_counts[PCI_D3cold + 1]; /* DEBUG only */
+};
+
+/*
+ * complete list of MRST PCI devices
+ */
+static struct mrst_device mrst_devs[] = {
+/*  0 */ { 0x0800, LSS_SPI0 },         /* Moorestown SPI Ctrl 0 */
+/*  1 */ { 0x0801, LSS_SPI1 },         /* Moorestown SPI Ctrl 1 */
+/*  2 */ { 0x0802, LSS_I2C0 },         /* Moorestown I2C 0 */
+/*  3 */ { 0x0803, LSS_I2C1 },         /* Moorestown I2C 1 */
+/*  4 */ { 0x0804, LSS_I2C2 },         /* Moorestown I2C 2 */
+/*  5 */ { 0x0805, LSS_KBD },          /* Moorestown Keyboard Ctrl */
+/*  6 */ { 0x0806, LSS_USB_HC },       /* Moorestown USB Ctrl */
+/*  7 */ { 0x0807, LSS_SD_HC0 },       /* Moorestown SD Host Ctrl 0 */
+/*  8 */ { 0x0808, LSS_SD_HC1 },       /* Moorestown SD Host Ctrl 1 */
+/*  9 */ { 0x0809, LSS_NAND },         /* Moorestown NAND Ctrl */
+/* 10 */ { 0x080a, LSS_AUDIO },                /* Moorestown Audio Ctrl */
+/* 11 */ { 0x080b, LSS_IMAGING },      /* Moorestown ISP */
+/* 12 */ { 0x080c, LSS_SECURITY },     /* Moorestown Security Controller */
+/* 13 */ { 0x080d, LSS_DISPLAY },      /* Moorestown External Displays */
+/* 14 */ { 0x080e, 0 },                        /* Moorestown SCU IPC */
+/* 15 */ { 0x080f, LSS_GPIO },         /* Moorestown GPIO Controller */
+/* 16 */ { 0x0810, 0 },                        /* Moorestown Power Management Unit */
+/* 17 */ { 0x0811, LSS_USB_OTG },      /* Moorestown OTG Ctrl */
+/* 18 */ { 0x0812, LSS_SPI2 },         /* Moorestown SPI Ctrl 2 */
+/* 19 */ { 0x0813, 0 },                        /* Moorestown SC DMA */
+/* 20 */ { 0x0814, LSS_AUDIO_LPE },    /* Moorestown LPE DMA */
+/* 21 */ { 0x0815, LSS_AUDIO_SSP },    /* Moorestown SSP0 */
+
+/* 22 */ { 0x084F, LSS_SD_HC2 },       /* Moorestown SD Host Ctrl 2 */
+
+/* 23 */ { 0x4102, 0 },                        /* Lincroft */
+/* 24 */ { 0x4110, 0 },                        /* Lincroft */
+};
+
+/* n.b. We ignore PCI-id 0x815 in LSS9 b/c MeeGo has no driver for it */
+static u16 mrst_lss9_pci_ids[] = {0x080a, 0x0814, 0};
+static u16 mrst_lss10_pci_ids[] = {0x0800, 0x0801, 0x0802, 0x0803,
+                                       0x0804, 0x0805, 0x080f, 0};
+
+/* handle concurrent SMP invocations of pmu_pci_set_power_state() */
+static spinlock_t mrst_pmu_power_state_lock;
+
+static unsigned int wake_counters[MRST_NUM_LSS];       /* DEBUG only */
+static unsigned int pmu_irq_stats[INT_INVALID + 1];    /* DEBUG only */
+
+static int graphics_is_off;
+static int lss_s0i3_enabled;
+static bool mrst_pmu_s0i3_enable;
+
+/*  debug counters */
+static u32 pmu_wait_ready_calls;
+static u32 pmu_wait_ready_udelays;
+static u32 pmu_wait_ready_udelays_max;
+static u32 pmu_wait_done_calls;
+static u32 pmu_wait_done_udelays;
+static u32 pmu_wait_done_udelays_max;
+static u32 pmu_set_power_state_entry;
+static u32 pmu_set_power_state_send_cmd;
+
+static struct mrst_device *pci_id_2_mrst_dev(u16 pci_dev_num)
+{
+       int index = 0;
+
+       if ((pci_dev_num >= 0x0800) && (pci_dev_num <= 0x815))
+               index = pci_dev_num - 0x800;
+       else if (pci_dev_num == 0x084F)
+               index = 22;
+       else if (pci_dev_num == 0x4102)
+               index = 23;
+       else if (pci_dev_num == 0x4110)
+               index = 24;
+
+       if (pci_dev_num != mrst_devs[index].pci_dev_num) {
+               WARN_ONCE(1, FW_BUG "Unknown PCI device 0x%04X\n", pci_dev_num);
+               return NULL;
+       }
+
+       return &mrst_devs[index];
+}
+
+/**
+ * mrst_pmu_invalid_cstates - states the governor must not pick
+ *
+ * Certain states are not appropriate for the governor to pick in
+ * some cases.  This function is called as the cpuidle device's
+ * prepare callback, and thus tells the governor to ignore such
+ * states when it is selecting the next idle state to enter for
+ * this CPU.
+ */
+
+#define IDLE_STATE4_IS_C6      4
+#define IDLE_STATE5_IS_S0I3    5
+
+int mrst_pmu_invalid_cstates(void)
+{
+       int cpu = smp_processor_id();
+
+       /*
+        * Demote to C4 if the PMU is busy.
+        * Since LSS changes leave the busy bit clear...
+        * busy means either the PMU is waiting for an ACK-C6 that
+        * isn't coming due to an MWAIT that returned immediately;
+        * or we returned from S0i3 successfully, and the PMU
+        * is not done sending us interrupts.
+        */
+       if (pmu_read_busy_status())
+               return 1 << IDLE_STATE4_IS_C6 | 1 << IDLE_STATE5_IS_S0I3;
+
+       /*
+        * Disallow S0i3 if: PMU is not initialized, or CPU1 is active,
+        * or if device LSS is insufficient, or the GPU is active,
+        * or if it has been explicitly disabled.
+        */
+       if (!pmu_reg || !cpumask_equal(cpu_online_mask, cpumask_of(cpu)) ||
+           !lss_s0i3_enabled || !graphics_is_off || !mrst_pmu_s0i3_enable)
+               return 1 << IDLE_STATE5_IS_S0I3;
+       else
+               return 0;
+}
+
+/*
+ * pmu_update_wake_counters(): read PM_WKS, update wake_counters[]
+ * DEBUG only.
+ */
+static void pmu_update_wake_counters(void)
+{
+       int lss;
+       u32 wake_status;
+
+       wake_status = pmu_read_wks();
+
+       for (lss = 0; lss < MRST_NUM_LSS; ++lss) {
+               if (wake_status & (1 << lss))
+                       wake_counters[lss]++;
+       }
+}
+
+int mrst_pmu_s0i3_entry(void)
+{
+       int status;
+
+       /* Clear any possible error conditions */
+       pmu_write_ics(0x300);
+
+       /* set wake control to current D-states */
+       pmu_write_wssc(S0I3_SSS_TARGET);
+
+       status = mrst_s0i3_entry(PM_S0I3_COMMAND, &pmu_reg->pm_cmd);
+       pmu_update_wake_counters();
+       return status;
+}
+
+/* poll for maximum of 5ms for busy bit to clear */
+static int pmu_wait_ready(void)
+{
+       int udelays;
+
+       pmu_wait_ready_calls++;
+
+       for (udelays = 0; udelays < 500; ++udelays) {
+               if (udelays > pmu_wait_ready_udelays_max)
+                       pmu_wait_ready_udelays_max = udelays;
+
+               if (pmu_read_busy_status() == 0)
+                       return 0;
+
+               udelay(10);
+               pmu_wait_ready_udelays++;
+       }
+
+       /*
+        * if this fires, observe
+        * /sys/kernel/debug/mrst_pmu_wait_ready_calls
+        * /sys/kernel/debug/mrst_pmu_wait_ready_udelays
+        */
+       WARN_ONCE(1, "SCU not ready for 5ms");
+       return -EBUSY;
+}
+/* poll for maximum of 50 ms for busy bit to clear */
+static int pmu_wait_done(void)
+{
+       int udelays;
+
+       pmu_wait_done_calls++;
+
+       for (udelays = 0; udelays < 500; ++udelays) {
+               if (udelays > pmu_wait_done_udelays_max)
+                       pmu_wait_done_udelays_max = udelays;
+
+               if (pmu_read_busy_status() == 0)
+                       return 0;
+
+               udelay(100);
+               pmu_wait_done_udelays++;
+       }
+
+       /*
+        * if this fires, observe
+        * /sys/kernel/debug/mrst_pmu_wait_done_calls
+        * /sys/kernel/debug/mrst_pmu_wait_done_udelays
+        */
+       WARN_ONCE(1, "SCU not done for 50ms");
+       return -EBUSY;
+}
+
+u32 mrst_pmu_msi_is_disabled(void)
+{
+       return pmu_msi_is_disabled();
+}
+
+void mrst_pmu_enable_msi(void)
+{
+       pmu_msi_enable();
+}
+
+/**
+ * pmu_irq - pmu driver interrupt handler
+ * Context: interrupt context
+ */
+static irqreturn_t pmu_irq(int irq, void *dummy)
+{
+       union pmu_pm_ics pmu_ics;
+
+       pmu_ics.value = pmu_read_ics();
+
+       if (!pmu_ics.bits.pending)
+               return IRQ_NONE;
+
+       switch (pmu_ics.bits.cause) {
+       case INT_SPURIOUS:
+       case INT_CMD_DONE:
+       case INT_CMD_ERR:
+       case INT_WAKE_RX:
+       case INT_SS_ERROR:
+       case INT_S0IX_MISS:
+       case INT_NO_ACKC6:
+               pmu_irq_stats[pmu_ics.bits.cause]++;
+               break;
+       default:
+               pmu_irq_stats[INT_INVALID]++;
+       }
+
+       pmu_write_ics(pmu_ics.value); /* Clear pending interrupt */
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Translate PCI power management to MRST LSS D-states
+ */
+static int pci_2_mrst_state(int lss, pci_power_t pci_state)
+{
+       switch (pci_state) {
+       case PCI_D0:
+               if (SSMSK(D0i1, lss) & D0I1_ACG_SSS_TARGET)
+                       return D0i1;
+               else
+                       return D0;
+       case PCI_D1:
+               return D0i1;
+       case PCI_D2:
+               return D0i2;
+       case PCI_D3hot:
+       case PCI_D3cold:
+               return D0i3;
+       default:
+               WARN(1, "pci_state %d\n", pci_state);
+               return 0;
+       }
+}
+
+static int pmu_issue_command(u32 pm_ssc)
+{
+       union pmu_pm_set_cfg_cmd_t command;
+
+       if (pmu_read_busy_status()) {
+               pr_debug("pmu is busy, Operation not permitted\n");
+               return -1;
+       }
+
+       /*
+        * Enable interrupts in the PMU so that an interrupt is
+        * propagated when the ioc bit for a particular set
+        * command is set.
+        */
+
+       pmu_irq_enable();
+
+       /* Configure the sub systems for pmu2 */
+
+       pmu_write_ssc(pm_ssc);
+
+       /*
+        * Send the set config command to the PMU; it is configured
+        * for mode CM_IMMEDIATE and hence with no trigger.
+        */
+
+       command.pmu2_params.d_param.cfg_mode = CM_IMMEDIATE;
+       command.pmu2_params.d_param.cfg_delay = 0;
+       command.pmu2_params.d_param.rsvd = 0;
+
+       /* construct the command to send SET_CFG to particular PMU */
+       command.pmu2_params.d_param.cmd = SET_CFG_CMD;
+       command.pmu2_params.d_param.ioc = 0;
+       command.pmu2_params.d_param.mode_id = 0;
+       command.pmu2_params.d_param.sys_state = SYS_STATE_S0I0;
+
+       /* write the value of PM_CMD into particular PMU */
+       pr_debug("pmu command being written %x\n",
+                       command.pmu_pm_set_cfg_cmd_value);
+
+       pmu_write_cmd(command.pmu_pm_set_cfg_cmd_value);
+
+       return 0;
+}
+
+static u16 pmu_min_lss_pci_req(u16 *ids, u16 pci_state)
+{
+       u16 existing_request;
+       int i;
+
+       for (i = 0; ids[i]; ++i) {
+               struct mrst_device *mrst_dev;
+
+               mrst_dev = pci_id_2_mrst_dev(ids[i]);
+               if (unlikely(!mrst_dev))
+                       continue;
+
+               existing_request = mrst_dev->latest_request;
+               if (existing_request < pci_state)
+                       pci_state = existing_request;
+       }
+       return pci_state;
+}
+
+/**
+ * pmu_pci_set_power_state - callback used by all the PCI devices
+ *                     for platform-specific device power on/shutdown.
+ */
+
+int pmu_pci_set_power_state(struct pci_dev *pdev, pci_power_t pci_state)
+{
+       u32 old_sss, new_sss;
+       int status = 0;
+       struct mrst_device *mrst_dev;
+
+       pmu_set_power_state_entry++;
+
+       BUG_ON(pdev->vendor != PCI_VENDOR_ID_INTEL);
+       BUG_ON(pci_state < PCI_D0 || pci_state > PCI_D3cold);
+
+       mrst_dev = pci_id_2_mrst_dev(pdev->device);
+       if (unlikely(!mrst_dev))
+               return -ENODEV;
+
+       mrst_dev->pci_state_counts[pci_state]++;        /* count invocations */
+
+       /* PMU driver calls self as part of PCI initialization, ignore */
+       if (pdev->device == PCI_DEV_ID_MRST_PMU)
+               return 0;
+
+       BUG_ON(!pmu_reg); /* SW bug if called before initialized */
+
+       spin_lock(&mrst_pmu_power_state_lock);
+
+       if (pdev->d3_delay) {
+               dev_dbg(&pdev->dev, "d3_delay %d, should be 0\n",
+                       pdev->d3_delay);
+               pdev->d3_delay = 0;
+       }
+       /*
+        * If Lincroft graphics, simply remember state
+        */
+       if ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY
+               && !((pdev->class & PCI_SUB_CLASS_MASK) >> 8)) {
+               if (pci_state == PCI_D0)
+                       graphics_is_off = 0;
+               else
+                       graphics_is_off = 1;
+               goto ret;
+       }
+
+       if (!mrst_dev->lss)
+               goto ret;       /* device with no LSS */
+
+       if (mrst_dev->latest_request == pci_state)
+               goto ret;       /* no change */
+
+       mrst_dev->latest_request = pci_state;   /* record latest request */
+
+       /*
+        * LSS9 and LSS10 contain multiple PCI devices.
+        * Use the lowest numbered (highest power) state in the LSS
+        */
+       if (mrst_dev->lss == 9)
+               pci_state = pmu_min_lss_pci_req(mrst_lss9_pci_ids, pci_state);
+       else if (mrst_dev->lss == 10)
+               pci_state = pmu_min_lss_pci_req(mrst_lss10_pci_ids, pci_state);
+
+       status = pmu_wait_ready();
+       if (status)
+               goto ret;
+
+       old_sss = pmu_read_sss();
+       new_sss = old_sss & ~SSMSK(3, mrst_dev->lss);
+       new_sss |= SSMSK(pci_2_mrst_state(mrst_dev->lss, pci_state),
+                       mrst_dev->lss);
+
+       if (new_sss == old_sss)
+               goto ret;       /* nothing to do */
+
+       pmu_set_power_state_send_cmd++;
+
+       status = pmu_issue_command(new_sss);
+
+       if (unlikely(status != 0)) {
+               dev_err(&pdev->dev, "Failed to Issue a PM command\n");
+               goto ret;
+       }
+
+       if (pmu_wait_done())
+               goto ret;
+
+       lss_s0i3_enabled =
+       ((pmu_read_sss() & S0I3_SSS_TARGET) == S0I3_SSS_TARGET);
+ret:
+       spin_unlock(&mrst_pmu_power_state_lock);
+       return status;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static char *d0ix_names[] = {"D0", "D0i1", "D0i2", "D0i3"};
+
+static inline const char *d0ix_name(int state)
+{
+       return d0ix_names[(int) state];
+}
+
+static int debug_mrst_pmu_show(struct seq_file *s, void *unused)
+{
+       struct pci_dev *pdev = NULL;
+       u32 cur_pmsss;
+       int lss;
+
+       seq_printf(s, "0x%08X D0I1_ACG_SSS_TARGET\n", D0I1_ACG_SSS_TARGET);
+
+       cur_pmsss = pmu_read_sss();
+
+       seq_printf(s, "0x%08X S0I3_SSS_TARGET\n", S0I3_SSS_TARGET);
+
+       seq_printf(s, "0x%08X Current SSS ", cur_pmsss);
+       seq_printf(s, lss_s0i3_enabled ? "\n" : "[BLOCKS s0i3]\n");
+
+       if (cpumask_equal(cpu_online_mask, cpumask_of(0)))
+               seq_printf(s, "cpu0 is only cpu online\n");
+       else
+               seq_printf(s, "cpu0 is NOT only cpu online [BLOCKS S0i3]\n");
+
+       seq_printf(s, "GFX: %s\n", graphics_is_off ? "" : "[BLOCKS s0i3]");
+
+
+       for_each_pci_dev(pdev) {
+               int pos;
+               u16 pmcsr;
+               struct mrst_device *mrst_dev;
+               int i;
+
+               mrst_dev = pci_id_2_mrst_dev(pdev->device);
+
+               seq_printf(s, "%s %04x/%04X %-16.16s ",
+                       dev_name(&pdev->dev),
+                       pdev->vendor, pdev->device,
+                       dev_driver_string(&pdev->dev));
+
+               if (unlikely (!mrst_dev)) {
+                       seq_printf(s, " UNKNOWN\n");
+                       continue;
+               }
+
+               if (mrst_dev->lss)
+                       seq_printf(s, "LSS %2d %-4s ", mrst_dev->lss,
+                               d0ix_name(((cur_pmsss >>
+                                       (mrst_dev->lss * 2)) & 0x3)));
+               else
+                       seq_printf(s, "            ");
+
+               /* PCI PM config space setting */
+               pos = pci_find_capability(pdev, PCI_CAP_ID_PM);
+               if (pos != 0) {
+                       pci_read_config_word(pdev, pos + PCI_PM_CTRL, &pmcsr);
+                       seq_printf(s, "PCI-%-4s",
+                               pci_power_name(pmcsr & PCI_PM_CTRL_STATE_MASK));
+               } else {
+                       seq_printf(s, "        ");
+               }
+
+               seq_printf(s, " %s ", pci_power_name(mrst_dev->latest_request));
+               for (i = 0; i <= PCI_D3cold; ++i)
+                       seq_printf(s, "%d ", mrst_dev->pci_state_counts[i]);
+
+               if (mrst_dev->lss) {
+                       unsigned int lssmask;
+
+                       lssmask = SSMSK(D0i3, mrst_dev->lss);
+
+                       if ((lssmask & S0I3_SSS_TARGET) &&
+                               ((lssmask & cur_pmsss) !=
+                                       (lssmask & S0I3_SSS_TARGET)))
+                                               seq_printf(s, "[BLOCKS s0i3]");
+               }
+
+               seq_printf(s, "\n");
+       }
+       seq_printf(s, "Wake Counters:\n");
+       for (lss = 0; lss < MRST_NUM_LSS; ++lss)
+               seq_printf(s, "LSS%d %d\n", lss, wake_counters[lss]);
+
+       seq_printf(s, "Interrupt Counters:\n");
+       seq_printf(s,
+               "INT_SPURIOUS \t%8u\n" "INT_CMD_DONE \t%8u\n"
+               "INT_CMD_ERR  \t%8u\n" "INT_WAKE_RX  \t%8u\n"
+               "INT_SS_ERROR \t%8u\n" "INT_S0IX_MISS\t%8u\n"
+               "INT_NO_ACKC6 \t%8u\n" "INT_INVALID  \t%8u\n",
+               pmu_irq_stats[INT_SPURIOUS], pmu_irq_stats[INT_CMD_DONE],
+               pmu_irq_stats[INT_CMD_ERR], pmu_irq_stats[INT_WAKE_RX],
+               pmu_irq_stats[INT_SS_ERROR], pmu_irq_stats[INT_S0IX_MISS],
+               pmu_irq_stats[INT_NO_ACKC6], pmu_irq_stats[INT_INVALID]);
+
+       seq_printf(s, "mrst_pmu_wait_ready_calls          %8d\n",
+                       pmu_wait_ready_calls);
+       seq_printf(s, "mrst_pmu_wait_ready_udelays        %8d\n",
+                       pmu_wait_ready_udelays);
+       seq_printf(s, "mrst_pmu_wait_ready_udelays_max    %8d\n",
+                       pmu_wait_ready_udelays_max);
+       seq_printf(s, "mrst_pmu_wait_done_calls           %8d\n",
+                       pmu_wait_done_calls);
+       seq_printf(s, "mrst_pmu_wait_done_udelays         %8d\n",
+                       pmu_wait_done_udelays);
+       seq_printf(s, "mrst_pmu_wait_done_udelays_max     %8d\n",
+                       pmu_wait_done_udelays_max);
+       seq_printf(s, "mrst_pmu_set_power_state_entry     %8d\n",
+                       pmu_set_power_state_entry);
+       seq_printf(s, "mrst_pmu_set_power_state_send_cmd  %8d\n",
+                       pmu_set_power_state_send_cmd);
+       seq_printf(s, "SCU busy: %d\n", pmu_read_busy_status());
+
+       return 0;
+}
+
+static int debug_mrst_pmu_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, debug_mrst_pmu_show, NULL);
+}
+
+static const struct file_operations devices_state_operations = {
+       .open           = debug_mrst_pmu_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+#endif /* DEBUG_FS */
+
+/*
+ * Validate the SCU PCI shim's PCI vendor capability byte
+ * against the LSS values hard-coded in mrst_devs[] above.
+ * DEBUG only.
+ */
+static void pmu_scu_firmware_debug(void)
+{
+       struct pci_dev *pdev = NULL;
+
+       for_each_pci_dev(pdev) {
+               struct mrst_device *mrst_dev;
+               u8 pci_config_lss;
+               int pos;
+
+               mrst_dev = pci_id_2_mrst_dev(pdev->device);
+               if (unlikely(!mrst_dev)) {
+                       printk(KERN_ERR FW_BUG "pmu: Unknown "
+                               "PCI device 0x%04X\n", pdev->device);
+                       continue;
+               }
+
+               if (mrst_dev->lss == 0)
+                       continue;        /* no LSS in our table */
+
+               pos = pci_find_capability(pdev, PCI_CAP_ID_VNDR);
+               if (pos == 0) {
+                       printk(KERN_ERR FW_BUG "pmu: 0x%04X "
+                               "missing PCI Vendor Capability\n",
+                               pdev->device);
+                       continue;
+               }
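+               /*
+                * Byte 4 of the vendor capability holds the LSS: bit 7
+                * (PCI_VENDOR_CAP_LOG_SS_MASK) flags a valid entry and
+                * bits 6:0 (PCI_VENDOR_CAP_LOG_ID_MASK) carry the number.
+                */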
+               pci_read_config_byte(pdev, pos + 4, &pci_config_lss);
+               if (!(pci_config_lss & PCI_VENDOR_CAP_LOG_SS_MASK)) {
+                       printk(KERN_ERR FW_BUG "pmu: 0x%04X "
+                               "invalid PCI Vendor Capability 0x%x "
+                               "expected LSS 0x%X\n",
+                               pdev->device, pci_config_lss, mrst_dev->lss);
+                       continue;
+               }
+               pci_config_lss &= PCI_VENDOR_CAP_LOG_ID_MASK;
+
+               if (mrst_dev->lss == pci_config_lss)
+                       continue;
+
+               printk(KERN_ERR FW_BUG "pmu: 0x%04X LSS = %d, expected %d\n",
+                       pdev->device, pci_config_lss, mrst_dev->lss);
+       }
+}
+
+/**
+ * pmu_probe - map the PMU registers, hook up its IRQ and enable ACG
+ * @pdev: the PMU PCI device
+ * @pci_id: matching entry in pmu_pci_ids[]
+ */
+static int __devinit pmu_probe(struct pci_dev *pdev,
+                                  const struct pci_device_id *pci_id)
+{
+       int ret;
+       struct mrst_pmu_reg *pmu;
+
+       /* Init the device */
+       ret = pci_enable_device(pdev);
+       if (ret) {
+               dev_err(&pdev->dev, "Unable to enable PCI device\n");
+               return ret;
+       }
+
+       ret = pci_request_regions(pdev, MRST_PMU_DRV_NAME);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "Cannot obtain PCI resources, aborting\n");
+               goto out_err1;
+       }
+
+       /* Map the memory of PMU reg base */
+       pmu = pci_iomap(pdev, 0, 0);
+       if (!pmu) {
+               dev_err(&pdev->dev, "Unable to map the PMU address space\n");
+               ret = -ENOMEM;
+               goto out_err2;
+       }
+
+#ifdef CONFIG_DEBUG_FS
+       /* /sys/kernel/debug/mrst_pmu */
+       (void) debugfs_create_file("mrst_pmu", S_IFREG | S_IRUGO,
+                               NULL, NULL, &devices_state_operations);
+#endif
+       pmu_reg = pmu;  /* success */
+
+       if (request_irq(pdev->irq, pmu_irq, 0, MRST_PMU_DRV_NAME, NULL)) {
+               dev_err(&pdev->dev, "Failed to register the PMU ISR\n");
+               ret = -EIO;
+               goto out_err3;
+       }
+
+       pmu_scu_firmware_debug();
+
+       pmu_write_wkc(S0I3_WAKE_SOURCES);       /* Enable S0i3 wakeup sources */
+
+       pmu_wait_ready();
+
+       pmu_write_ssc(D0I1_ACG_SSS_TARGET);     /* Enable Auto-Clock_Gating */
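+       /*
+        * Per the pmu.h bitfields, 0x201 is cmd SET_CFG_CMD (0x01) with
+        * cfg_mode CM_IMMEDIATE (bit 9): apply the ACG target right away.
+        */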
+       pmu_write_cmd(0x201);
+
+       spin_lock_init(&mrst_pmu_power_state_lock);
+
+       /* Enable the hardware interrupt */
+       pmu_irq_enable();
+       return 0;
+
+out_err3:
+       pci_iounmap(pdev, pmu_reg);
+       pmu_reg = NULL;
+out_err2:
+       pci_release_regions(pdev);
+out_err1:
+       pci_disable_device(pdev);
+       return ret;
+}
+
+static void __devexit pmu_remove(struct pci_dev *pdev)
+{
+       dev_err(&pdev->dev, "Mid PM pmu_remove called\n");
+
+       /* Freeing up the irq */
+       free_irq(pdev->irq, NULL);
+
+       pci_iounmap(pdev, pmu_reg);
+       pmu_reg = NULL;
+
+       /* disable the current PCI device */
+       pci_release_regions(pdev);
+       pci_disable_device(pdev);
+}
+
+static DEFINE_PCI_DEVICE_TABLE(pmu_pci_ids) = {
+       { PCI_VDEVICE(INTEL, PCI_DEV_ID_MRST_PMU), 0 },
+       { }
+};
+
+MODULE_DEVICE_TABLE(pci, pmu_pci_ids);
+
+static struct pci_driver driver = {
+       .name = MRST_PMU_DRV_NAME,
+       .id_table = pmu_pci_ids,
+       .probe = pmu_probe,
+       .remove = __devexit_p(pmu_remove),
+};
+
+/**
+ * pmu_pci_register - register the PMU driver as a PCI device
+ */
+static int __init pmu_pci_register(void)
+{
+       return pci_register_driver(&driver);
+}
+
+/* Register and probe via fs_initcall() to precede device_initcall() */
+fs_initcall(pmu_pci_register);
+
+static void __exit mid_pci_cleanup(void)
+{
+       pci_unregister_driver(&driver);
+}
+
+static int ia_major;
+static int ia_minor;
+
+static int pmu_sfi_parse_oem(struct sfi_table_header *table)
+{
+       struct sfi_table_simple *sb;
+
+       sb = (struct sfi_table_simple *)table;
+       ia_major = (sb->pentry[1] >> 0) & 0xFFFF;
+       ia_minor = (sb->pentry[1] >> 16) & 0xFFFF;
+       printk(KERN_INFO "mrst_pmu: IA FW version v%x.%x\n",
+               ia_major, ia_minor);
+
+       return 0;
+}
+
+static int __init scu_fw_check(void)
+{
+       int ret;
+       u32 fw_version;
+
+       if (!pmu_reg)
+               return 0;       /* this driver never probed */
+
+       sfi_table_parse("OEMB", NULL, NULL, pmu_sfi_parse_oem);
+
+       if (ia_major < 0x6005 || ia_minor < 0x1525) {
+               WARN(1, "mrst_pmu: IA FW version too old\n");
+               return -1;
+       }
+
+       ret = intel_scu_ipc_command(IPCMSG_FW_REVISION, 0, NULL, 0,
+                                       &fw_version, 1);
+
+       if (ret) {
+               WARN(1, "mrst_pmu: IPC FW version query failed: %d\n", ret);
+       } else {
+               int scu_major = (fw_version >> 8) & 0xFF;
+               int scu_minor = (fw_version >> 0) & 0xFF;
+
+               printk(KERN_INFO "mrst_pmu: firmware v%x\n", fw_version);
+
+               if ((scu_major >= 0xC0) && (scu_minor >= 0x49)) {
+                       printk(KERN_INFO "mrst_pmu: enabling S0i3\n");
+                       mrst_pmu_s0i3_enable = true;
+               } else {
+                       WARN(1, "mrst_pmu: S0i3 disabled, old firmware %X.%X\n",
+                                       scu_major, scu_minor);
+               }
+       }
+       return 0;
+}
+late_initcall(scu_fw_check);
+module_exit(mid_pci_cleanup);
diff --git a/arch/x86/platform/mrst/pmu.h b/arch/x86/platform/mrst/pmu.h
new file mode 100644 (file)
index 0000000..bfbfe64
--- /dev/null
@@ -0,0 +1,234 @@
+/*
+ * mrst/pmu.h - private definitions for MRST Power Management Unit mrst/pmu.c
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef _MRST_PMU_H_
+#define _MRST_PMU_H_
+
+#define PCI_DEV_ID_MRST_PMU            0x0810
+#define MRST_PMU_DRV_NAME              "mrst_pmu"
+#define        PCI_SUB_CLASS_MASK              0xFF00
+
+#define        PCI_VENDOR_CAP_LOG_ID_MASK      0x7F
+#define PCI_VENDOR_CAP_LOG_SS_MASK     0x80
+
+#define SUB_SYS_ALL_D0I1       0x01155555
+#define S0I3_WAKE_SOURCES      0x00001FFF
+
+#define PM_S0I3_COMMAND                                        \
+       ((0 << 31) |    /* Reserved */                  \
+       (0 << 30) |     /* Core must be idle */         \
+       (0xc2 << 22) |  /* ACK C6 trigger */            \
+       (3 << 19) |     /* Trigger on DMI message */    \
+       (3 << 16) |     /* Enter S0i3 */                \
+       (0 << 13) |     /* Numeric mode ID (sw) */      \
+       (3 << 9) |      /* Trigger mode */              \
+       (0 << 8) |      /* Do not interrupt */          \
+       (1 << 0))       /* Set configuration */
+
+#define        LSS_DMI         0
+#define        LSS_SD_HC0      1
+#define        LSS_SD_HC1      2
+#define        LSS_NAND        3
+#define        LSS_IMAGING     4
+#define        LSS_SECURITY    5
+#define        LSS_DISPLAY     6
+#define        LSS_USB_HC      7
+#define        LSS_USB_OTG     8
+#define        LSS_AUDIO       9
+#define        LSS_AUDIO_LPE   9
+#define        LSS_AUDIO_SSP   9
+#define        LSS_I2C0        10
+#define        LSS_I2C1        10
+#define        LSS_I2C2        10
+#define        LSS_KBD         10
+#define        LSS_SPI0        10
+#define        LSS_SPI1        10
+#define        LSS_SPI2        10
+#define        LSS_GPIO        10
+#define        LSS_SRAM        11      /* used by SCU, do not touch */
+#define        LSS_SD_HC2      12
+/* LSS hardware bits 15,14,13 are hardwired to 0, thus unusable */
+#define MRST_NUM_LSS   13
+
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+
+#define        SSMSK(mask, lss) ((mask) << ((lss) * 2))
+#define        D0      0
+#define        D0i1    1
+#define        D0i2    2
+#define        D0i3    3
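+
+/*
+ * SSMSK() packs one of these 2-bit states into the per-LSS field of an
+ * SSS/SSC word, e.g. SSMSK(D0i3, LSS_NAND) == 0x3 << 6 == 0xC0.
+ */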
+
+#define S0I3_SSS_TARGET        (               \
+       SSMSK(D0i1, LSS_DMI) |          \
+       SSMSK(D0i3, LSS_SD_HC0) |       \
+       SSMSK(D0i3, LSS_SD_HC1) |       \
+       SSMSK(D0i3, LSS_NAND) |         \
+       SSMSK(D0i3, LSS_SD_HC2) |       \
+       SSMSK(D0i3, LSS_IMAGING) |      \
+       SSMSK(D0i3, LSS_SECURITY) |     \
+       SSMSK(D0i3, LSS_DISPLAY) |      \
+       SSMSK(D0i3, LSS_USB_HC) |       \
+       SSMSK(D0i3, LSS_USB_OTG) |      \
+       SSMSK(D0i3, LSS_AUDIO) |        \
+       SSMSK(D0i1, LSS_I2C0))
+
+/*
+ * D0i1 on Langwell is Autonomous Clock Gating (ACG).
+ * Enable ACG on every LSS except camera and audio
+ */
+#define D0I1_ACG_SSS_TARGET     \
+       (SUB_SYS_ALL_D0I1 & ~SSMSK(D0i1, LSS_IMAGING) & ~SSMSK(D0i1, LSS_AUDIO))
+
+enum cm_mode {
+       CM_NOP,                 /* ignore the config mode value */
+       CM_IMMEDIATE,
+       CM_DELAY,
+       CM_TRIGGER,
+       CM_INVALID
+};
+
+enum sys_state {
+       SYS_STATE_S0I0,
+       SYS_STATE_S0I1,
+       SYS_STATE_S0I2,
+       SYS_STATE_S0I3,
+       SYS_STATE_S3,
+       SYS_STATE_S5
+};
+
+#define SET_CFG_CMD    1
+
+enum int_status {
+       INT_SPURIOUS = 0,
+       INT_CMD_DONE = 1,
+       INT_CMD_ERR = 2,
+       INT_WAKE_RX = 3,
+       INT_SS_ERROR = 4,
+       INT_S0IX_MISS = 5,
+       INT_NO_ACKC6 = 6,
+       INT_INVALID = 7,
+};
+
+/* PMU register interface */
+static struct mrst_pmu_reg {
+       u32 pm_sts;             /* 0x00 */
+       u32 pm_cmd;             /* 0x04 */
+       u32 pm_ics;             /* 0x08 */
+       u32 _resv1;             /* 0x0C */
+       u32 pm_wkc[2];          /* 0x10 */
+       u32 pm_wks[2];          /* 0x18 */
+       u32 pm_ssc[4];          /* 0x20 */
+       u32 pm_sss[4];          /* 0x30 */
+       u32 pm_wssc[4];         /* 0x40 */
+       u32 pm_c3c4;            /* 0x50 */
+       u32 pm_c5c6;            /* 0x54 */
+       u32 pm_msi_disable;     /* 0x58 */
+} *pmu_reg;
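+
+/*
+ * The layout above mirrors the PMU MMIO register block at BAR 0, mapped
+ * in pmu_probe(); the field comments give byte offsets from that base.
+ */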
+
+static inline u32 pmu_read_sts(void) { return readl(&pmu_reg->pm_sts); }
+static inline u32 pmu_read_ics(void) { return readl(&pmu_reg->pm_ics); }
+static inline u32 pmu_read_wks(void) { return readl(&pmu_reg->pm_wks[0]); }
+static inline u32 pmu_read_sss(void) { return readl(&pmu_reg->pm_sss[0]); }
+
+static inline void pmu_write_cmd(u32 arg) { writel(arg, &pmu_reg->pm_cmd); }
+static inline void pmu_write_ics(u32 arg) { writel(arg, &pmu_reg->pm_ics); }
+static inline void pmu_write_wkc(u32 arg) { writel(arg, &pmu_reg->pm_wkc[0]); }
+static inline void pmu_write_ssc(u32 arg) { writel(arg, &pmu_reg->pm_ssc[0]); }
+static inline void pmu_write_wssc(u32 arg)
+                                       { writel(arg, &pmu_reg->pm_wssc[0]); }
+
+static inline void pmu_msi_enable(void) { writel(0, &pmu_reg->pm_msi_disable); }
+static inline u32 pmu_msi_is_disabled(void)
+                               { return readl(&pmu_reg->pm_msi_disable); }
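+
+/* pm_msi_disable reads nonzero while MSI is masked; pmu_msi_enable() clears it */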
+
+union pmu_pm_ics {
+       struct {
+               u32 cause:8;
+               u32 enable:1;
+               u32 pending:1;
+               u32 reserved:22;
+       } bits;
+       u32 value;
+};
+
+static inline void pmu_irq_enable(void)
+{
+       union pmu_pm_ics pmu_ics;
+
+       pmu_ics.value = pmu_read_ics();
+       pmu_ics.bits.enable = 1;
+       pmu_write_ics(pmu_ics.value);
+}
+
+union pmu_pm_status {
+       struct {
+               u32 pmu_rev:8;
+               u32 pmu_busy:1;
+               u32 mode_id:4;
+               u32 reserved:19;
+       } pmu_status_parts;
+       u32 pmu_status_value;
+};
+
+static inline int pmu_read_busy_status(void)
+{
+       union pmu_pm_status result;
+
+       result.pmu_status_value = pmu_read_sts();
+
+       return result.pmu_status_parts.pmu_busy;
+}
+
+/* pmu set config parameters */
+struct cfg_delay_param_t {
+       u32 cmd:8;
+       u32 ioc:1;
+       u32 cfg_mode:4;
+       u32 mode_id:3;
+       u32 sys_state:3;
+       u32 cfg_delay:8;
+       u32 rsvd:5;
+};
+
+struct cfg_trig_param_t {
+       u32 cmd:8;
+       u32 ioc:1;
+       u32 cfg_mode:4;
+       u32 mode_id:3;
+       u32 sys_state:3;
+       u32 cfg_trig_type:3;
+       u32 cfg_trig_val:8;
+       u32 cmbi:1;
+       u32 rsvd1:1;
+};
+
+union pmu_pm_set_cfg_cmd_t {
+       union {
+               struct cfg_delay_param_t d_param;
+               struct cfg_trig_param_t t_param;
+       } pmu2_params;
+       u32 pmu_pm_set_cfg_cmd_value;
+};
+
+#ifdef FUTURE_PATCH
+extern int mrst_s0i3_entry(u32 regval, u32 *regaddr);
+#else
+static inline int mrst_s0i3_entry(u32 regval, u32 *regaddr) { return -1; }
+#endif /* FUTURE_PATCH */
+#endif /* _MRST_PMU_H_ */
index 60aeeb5..a9627e2 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/pm.h>
 #include <linux/memblock.h>
+#include <linux/cpuidle.h>
 
 #include <asm/elf.h>
 #include <asm/vdso.h>
@@ -426,7 +427,7 @@ void __init xen_arch_setup(void)
 #ifdef CONFIG_X86_32
        boot_cpu_data.hlt_works_ok = 1;
 #endif
-       pm_idle = default_idle;
+       disable_cpuidle();
        boot_option_idle_override = IDLE_HALT;
 
        fiddle_vdso();
index b850bed..b627558 100644 (file)
@@ -1368,8 +1368,10 @@ static bool should_fail_request(struct hd_struct *part, unsigned int bytes)
 
 static int __init fail_make_request_debugfs(void)
 {
-       return init_fault_attr_dentries(&fail_make_request,
-                                       "fail_make_request");
+       struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
+                                               NULL, &fail_make_request);
+
+       return IS_ERR(dir) ? PTR_ERR(dir) : 0;
 }
 
 late_initcall(fail_make_request_debugfs);
index 4f0c06c..7803548 100644 (file)
@@ -28,7 +28,10 @@ int blk_should_fake_timeout(struct request_queue *q)
 
 static int __init fail_io_timeout_debugfs(void)
 {
-       return init_fault_attr_dentries(&fail_io_timeout, "fail_io_timeout");
+       struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout",
+                                               NULL, &fail_io_timeout);
+
+       return IS_ERR(dir) ? PTR_ERR(dir) : 0;
 }
 
 late_initcall(fail_io_timeout_debugfs);
index 73863d8..76dc02f 100644 (file)
@@ -126,6 +126,12 @@ u8 ACPI_INIT_GLOBAL(acpi_gbl_copy_dsdt_locally, FALSE);
  */
 u8 ACPI_INIT_GLOBAL(acpi_gbl_truncate_io_addresses, FALSE);
 
+/*
+ * Disable runtime checking and repair of values returned by control methods.
+ * Use only if the repair is causing a problem on a particular machine.
+ */
+u8 ACPI_INIT_GLOBAL(acpi_gbl_disable_auto_repair, FALSE);
+
 /* acpi_gbl_FADT is a local copy of the FADT, converted to a common format. */
 
 struct acpi_table_fadt acpi_gbl_FADT;
index c7f743c..5552125 100644 (file)
@@ -357,6 +357,7 @@ struct acpi_predefined_data {
        char *pathname;
        const union acpi_predefined_info *predefined;
        union acpi_operand_object *parent_package;
+       struct acpi_namespace_node *node;
        u32 flags;
        u8 node_flags;
 };
index 94e73c9..c445cca 100644 (file)
@@ -468,6 +468,7 @@ static const union acpi_predefined_info predefined_names[] =
        {{"_SWS", 0, ACPI_RTYPE_INTEGER}},
        {{"_TC1", 0, ACPI_RTYPE_INTEGER}},
        {{"_TC2", 0, ACPI_RTYPE_INTEGER}},
+       {{"_TDL", 0, ACPI_RTYPE_INTEGER}},
        {{"_TIP", 1, ACPI_RTYPE_INTEGER}},
        {{"_TIV", 1, ACPI_RTYPE_INTEGER}},
        {{"_TMP", 0, ACPI_RTYPE_INTEGER}},
index 9fb03fa..c845c80 100644 (file)
@@ -193,14 +193,20 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node,
        }
 
        /*
-        * 1) We have a return value, but if one wasn't expected, just exit, this is
-        * not a problem. For example, if the "Implicit Return" feature is
-        * enabled, methods will always return a value.
+        * Return value validation and possible repair.
         *
-        * 2) If the return value can be of any type, then we cannot perform any
-        * validation, exit.
+        * 1) Don't perform return value validation/repair if this feature
+        * has been disabled via a global option.
+        *
+        * 2) We have a return value, but if one wasn't expected, just exit,
+        * this is not a problem. For example, if the "Implicit Return"
+        * feature is enabled, methods will always return a value.
+        *
+        * 3) If the return value can be of any type, then we cannot perform
+        * any validation, just exit.
         */
-       if ((!predefined->info.expected_btypes) ||
+       if (acpi_gbl_disable_auto_repair ||
+           (!predefined->info.expected_btypes) ||
            (predefined->info.expected_btypes == ACPI_RTYPE_ALL)) {
                goto cleanup;
        }
@@ -212,6 +218,7 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node,
                goto cleanup;
        }
        data->predefined = predefined;
+       data->node = node;
        data->node_flags = node->flags;
        data->pathname = pathname;
 
index 973883b..024c4f2 100644 (file)
@@ -503,6 +503,21 @@ acpi_ns_repair_TSS(struct acpi_predefined_data *data,
 {
        union acpi_operand_object *return_object = *return_object_ptr;
        acpi_status status;
+       struct acpi_namespace_node *node;
+
+       /*
+        * We can only sort the _TSS return package if there is no _PSS in the
+        * same scope. This is because if _PSS is present, the ACPI specification
+        * dictates that the _TSS Power Dissipation field is to be ignored, and
+        * therefore some BIOSes leave garbage values in the _TSS Power field(s).
+        * In this case, it is best to just return the _TSS package as-is.
+        * (May, 2011)
+        */
+       status =
+           acpi_ns_get_node(data->node, "^_PSS", ACPI_NS_NO_UPSEARCH, &node);
+       if (ACPI_SUCCESS(status)) {
+               return (AE_OK);
+       }
 
        status = acpi_ns_check_sorted_list(data, return_object, 5, 1,
                                           ACPI_SORT_DESCENDING,
index 48db094..62365f6 100644 (file)
@@ -126,12 +126,29 @@ acpi_tb_add_table(struct acpi_table_desc *table_desc, u32 *table_index)
        }
 
        /*
-        * Originally, we checked the table signature for "SSDT" or "PSDT" here.
-        * Next, we added support for OEMx tables, signature "OEM".
-        * Valid tables were encountered with a null signature, so we've just
-        * given up on validating the signature, since it seems to be a waste
-        * of code. The original code was removed (05/2008).
+        * Validate the incoming table signature.
+        *
+        * 1) Originally, we checked the table signature for "SSDT" or "PSDT".
+        * 2) We added support for OEMx tables, signature "OEM".
+        * 3) Valid tables were encountered with a null signature, so we just
+        *    gave up on validating the signature, (05/2008).
+        * 4) We encountered non-AML tables such as the MADT, which caused
+        *    interpreter errors and kernel faults. So now, we once again allow
+        *    only "SSDT", "OEMx", and now, also a null signature. (05/2011).
         */
+       if ((table_desc->pointer->signature[0] != 0x00) &&
+           (!ACPI_COMPARE_NAME(table_desc->pointer->signature, ACPI_SIG_SSDT))
+           && (ACPI_STRNCMP(table_desc->pointer->signature, "OEM", 3))) {
+               ACPI_ERROR((AE_INFO,
+                           "Table has invalid signature [%4.4s] (0x%8.8X), must be SSDT or OEMx",
+                           acpi_ut_valid_acpi_name(*(u32 *)table_desc->
+                                                   pointer->
+                                                   signature) ? table_desc->
+                           pointer->signature : "????",
+                           *(u32 *)table_desc->pointer->signature));
+
+               return_ACPI_STATUS(AE_BAD_SIGNATURE);
+       }
 
        (void)acpi_ut_acquire_mutex(ACPI_MTX_TABLES);
 
index f739a70..c34aa51 100644 (file)
@@ -10,9 +10,11 @@ config ACPI_APEI
          error injection.
 
 config ACPI_APEI_GHES
-       tristate "APEI Generic Hardware Error Source"
+       bool "APEI Generic Hardware Error Source"
        depends on ACPI_APEI && X86
        select ACPI_HED
+       select LLIST
+       select GENERIC_ALLOCATOR
        help
          Generic Hardware Error Source provides a way to report
          platform hardware errors (such as that from chipset). It
@@ -30,6 +32,13 @@ config ACPI_APEI_PCIEAER
          PCIe AER errors may be reported via APEI firmware first mode.
          Turn on this option to enable the corresponding support.
 
+config ACPI_APEI_MEMORY_FAILURE
+       bool "APEI memory error recovering support"
+       depends on ACPI_APEI && MEMORY_FAILURE
+       help
+         Memory errors may be reported via APEI firmware first mode.
+         Turn on this option to enable the memory recovering support.
+
 config ACPI_APEI_EINJ
        tristate "APEI Error INJection (EINJ)"
        depends on ACPI_APEI && DEBUG_FS
index 4a904a4..8041248 100644 (file)
@@ -157,9 +157,10 @@ EXPORT_SYMBOL_GPL(apei_exec_noop);
  * Interpret the specified action. Go through whole action table,
  * execute all instructions belong to the action.
  */
-int apei_exec_run(struct apei_exec_context *ctx, u8 action)
+int __apei_exec_run(struct apei_exec_context *ctx, u8 action,
+                   bool optional)
 {
-       int rc;
+       int rc = -ENOENT;
        u32 i, ip;
        struct acpi_whea_header *entry;
        apei_exec_ins_func_t run;
@@ -198,9 +199,9 @@ rewind:
                        goto rewind;
        }
 
-       return 0;
+       return !optional && rc < 0 ? rc : 0;
 }
-EXPORT_SYMBOL_GPL(apei_exec_run);
+EXPORT_SYMBOL_GPL(__apei_exec_run);
 
 typedef int (*apei_exec_entry_func_t)(struct apei_exec_context *ctx,
                                      struct acpi_whea_header *entry,
@@ -603,3 +604,29 @@ struct dentry *apei_get_debugfs_dir(void)
        return dapei;
 }
 EXPORT_SYMBOL_GPL(apei_get_debugfs_dir);
+
+int apei_osc_setup(void)
+{
+       static u8 whea_uuid_str[] = "ed855e0c-6c90-47bf-a62a-26de0fc5ad5c";
+       acpi_handle handle;
+       u32 capbuf[3];
+       struct acpi_osc_context context = {
+               .uuid_str       = whea_uuid_str,
+               .rev            = 1,
+               .cap.length     = sizeof(capbuf),
+               .cap.pointer    = capbuf,
+       };
+
+       capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE;
+       capbuf[OSC_SUPPORT_TYPE] = 0;
+       capbuf[OSC_CONTROL_TYPE] = 0;
+
+       if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle))
+           || ACPI_FAILURE(acpi_run_osc(handle, &context)))
+               return -EIO;
+       else {
+               kfree(context.ret.pointer);
+               return 0;
+       }
+}
+EXPORT_SYMBOL_GPL(apei_osc_setup);
index ef0581f..f57050e 100644 (file)
@@ -50,7 +50,18 @@ static inline u64 apei_exec_ctx_get_output(struct apei_exec_context *ctx)
        return ctx->value;
 }
 
-int apei_exec_run(struct apei_exec_context *ctx, u8 action);
+int __apei_exec_run(struct apei_exec_context *ctx, u8 action, bool optional);
+
+static inline int apei_exec_run(struct apei_exec_context *ctx, u8 action)
+{
+       return __apei_exec_run(ctx, action, 0);
+}
+
+/* It is optional whether the firmware provides the action */
+static inline int apei_exec_run_optional(struct apei_exec_context *ctx, u8 action)
+{
+       return __apei_exec_run(ctx, action, 1);
+}
 
 /* Common instruction implementation */
 
@@ -113,4 +124,6 @@ void apei_estatus_print(const char *pfx,
                        const struct acpi_hest_generic_status *estatus);
 int apei_estatus_check_header(const struct acpi_hest_generic_status *estatus);
 int apei_estatus_check(const struct acpi_hest_generic_status *estatus);
+
+int apei_osc_setup(void);
 #endif
index f74b2ea..589b96c 100644 (file)
@@ -46,7 +46,8 @@
  * Some BIOSes allow parameters to the SET_ERROR_TYPE entries in the
  * EINJ table through an unpublished extension. Use with caution as
  * most will ignore the parameter and make their own choice of address
- * for error injection.
+ * for error injection.  This extension is used only if the
+ * param_extension module parameter is specified.
  */
 struct einj_parameter {
        u64 type;
@@ -65,6 +66,9 @@ struct einj_parameter {
        ((struct acpi_whea_header *)((char *)(tab) +                    \
                                    sizeof(struct acpi_table_einj)))
 
+static bool param_extension;
+module_param(param_extension, bool, 0);
+
 static struct acpi_table_einj *einj_tab;
 
 static struct apei_resources einj_resources;
@@ -285,7 +289,7 @@ static int __einj_error_inject(u32 type, u64 param1, u64 param2)
 
        einj_exec_ctx_init(&ctx);
 
-       rc = apei_exec_run(&ctx, ACPI_EINJ_BEGIN_OPERATION);
+       rc = apei_exec_run_optional(&ctx, ACPI_EINJ_BEGIN_OPERATION);
        if (rc)
                return rc;
        apei_exec_ctx_set_input(&ctx, type);
@@ -323,7 +327,7 @@ static int __einj_error_inject(u32 type, u64 param1, u64 param2)
        rc = __einj_error_trigger(trigger_paddr);
        if (rc)
                return rc;
-       rc = apei_exec_run(&ctx, ACPI_EINJ_END_OPERATION);
+       rc = apei_exec_run_optional(&ctx, ACPI_EINJ_END_OPERATION);
 
        return rc;
 }
@@ -489,14 +493,6 @@ static int __init einj_init(void)
                                     einj_debug_dir, NULL, &error_type_fops);
        if (!fentry)
                goto err_cleanup;
-       fentry = debugfs_create_x64("param1", S_IRUSR | S_IWUSR,
-                                   einj_debug_dir, &error_param1);
-       if (!fentry)
-               goto err_cleanup;
-       fentry = debugfs_create_x64("param2", S_IRUSR | S_IWUSR,
-                                   einj_debug_dir, &error_param2);
-       if (!fentry)
-               goto err_cleanup;
        fentry = debugfs_create_file("error_inject", S_IWUSR,
                                     einj_debug_dir, NULL, &error_inject_fops);
        if (!fentry)
@@ -513,12 +509,23 @@ static int __init einj_init(void)
        rc = apei_exec_pre_map_gars(&ctx);
        if (rc)
                goto err_release;
-       param_paddr = einj_get_parameter_address();
-       if (param_paddr) {
-               einj_param = ioremap(param_paddr, sizeof(*einj_param));
-               rc = -ENOMEM;
-               if (!einj_param)
-                       goto err_unmap;
+       if (param_extension) {
+               param_paddr = einj_get_parameter_address();
+               if (param_paddr) {
+                       einj_param = ioremap(param_paddr, sizeof(*einj_param));
+                       rc = -ENOMEM;
+                       if (!einj_param)
+                               goto err_unmap;
+                       fentry = debugfs_create_x64("param1", S_IRUSR | S_IWUSR,
+                                                   einj_debug_dir, &error_param1);
+                       if (!fentry)
+                               goto err_unmap;
+                       fentry = debugfs_create_x64("param2", S_IRUSR | S_IWUSR,
+                                                   einj_debug_dir, &error_param2);
+                       if (!fentry)
+                               goto err_unmap;
+               } else {
+                       pr_warn(EINJ_PFX "Parameter extension is not supported.\n");
+               }
        }
 
        pr_info(EINJ_PFX "Error INJection is initialized.\n");
@@ -526,6 +533,8 @@ static int __init einj_init(void)
        return 0;
 
 err_unmap:
+       if (einj_param)
+               iounmap(einj_param);
        apei_exec_post_unmap_gars(&ctx);
 err_release:
        apei_resources_release(&einj_resources);
index a4cfb64..903549d 100644 (file)
@@ -33,7 +33,7 @@
 
 #define ERST_DBG_PFX                   "ERST DBG: "
 
-#define ERST_DBG_RECORD_LEN_MAX                4096
+#define ERST_DBG_RECORD_LEN_MAX                0x4000
 
 static void *erst_dbg_buf;
 static unsigned int erst_dbg_buf_len;
@@ -213,6 +213,10 @@ static struct miscdevice erst_dbg_dev = {
 
 static __init int erst_dbg_init(void)
 {
+       if (erst_disable) {
+               pr_info(ERST_DBG_PFX "ERST support is disabled.\n");
+               return -ENODEV;
+       }
        return misc_register(&erst_dbg_dev);
 }
 
index 6053f47..2ca59dc 100644 (file)
@@ -642,7 +642,7 @@ static int __erst_write_to_storage(u64 offset)
        int rc;
 
        erst_exec_ctx_init(&ctx);
-       rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_WRITE);
+       rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_WRITE);
        if (rc)
                return rc;
        apei_exec_ctx_set_input(&ctx, offset);
@@ -666,7 +666,7 @@ static int __erst_write_to_storage(u64 offset)
        if (rc)
                return rc;
        val = apei_exec_ctx_get_output(&ctx);
-       rc = apei_exec_run(&ctx, ACPI_ERST_END);
+       rc = apei_exec_run_optional(&ctx, ACPI_ERST_END);
        if (rc)
                return rc;
 
@@ -681,7 +681,7 @@ static int __erst_read_from_storage(u64 record_id, u64 offset)
        int rc;
 
        erst_exec_ctx_init(&ctx);
-       rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_READ);
+       rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_READ);
        if (rc)
                return rc;
        apei_exec_ctx_set_input(&ctx, offset);
@@ -709,7 +709,7 @@ static int __erst_read_from_storage(u64 record_id, u64 offset)
        if (rc)
                return rc;
        val = apei_exec_ctx_get_output(&ctx);
-       rc = apei_exec_run(&ctx, ACPI_ERST_END);
+       rc = apei_exec_run_optional(&ctx, ACPI_ERST_END);
        if (rc)
                return rc;
 
@@ -724,7 +724,7 @@ static int __erst_clear_from_storage(u64 record_id)
        int rc;
 
        erst_exec_ctx_init(&ctx);
-       rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_CLEAR);
+       rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_CLEAR);
        if (rc)
                return rc;
        apei_exec_ctx_set_input(&ctx, record_id);
@@ -748,7 +748,7 @@ static int __erst_clear_from_storage(u64 record_id)
        if (rc)
                return rc;
        val = apei_exec_ctx_get_output(&ctx);
-       rc = apei_exec_run(&ctx, ACPI_ERST_END);
+       rc = apei_exec_run_optional(&ctx, ACPI_ERST_END);
        if (rc)
                return rc;
 
index f703b28..0784f99 100644 (file)
@@ -12,7 +12,7 @@
  * For more information about Generic Hardware Error Source, please
  * refer to ACPI Specification version 4.0, section 17.3.2.6
  *
- * Copyright 2010 Intel Corp.
+ * Copyright 2010,2011 Intel Corp.
  *   Author: Huang Ying <ying.huang@intel.com>
  *
  * This program is free software; you can redistribute it and/or
@@ -42,6 +42,9 @@
 #include <linux/mutex.h>
 #include <linux/ratelimit.h>
 #include <linux/vmalloc.h>
+#include <linux/irq_work.h>
+#include <linux/llist.h>
+#include <linux/genalloc.h>
 #include <acpi/apei.h>
 #include <acpi/atomicio.h>
 #include <acpi/hed.h>
 #define GHES_PFX       "GHES: "
 
 #define GHES_ESTATUS_MAX_SIZE          65536
+#define GHES_ESOURCE_PREALLOC_MAX_SIZE 65536
+
+#define GHES_ESTATUS_POOL_MIN_ALLOC_ORDER 3
+
+/* This is just an estimate for the memory pool allocation */
+#define GHES_ESTATUS_CACHE_AVG_SIZE    512
+
+#define GHES_ESTATUS_CACHES_SIZE       4
+
+#define GHES_ESTATUS_IN_CACHE_MAX_NSEC 10000000000ULL
+/* Prevent too many caches from being allocated because of RCU */
+#define GHES_ESTATUS_CACHE_ALLOCED_MAX (GHES_ESTATUS_CACHES_SIZE * 3 / 2)
+
+#define GHES_ESTATUS_CACHE_LEN(estatus_len)                    \
+       (sizeof(struct ghes_estatus_cache) + (estatus_len))
+#define GHES_ESTATUS_FROM_CACHE(estatus_cache)                 \
+       ((struct acpi_hest_generic_status *)                    \
+        ((struct ghes_estatus_cache *)(estatus_cache) + 1))
+
+#define GHES_ESTATUS_NODE_LEN(estatus_len)                     \
+       (sizeof(struct ghes_estatus_node) + (estatus_len))
+#define GHES_ESTATUS_FROM_NODE(estatus_node)                           \
+       ((struct acpi_hest_generic_status *)                            \
+        ((struct ghes_estatus_node *)(estatus_node) + 1))
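+
+/*
+ * Both layouts place the estatus payload immediately after its header
+ * struct within a single pool allocation; the "+ 1" above skips the
+ * header.
+ */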
 
 /*
  * One struct ghes is created for each generic hardware error source.
@@ -77,6 +104,22 @@ struct ghes {
        };
 };
 
+struct ghes_estatus_node {
+       struct llist_node llnode;
+       struct acpi_hest_generic *generic;
+};
+
+struct ghes_estatus_cache {
+       u32 estatus_len;
+       atomic_t count;
+       struct acpi_hest_generic *generic;
+       unsigned long long time_in;
+       struct rcu_head rcu;
+};
+
+int ghes_disable;
+module_param_named(disable, ghes_disable, bool, 0);
+
 static int ghes_panic_timeout  __read_mostly = 30;
 
 /*
@@ -121,6 +164,22 @@ static struct vm_struct *ghes_ioremap_area;
 static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi);
 static DEFINE_SPINLOCK(ghes_ioremap_lock_irq);
 
+/*
+ * printk is not safe in NMI context.  So in the NMI handler, we
+ * allocate the required memory from the lock-less memory allocator
+ * (ghes_estatus_pool), save the estatus into it, put it on the
+ * lock-less list (ghes_estatus_llist), then defer the printk to IRQ
+ * context via irq_work (ghes_proc_irq_work).
+ * ghes_estatus_pool_size_request records the pool size required by
+ * all NMI error sources.
+ */
+static struct gen_pool *ghes_estatus_pool;
+static unsigned long ghes_estatus_pool_size_request;
+static struct llist_head ghes_estatus_llist;
+static struct irq_work ghes_proc_irq_work;
+
+static struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE];
+static atomic_t ghes_estatus_cache_alloced;
+
 static int ghes_ioremap_init(void)
 {
        ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES,
@@ -180,6 +239,55 @@ static void ghes_iounmap_irq(void __iomem *vaddr_ptr)
        __flush_tlb_one(vaddr);
 }
 
+static int ghes_estatus_pool_init(void)
+{
+       ghes_estatus_pool = gen_pool_create(GHES_ESTATUS_POOL_MIN_ALLOC_ORDER, -1);
+       if (!ghes_estatus_pool)
+               return -ENOMEM;
+       return 0;
+}
+
+static void ghes_estatus_pool_free_chunk_page(struct gen_pool *pool,
+                                             struct gen_pool_chunk *chunk,
+                                             void *data)
+{
+       free_page(chunk->start_addr);
+}
+
+static void ghes_estatus_pool_exit(void)
+{
+       gen_pool_for_each_chunk(ghes_estatus_pool,
+                               ghes_estatus_pool_free_chunk_page, NULL);
+       gen_pool_destroy(ghes_estatus_pool);
+}
+
+static int ghes_estatus_pool_expand(unsigned long len)
+{
+       unsigned long i, pages, size, addr;
+       int ret;
+
+       ghes_estatus_pool_size_request += PAGE_ALIGN(len);
+       size = gen_pool_size(ghes_estatus_pool);
+       if (size >= ghes_estatus_pool_size_request)
+               return 0;
+       pages = (ghes_estatus_pool_size_request - size) / PAGE_SIZE;
+       for (i = 0; i < pages; i++) {
+               addr = __get_free_page(GFP_KERNEL);
+               if (!addr)
+                       return -ENOMEM;
+               ret = gen_pool_add(ghes_estatus_pool, addr, PAGE_SIZE, -1);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static void ghes_estatus_pool_shrink(unsigned long len)
+{
+       ghes_estatus_pool_size_request -= PAGE_ALIGN(len);
+}
+
 static struct ghes *ghes_new(struct acpi_hest_generic *generic)
 {
        struct ghes *ghes;
@@ -341,43 +449,196 @@ static void ghes_clear_estatus(struct ghes *ghes)
        ghes->flags &= ~GHES_TO_CLEAR;
 }
 
-static void ghes_do_proc(struct ghes *ghes)
+static void ghes_do_proc(const struct acpi_hest_generic_status *estatus)
 {
-       int sev, processed = 0;
+       int sev, sec_sev;
        struct acpi_hest_generic_data *gdata;
 
-       sev = ghes_severity(ghes->estatus->error_severity);
-       apei_estatus_for_each_section(ghes->estatus, gdata) {
-#ifdef CONFIG_X86_MCE
+       sev = ghes_severity(estatus->error_severity);
+       apei_estatus_for_each_section(estatus, gdata) {
+               sec_sev = ghes_severity(gdata->error_severity);
                if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
                                 CPER_SEC_PLATFORM_MEM)) {
-                       apei_mce_report_mem_error(
-                               sev == GHES_SEV_CORRECTED,
-                               (struct cper_sec_mem_err *)(gdata+1));
-                       processed = 1;
-               }
+                       struct cper_sec_mem_err *mem_err;
+                       mem_err = (struct cper_sec_mem_err *)(gdata+1);
+#ifdef CONFIG_X86_MCE
+                       apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED,
+                                                 mem_err);
 #endif
+#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
+                       if (sev == GHES_SEV_RECOVERABLE &&
+                           sec_sev == GHES_SEV_RECOVERABLE &&
+                           mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) {
+                               unsigned long pfn;
+                               pfn = mem_err->physical_addr >> PAGE_SHIFT;
+                               memory_failure_queue(pfn, 0, 0);
+                       }
+#endif
+               }
        }
 }
 
-static void ghes_print_estatus(const char *pfx, struct ghes *ghes)
+static void __ghes_print_estatus(const char *pfx,
+                                const struct acpi_hest_generic *generic,
+                                const struct acpi_hest_generic_status *estatus)
 {
-       /* Not more than 2 messages every 5 seconds */
-       static DEFINE_RATELIMIT_STATE(ratelimit, 5*HZ, 2);
-
        if (pfx == NULL) {
-               if (ghes_severity(ghes->estatus->error_severity) <=
+               if (ghes_severity(estatus->error_severity) <=
                    GHES_SEV_CORRECTED)
                        pfx = KERN_WARNING HW_ERR;
                else
                        pfx = KERN_ERR HW_ERR;
        }
-       if (__ratelimit(&ratelimit)) {
-               printk(
-       "%s""Hardware error from APEI Generic Hardware Error Source: %d\n",
-       pfx, ghes->generic->header.source_id);
-               apei_estatus_print(pfx, ghes->estatus);
+       printk("%s""Hardware error from APEI Generic Hardware Error Source: %d\n",
+              pfx, generic->header.source_id);
+       apei_estatus_print(pfx, estatus);
+}
+
+static int ghes_print_estatus(const char *pfx,
+                             const struct acpi_hest_generic *generic,
+                             const struct acpi_hest_generic_status *estatus)
+{
+       /* Not more than 2 messages every 5 seconds */
+       static DEFINE_RATELIMIT_STATE(ratelimit_corrected, 5*HZ, 2);
+       static DEFINE_RATELIMIT_STATE(ratelimit_uncorrected, 5*HZ, 2);
+       struct ratelimit_state *ratelimit;
+
+       if (ghes_severity(estatus->error_severity) <= GHES_SEV_CORRECTED)
+               ratelimit = &ratelimit_corrected;
+       else
+               ratelimit = &ratelimit_uncorrected;
+       if (__ratelimit(ratelimit)) {
+               __ghes_print_estatus(pfx, generic, estatus);
+               return 1;
        }
+       return 0;
+}
+
+/*
+ * GHES error status reporting throttle: report a wider variety of
+ * errors instead of just the most frequently occurring ones.
+ */
+static int ghes_estatus_cached(struct acpi_hest_generic_status *estatus)
+{
+       u32 len;
+       int i, cached = 0;
+       unsigned long long now;
+       struct ghes_estatus_cache *cache;
+       struct acpi_hest_generic_status *cache_estatus;
+
+       len = apei_estatus_len(estatus);
+       rcu_read_lock();
+       for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) {
+               cache = rcu_dereference(ghes_estatus_caches[i]);
+               if (cache == NULL)
+                       continue;
+               if (len != cache->estatus_len)
+                       continue;
+               cache_estatus = GHES_ESTATUS_FROM_CACHE(cache);
+               if (memcmp(estatus, cache_estatus, len))
+                       continue;
+               atomic_inc(&cache->count);
+               now = sched_clock();
+               if (now - cache->time_in < GHES_ESTATUS_IN_CACHE_MAX_NSEC)
+                       cached = 1;
+               break;
+       }
+       rcu_read_unlock();
+       return cached;
+}
+
+static struct ghes_estatus_cache *ghes_estatus_cache_alloc(
+       struct acpi_hest_generic *generic,
+       struct acpi_hest_generic_status *estatus)
+{
+       int alloced;
+       u32 len, cache_len;
+       struct ghes_estatus_cache *cache;
+       struct acpi_hest_generic_status *cache_estatus;
+
+       alloced = atomic_add_return(1, &ghes_estatus_cache_alloced);
+       if (alloced > GHES_ESTATUS_CACHE_ALLOCED_MAX) {
+               atomic_dec(&ghes_estatus_cache_alloced);
+               return NULL;
+       }
+       len = apei_estatus_len(estatus);
+       cache_len = GHES_ESTATUS_CACHE_LEN(len);
+       cache = (void *)gen_pool_alloc(ghes_estatus_pool, cache_len);
+       if (!cache) {
+               atomic_dec(&ghes_estatus_cache_alloced);
+               return NULL;
+       }
+       cache_estatus = GHES_ESTATUS_FROM_CACHE(cache);
+       memcpy(cache_estatus, estatus, len);
+       cache->estatus_len = len;
+       atomic_set(&cache->count, 0);
+       cache->generic = generic;
+       cache->time_in = sched_clock();
+       return cache;
+}
+
+static void ghes_estatus_cache_free(struct ghes_estatus_cache *cache)
+{
+       u32 len;
+
+       len = apei_estatus_len(GHES_ESTATUS_FROM_CACHE(cache));
+       len = GHES_ESTATUS_CACHE_LEN(len);
+       gen_pool_free(ghes_estatus_pool, (unsigned long)cache, len);
+       atomic_dec(&ghes_estatus_cache_alloced);
+}
+
+static void ghes_estatus_cache_rcu_free(struct rcu_head *head)
+{
+       struct ghes_estatus_cache *cache;
+
+       cache = container_of(head, struct ghes_estatus_cache, rcu);
+       ghes_estatus_cache_free(cache);
+}
+
+static void ghes_estatus_cache_add(
+       struct acpi_hest_generic *generic,
+       struct acpi_hest_generic_status *estatus)
+{
+       int i, slot = -1, count;
+       unsigned long long now, duration, period, max_period = 0;
+       struct ghes_estatus_cache *cache, *slot_cache = NULL, *new_cache;
+
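+       /*
+        * Victim selection: prefer an empty slot, then an expired entry,
+        * then the entry with the longest average interval between hits.
+        */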
+       new_cache = ghes_estatus_cache_alloc(generic, estatus);
+       if (new_cache == NULL)
+               return;
+       rcu_read_lock();
+       now = sched_clock();
+       for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) {
+               cache = rcu_dereference(ghes_estatus_caches[i]);
+               if (cache == NULL) {
+                       slot = i;
+                       slot_cache = NULL;
+                       break;
+               }
+               duration = now - cache->time_in;
+               if (duration >= GHES_ESTATUS_IN_CACHE_MAX_NSEC) {
+                       slot = i;
+                       slot_cache = cache;
+                       break;
+               }
+               count = atomic_read(&cache->count);
+               period = duration;
+               do_div(period, (count + 1));
+               if (period > max_period) {
+                       max_period = period;
+                       slot = i;
+                       slot_cache = cache;
+               }
+       }
+       /* new_cache must be put into array after its contents are written */
+       smp_wmb();
+       if (slot != -1 && cmpxchg(ghes_estatus_caches + slot,
+                                 slot_cache, new_cache) == slot_cache) {
+               if (slot_cache)
+                       call_rcu(&slot_cache->rcu, ghes_estatus_cache_rcu_free);
+       } else
+               ghes_estatus_cache_free(new_cache);
+       rcu_read_unlock();
 }
 
 static int ghes_proc(struct ghes *ghes)
@@ -387,9 +648,11 @@ static int ghes_proc(struct ghes *ghes)
        rc = ghes_read_estatus(ghes, 0);
        if (rc)
                goto out;
-       ghes_print_estatus(NULL, ghes);
-       ghes_do_proc(ghes);
-
+       if (!ghes_estatus_cached(ghes->estatus)) {
+               if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus))
+                       ghes_estatus_cache_add(ghes->generic, ghes->estatus);
+       }
+       ghes_do_proc(ghes->estatus);
 out:
        ghes_clear_estatus(ghes);
        return 0;
@@ -447,6 +710,45 @@ static int ghes_notify_sci(struct notifier_block *this,
        return ret;
 }
 
+static void ghes_proc_in_irq(struct irq_work *irq_work)
+{
+       struct llist_node *llnode, *next, *tail = NULL;
+       struct ghes_estatus_node *estatus_node;
+       struct acpi_hest_generic *generic;
+       struct acpi_hest_generic_status *estatus;
+       u32 len, node_len;
+
+       /*
+        * The llist is in reverse time order; restore proper order
+        * before processing.
+        */
+       llnode = llist_del_all(&ghes_estatus_llist);
+       while (llnode) {
+               next = llnode->next;
+               llnode->next = tail;
+               tail = llnode;
+               llnode = next;
+       }
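+       /*
+        * Walk the list in restored order: process each estatus, print
+        * and cache it if new, then return its node to the pool.
+        */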
+       llnode = tail;
+       while (llnode) {
+               next = llnode->next;
+               estatus_node = llist_entry(llnode, struct ghes_estatus_node,
+                                          llnode);
+               estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
+               len = apei_estatus_len(estatus);
+               node_len = GHES_ESTATUS_NODE_LEN(len);
+               ghes_do_proc(estatus);
+               if (!ghes_estatus_cached(estatus)) {
+                       generic = estatus_node->generic;
+                       if (ghes_print_estatus(NULL, generic, estatus))
+                               ghes_estatus_cache_add(generic, estatus);
+               }
+               gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node,
+                             node_len);
+               llnode = next;
+       }
+}
+
 static int ghes_notify_nmi(struct notifier_block *this,
                                  unsigned long cmd, void *data)
 {
@@ -476,7 +778,8 @@ static int ghes_notify_nmi(struct notifier_block *this,
 
        if (sev_global >= GHES_SEV_PANIC) {
                oops_begin();
-               ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global);
+               __ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global->generic,
+                                    ghes_global->estatus);
                /* reboot to log the error! */
                if (panic_timeout == 0)
                        panic_timeout = ghes_panic_timeout;
@@ -484,12 +787,34 @@ static int ghes_notify_nmi(struct notifier_block *this,
        }
 
        list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
+#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+               u32 len, node_len;
+               struct ghes_estatus_node *estatus_node;
+               struct acpi_hest_generic_status *estatus;
+#endif
                if (!(ghes->flags & GHES_TO_CLEAR))
                        continue;
-               /* Do not print estatus because printk is not NMI safe */
-               ghes_do_proc(ghes);
+#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+               if (ghes_estatus_cached(ghes->estatus))
+                       goto next;
+               /* Save estatus for further processing in IRQ context */
+               len = apei_estatus_len(ghes->estatus);
+               node_len = GHES_ESTATUS_NODE_LEN(len);
+               estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool,
+                                                     node_len);
+               if (estatus_node) {
+                       estatus_node->generic = ghes->generic;
+                       estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
+                       memcpy(estatus, ghes->estatus, len);
+                       llist_add(&estatus_node->llnode, &ghes_estatus_llist);
+               }
+next:
+#endif
                ghes_clear_estatus(ghes);
        }
+#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+       irq_work_queue(&ghes_proc_irq_work);
+#endif
 
 out:
        raw_spin_unlock(&ghes_nmi_lock);
@@ -504,10 +829,26 @@ static struct notifier_block ghes_notifier_nmi = {
        .notifier_call = ghes_notify_nmi,
 };
 
+static unsigned long ghes_esource_prealloc_size(
+       const struct acpi_hest_generic *generic)
+{
+       unsigned long block_length, prealloc_records, prealloc_size;
+
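+       /*
+        * Clamp both the record size and count so a single error source
+        * cannot request an unbounded preallocation.
+        */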
+       block_length = min_t(unsigned long, generic->error_block_length,
+                            GHES_ESTATUS_MAX_SIZE);
+       prealloc_records = max_t(unsigned long,
+                                generic->records_to_preallocate, 1);
+       prealloc_size = min_t(unsigned long, block_length * prealloc_records,
+                             GHES_ESOURCE_PREALLOC_MAX_SIZE);
+
+       return prealloc_size;
+}
+
 static int __devinit ghes_probe(struct platform_device *ghes_dev)
 {
        struct acpi_hest_generic *generic;
        struct ghes *ghes = NULL;
+       unsigned long len;
        int rc = -EINVAL;
 
        generic = *(struct acpi_hest_generic **)ghes_dev->dev.platform_data;
@@ -573,6 +914,8 @@ static int __devinit ghes_probe(struct platform_device *ghes_dev)
                mutex_unlock(&ghes_list_mutex);
                break;
        case ACPI_HEST_NOTIFY_NMI:
+               len = ghes_esource_prealloc_size(generic);
+               ghes_estatus_pool_expand(len);
                mutex_lock(&ghes_list_mutex);
                if (list_empty(&ghes_nmi))
                        register_die_notifier(&ghes_notifier_nmi);
@@ -597,6 +940,7 @@ static int __devexit ghes_remove(struct platform_device *ghes_dev)
 {
        struct ghes *ghes;
        struct acpi_hest_generic *generic;
+       unsigned long len;
 
        ghes = platform_get_drvdata(ghes_dev);
        generic = ghes->generic;
@@ -627,6 +971,8 @@ static int __devexit ghes_remove(struct platform_device *ghes_dev)
                 * freed after NMI handler finishes.
                 */
                synchronize_rcu();
+               len = ghes_esource_prealloc_size(generic);
+               ghes_estatus_pool_shrink(len);
                break;
        default:
                BUG();
@@ -662,15 +1008,43 @@ static int __init ghes_init(void)
                return -EINVAL;
        }
 
+       if (ghes_disable) {
+               pr_info(GHES_PFX "GHES is not enabled!\n");
+               return -EINVAL;
+       }
+
+       init_irq_work(&ghes_proc_irq_work, ghes_proc_in_irq);
+
        rc = ghes_ioremap_init();
        if (rc)
                goto err;
 
-       rc = platform_driver_register(&ghes_platform_driver);
+       rc = ghes_estatus_pool_init();
        if (rc)
                goto err_ioremap_exit;
 
+       rc = ghes_estatus_pool_expand(GHES_ESTATUS_CACHE_AVG_SIZE *
+                                     GHES_ESTATUS_CACHE_ALLOCED_MAX);
+       if (rc)
+               goto err_pool_exit;
+
+       rc = platform_driver_register(&ghes_platform_driver);
+       if (rc)
+               goto err_pool_exit;
+
+       rc = apei_osc_setup();
+       if (rc == 0 && osc_sb_apei_support_acked)
+               pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit and WHEA _OSC.\n");
+       else if (rc == 0 && !osc_sb_apei_support_acked)
+               pr_info(GHES_PFX "APEI firmware first mode is enabled by WHEA _OSC.\n");
+       else if (rc && osc_sb_apei_support_acked)
+               pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit.\n");
+       else
+               pr_info(GHES_PFX "Failed to enable APEI firmware first mode.\n");
+
        return 0;
+err_pool_exit:
+       ghes_estatus_pool_exit();
 err_ioremap_exit:
        ghes_ioremap_exit();
 err:
@@ -680,6 +1054,7 @@ err:
 static void __exit ghes_exit(void)
 {
        platform_driver_unregister(&ghes_platform_driver);
+       ghes_estatus_pool_exit();
        ghes_ioremap_exit();
 }
 
index 181bc2f..05fee06 100644 (file)
@@ -231,16 +231,17 @@ void __init acpi_hest_init(void)
                goto err;
        }
 
-       rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count);
-       if (rc)
-               goto err;
-
-       rc = hest_ghes_dev_register(ghes_count);
-       if (!rc) {
-               pr_info(HEST_PFX "Table parsing has been initialized.\n");
-               return;
+       if (!ghes_disable) {
+               rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count);
+               if (rc)
+                       goto err;
+               rc = hest_ghes_dev_register(ghes_count);
+               if (rc)
+                       goto err;
        }
 
+       pr_info(HEST_PFX "Table parsing has been initialized.\n");
+       return;
 err:
        hest_disable = 1;
 }
index 2c66135..87c0a8d 100644 (file)
@@ -55,6 +55,9 @@
 #define ACPI_BATTERY_NOTIFY_INFO       0x81
 #define ACPI_BATTERY_NOTIFY_THRESHOLD   0x82
 
+/* Battery power unit: 0 means mW, 1 means mA */
+#define ACPI_BATTERY_POWER_UNIT_MA     1
+
 #define _COMPONENT             ACPI_BATTERY_COMPONENT
 
 ACPI_MODULE_NAME("battery");
@@ -91,11 +94,6 @@ MODULE_DEVICE_TABLE(acpi, battery_device_ids);
 enum {
        ACPI_BATTERY_ALARM_PRESENT,
        ACPI_BATTERY_XINFO_PRESENT,
-       /* For buggy DSDTs that report negative 16-bit values for either
-        * charging or discharging current and/or report 0 as 65536
-        * due to bad math.
-        */
-       ACPI_BATTERY_QUIRK_SIGNED16_CURRENT,
        ACPI_BATTERY_QUIRK_PERCENTAGE_CAPACITY,
 };
 
@@ -301,7 +299,8 @@ static enum power_supply_property energy_battery_props[] = {
 #ifdef CONFIG_ACPI_PROCFS_POWER
 inline char *acpi_battery_units(struct acpi_battery *battery)
 {
-       return (battery->power_unit)?"mA":"mW";
+       return (battery->power_unit == ACPI_BATTERY_POWER_UNIT_MA) ?
+               "mA" : "mW";
 }
 #endif
 
@@ -461,9 +460,17 @@ static int acpi_battery_get_state(struct acpi_battery *battery)
        battery->update_time = jiffies;
        kfree(buffer.pointer);
 
-       if (test_bit(ACPI_BATTERY_QUIRK_SIGNED16_CURRENT, &battery->flags) &&
-           battery->rate_now != -1)
+       /* For buggy DSDTs that report negative 16-bit values for either
+        * charging or discharging current and/or report 0 as 65536
+        * due to bad math.
+        */
+       if (battery->power_unit == ACPI_BATTERY_POWER_UNIT_MA &&
+               battery->rate_now != ACPI_BATTERY_VALUE_UNKNOWN &&
+               (s16)(battery->rate_now) < 0) {
                battery->rate_now = abs((s16)battery->rate_now);
+               printk_once(KERN_WARNING FW_BUG "battery: (dis)charge rate"
+                       " invalid.\n");
+       }
 
        if (test_bit(ACPI_BATTERY_QUIRK_PERCENTAGE_CAPACITY, &battery->flags)
            && battery->capacity_now >= 0 && battery->capacity_now <= 100)
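The rate quirk above compensates for firmware that stores a signed 16-bit rate in an unsigned field. A hedged stand-alone illustration of the same reinterpretation; the sample value is made up:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        uint32_t rate_now = 64536;           /* made-up raw _BST value */
        int16_t as_s16 = (int16_t)rate_now;  /* reinterpret low 16 bits */

        if (as_s16 < 0)
                rate_now = abs(as_s16);      /* 64536 -> 1000 */
        printf("%u mA\n", (unsigned)rate_now);
        return 0;
}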
@@ -544,7 +551,7 @@ static int sysfs_add_battery(struct acpi_battery *battery)
 {
        int result;
 
-       if (battery->power_unit) {
+       if (battery->power_unit == ACPI_BATTERY_POWER_UNIT_MA) {
                battery->bat.properties = charge_battery_props;
                battery->bat.num_properties =
                        ARRAY_SIZE(charge_battery_props);
@@ -566,18 +573,16 @@ static int sysfs_add_battery(struct acpi_battery *battery)
 
 static void sysfs_remove_battery(struct acpi_battery *battery)
 {
-       if (!battery->bat.dev)
+       mutex_lock(&battery->lock);
+       if (!battery->bat.dev) {
+               mutex_unlock(&battery->lock);
                return;
+       }
+
        device_remove_file(battery->bat.dev, &alarm_attr);
        power_supply_unregister(&battery->bat);
        battery->bat.dev = NULL;
-}
-
-static void acpi_battery_quirks(struct acpi_battery *battery)
-{
-       if (dmi_name_in_vendors("Acer") && battery->power_unit) {
-               set_bit(ACPI_BATTERY_QUIRK_SIGNED16_CURRENT, &battery->flags);
-       }
+       mutex_unlock(&battery->lock);
 }
 
 /*
@@ -592,7 +597,7 @@ static void acpi_battery_quirks(struct acpi_battery *battery)
  *
  * Handle this correctly so that they won't break userspace.
  */
-static void acpi_battery_quirks2(struct acpi_battery *battery)
+static void acpi_battery_quirks(struct acpi_battery *battery)
 {
        if (test_bit(ACPI_BATTERY_QUIRK_PERCENTAGE_CAPACITY, &battery->flags))
                return;
@@ -623,13 +628,15 @@ static int acpi_battery_update(struct acpi_battery *battery)
                result = acpi_battery_get_info(battery);
                if (result)
                        return result;
-               acpi_battery_quirks(battery);
                acpi_battery_init_alarm(battery);
        }
-       if (!battery->bat.dev)
-               sysfs_add_battery(battery);
+       if (!battery->bat.dev) {
+               result = sysfs_add_battery(battery);
+               if (result)
+                       return result;
+       }
        result = acpi_battery_get_state(battery);
-       acpi_battery_quirks2(battery);
+       acpi_battery_quirks(battery);
        return result;
 }
 
@@ -863,7 +870,7 @@ DECLARE_FILE_FUNCTIONS(alarm);
                }, \
        }
 
-static struct battery_file {
+static const struct battery_file {
        struct file_operations ops;
        mode_t mode;
        const char *name;
@@ -948,9 +955,12 @@ static int battery_notify(struct notifier_block *nb,
        struct acpi_battery *battery = container_of(nb, struct acpi_battery,
                                                    pm_nb);
        switch (mode) {
+       case PM_POST_HIBERNATION:
        case PM_POST_SUSPEND:
-               sysfs_remove_battery(battery);
-               sysfs_add_battery(battery);
+               if (battery->bat.dev) {
+                       sysfs_remove_battery(battery);
+                       sysfs_add_battery(battery);
+               }
                break;
        }
 
@@ -975,25 +985,33 @@ static int acpi_battery_add(struct acpi_device *device)
        if (ACPI_SUCCESS(acpi_get_handle(battery->device->handle,
                        "_BIX", &handle)))
                set_bit(ACPI_BATTERY_XINFO_PRESENT, &battery->flags);
-       acpi_battery_update(battery);
+       result = acpi_battery_update(battery);
+       if (result)
+               goto fail;
 #ifdef CONFIG_ACPI_PROCFS_POWER
        result = acpi_battery_add_fs(device);
 #endif
-       if (!result) {
-               printk(KERN_INFO PREFIX "%s Slot [%s] (battery %s)\n",
-                       ACPI_BATTERY_DEVICE_NAME, acpi_device_bid(device),
-                       device->status.battery_present ? "present" : "absent");
-       } else {
+       if (result) {
 #ifdef CONFIG_ACPI_PROCFS_POWER
                acpi_battery_remove_fs(device);
 #endif
-               kfree(battery);
+               goto fail;
        }
 
+       printk(KERN_INFO PREFIX "%s Slot [%s] (battery %s)\n",
+               ACPI_BATTERY_DEVICE_NAME, acpi_device_bid(device),
+               device->status.battery_present ? "present" : "absent");
+
        battery->pm_nb.notifier_call = battery_notify;
        register_pm_notifier(&battery->pm_nb);
 
        return result;
+
+fail:
+       sysfs_remove_battery(battery);
+       mutex_destroy(&battery->lock);
+       kfree(battery);
+       return result;
 }
 
 static int acpi_battery_remove(struct acpi_device *device, int type)
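The sysfs_remove_battery() change earlier in this file tests and clears bat.dev under battery->lock, so the PM notifier and the remove path cannot double-unregister. A rough user-space analogue of that lock-guarded, idempotent teardown pattern, with illustrative names:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct dev_state {
        pthread_mutex_t lock;
        bool registered;
};

/* Idempotent teardown: the flag is tested and cleared under the lock,
 * so a second caller sees registered == false and returns early. */
static void teardown(struct dev_state *d)
{
        pthread_mutex_lock(&d->lock);
        if (!d->registered) {
                pthread_mutex_unlock(&d->lock);
                return;
        }
        d->registered = false;
        puts("unregistered once");
        pthread_mutex_unlock(&d->lock);
}

int main(void)
{
        struct dev_state d = { PTHREAD_MUTEX_INITIALIZER, true };

        teardown(&d);
        teardown(&d);   /* no-op on the second call */
        return 0;
}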
index d1e06c1..437ddbf 100644 (file)
@@ -39,6 +39,7 @@
 #include <linux/pci.h>
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_drivers.h>
+#include <acpi/apei.h>
 #include <linux/dmi.h>
 #include <linux/suspend.h>
 
@@ -519,6 +520,7 @@ out_kfree:
 }
 EXPORT_SYMBOL(acpi_run_osc);
 
+bool osc_sb_apei_support_acked;
 static u8 sb_uuid_str[] = "0811B06E-4A27-44F9-8D60-3CBBC22E7B48";
 static void acpi_bus_osc_support(void)
 {
@@ -541,11 +543,19 @@ static void acpi_bus_osc_support(void)
 #if defined(CONFIG_ACPI_PROCESSOR) || defined(CONFIG_ACPI_PROCESSOR_MODULE)
        capbuf[OSC_SUPPORT_TYPE] |= OSC_SB_PPC_OST_SUPPORT;
 #endif
+
+       if (!ghes_disable)
+               capbuf[OSC_SUPPORT_TYPE] |= OSC_SB_APEI_SUPPORT;
        if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle)))
                return;
-       if (ACPI_SUCCESS(acpi_run_osc(handle, &context)))
+       if (ACPI_SUCCESS(acpi_run_osc(handle, &context))) {
+               u32 *capbuf_ret = context.ret.pointer;
+               if (context.ret.length > OSC_SUPPORT_TYPE)
+                       osc_sb_apei_support_acked =
+                               capbuf_ret[OSC_SUPPORT_TYPE] & OSC_SB_APEI_SUPPORT;
                kfree(context.ret.pointer);
-       /* do we need to check the returned cap? Sounds no */
+       }
+       /* do we need to check the other returned caps? It seems not. */
 }
 
 /* --------------------------------------------------------------------------
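The new code reads one capability DWORD back out of the _OSC reply, guarding the buffer length before indexing. A stand-alone sketch of that check; the index and bit value below are illustrative placeholders, not the real ACPI constants:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define OSC_SUPPORT_DWORD   1          /* illustrative index */
#define OSC_SB_APEI_BIT     (1u << 4)  /* illustrative bit */

static bool apei_acked(const uint32_t *ret, size_t len_dwords)
{
        /* Guard the length before indexing, as the hunk above does. */
        if (len_dwords <= OSC_SUPPORT_DWORD)
                return false;
        return ret[OSC_SUPPORT_DWORD] & OSC_SB_APEI_BIT;
}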
index 1864ad3..19a6113 100644 (file)
@@ -77,7 +77,7 @@ struct dock_dependent_device {
        struct list_head list;
        struct list_head hotplug_list;
        acpi_handle handle;
-       struct acpi_dock_ops *ops;
+       const struct acpi_dock_ops *ops;
        void *context;
 };
 
@@ -589,7 +589,7 @@ EXPORT_SYMBOL_GPL(unregister_dock_notifier);
  * the dock driver after _DCK is executed.
  */
 int
-register_hotplug_dock_device(acpi_handle handle, struct acpi_dock_ops *ops,
+register_hotplug_dock_device(acpi_handle handle, const struct acpi_dock_ops *ops,
                             void *context)
 {
        struct dock_dependent_device *dd;
index 05b4420..22f918b 100644 (file)
@@ -92,7 +92,7 @@ static ssize_t acpi_ec_write_io(struct file *f, const char __user *buf,
        return count;
 }
 
-static struct file_operations acpi_ec_io_ops = {
+static const struct file_operations acpi_ec_io_ops = {
        .owner = THIS_MODULE,
        .open  = acpi_ec_open_io,
        .read  = acpi_ec_read_io,
index 467479f..0f0356c 100644 (file)
@@ -110,7 +110,7 @@ fan_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state)
        return result;
 }
 
-static struct thermal_cooling_device_ops fan_cooling_ops = {
+static const struct thermal_cooling_device_ops fan_cooling_ops = {
        .get_max_state = fan_get_max_state,
        .get_cur_state = fan_get_cur_state,
        .set_cur_state = fan_set_cur_state,
index 372f9b7..fa32f58 100644 (file)
@@ -155,7 +155,7 @@ static u32 acpi_osi_handler(acpi_string interface, u32 supported)
 {
        if (!strcmp("Linux", interface)) {
 
-               printk(KERN_NOTICE FW_BUG PREFIX
+               printk_once(KERN_NOTICE FW_BUG PREFIX
                        "BIOS _OSI(Linux) query %s%s\n",
                        osi_linux.enable ? "honored" : "ignored",
                        osi_linux.cmdline ? " via cmdline" :
@@ -237,8 +237,23 @@ void acpi_os_vprintf(const char *fmt, va_list args)
 #endif
 }
 
+#ifdef CONFIG_KEXEC
+static unsigned long acpi_rsdp;
+static int __init setup_acpi_rsdp(char *arg)
+{
+       acpi_rsdp = simple_strtoul(arg, NULL, 16);
+       return 0;
+}
+early_param("acpi_rsdp", setup_acpi_rsdp);
+#endif
+
 acpi_physical_address __init acpi_os_get_root_pointer(void)
 {
+#ifdef CONFIG_KEXEC
+       if (acpi_rsdp)
+               return acpi_rsdp;
+#endif
+
        if (efi_enabled) {
                if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
                        return efi.acpi20;
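With the early_param above, a kexec'd kernel can be told where the RSDP lives instead of rediscovering it, e.g. by appending something like acpi_rsdp=0x7fe386a0 (address made up) to the command line. The parsing step, pulled out into a stand-alone sketch:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        const char *arg = "0x7fe386a0";  /* made-up physical address */
        unsigned long rsdp = strtoul(arg, NULL, 16);

        printf("RSDP at %#lx\n", rsdp);
        return 0;
}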
@@ -1083,7 +1098,13 @@ struct osi_setup_entry {
        bool enable;
 };
 
-static struct osi_setup_entry __initdata osi_setup_entries[OSI_STRING_ENTRIES_MAX];
+static struct osi_setup_entry __initdata
+               osi_setup_entries[OSI_STRING_ENTRIES_MAX] = {
+       {"Module Device", true},
+       {"Processor Device", true},
+       {"3.0 _SCP Extensions", true},
+       {"Processor Aggregator Device", true},
+};
 
 void __init acpi_osi_setup(char *str)
 {
index f907cfb..7f9eba9 100644 (file)
@@ -303,6 +303,61 @@ void acpi_pci_irq_del_prt(struct pci_bus *bus)
 /* --------------------------------------------------------------------------
                           PCI Interrupt Routing Support
    -------------------------------------------------------------------------- */
+#ifdef CONFIG_X86_IO_APIC
+extern int noioapicquirk;
+extern int noioapicreroute;
+
+static int bridge_has_boot_interrupt_variant(struct pci_bus *bus)
+{
+       struct pci_bus *bus_it;
+
+       for (bus_it = bus ; bus_it ; bus_it = bus_it->parent) {
+               if (!bus_it->self)
+                       return 0;
+               if (bus_it->self->irq_reroute_variant)
+                       return bus_it->self->irq_reroute_variant;
+       }
+       return 0;
+}
+
+/*
+ * Some chipsets (e.g. Intel 6700PXH) generate a legacy INTx when the IRQ
+ * entry in the chipset's IO-APIC is masked (as, e.g. the RT kernel does
+ * during interrupt handling). When this INTx generation cannot be disabled,
+ * we reroute these interrupts to their legacy equivalent to get rid of
+ * spurious interrupts.
+ */
+static int acpi_reroute_boot_interrupt(struct pci_dev *dev,
+                                      struct acpi_prt_entry *entry)
+{
+       if (noioapicquirk || noioapicreroute) {
+               return 0;
+       } else {
+               switch (bridge_has_boot_interrupt_variant(dev->bus)) {
+               case 0:
+                       /* no rerouting necessary */
+                       return 0;
+               case INTEL_IRQ_REROUTE_VARIANT:
+                       /*
+                        * Remap according to INTx routing table in 6700PXH
+                        * specs, intel order number 302628-002, section
+                        * 2.15.2. Other chipsets (80332, ...) have the same
+                        * mapping and are handled here as well.
+                        */
+                       dev_info(&dev->dev, "PCI IRQ %d -> rerouted to legacy "
+                                "IRQ %d\n", entry->index,
+                                (entry->index % 4) + 16);
+                       entry->index = (entry->index % 4) + 16;
+                       return 1;
+               default:
+                       dev_warn(&dev->dev, "Cannot reroute IRQ %d to legacy "
+                                "IRQ: unknown mapping\n", entry->index);
+                       return -1;
+               }
+       }
+}
+#endif /* CONFIG_X86_IO_APIC */
+
 static struct acpi_prt_entry *acpi_pci_irq_lookup(struct pci_dev *dev, int pin)
 {
        struct acpi_prt_entry *entry;
@@ -311,6 +366,9 @@ static struct acpi_prt_entry *acpi_pci_irq_lookup(struct pci_dev *dev, int pin)
 
        entry = acpi_pci_irq_find_prt_entry(dev, pin);
        if (entry) {
+#ifdef CONFIG_X86_IO_APIC
+               acpi_reroute_boot_interrupt(dev, entry);
+#endif /* CONFIG_X86_IO_APIC */
                ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %s[%c] _PRT entry\n",
                                  pci_name(dev), pin_name(pin)));
                return entry;
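The remap above folds each boot-interrupt index onto one of four legacy IRQ lines via (index % 4) + 16. A quick stand-alone check of the mapping:

#include <stdio.h>

int main(void)
{
        /* Indexes 16..23 fold onto legacy IRQs 16..19. */
        for (int index = 16; index < 24; index++)
                printf("PCI IRQ %d -> legacy IRQ %d\n",
                       index, (index % 4) + 16);
        return 0;
}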
index d06078d..2672c79 100644 (file)
@@ -485,7 +485,8 @@ static int __devinit acpi_pci_root_add(struct acpi_device *device)
                root->secondary.end = 0xFF;
                printk(KERN_WARNING FW_BUG PREFIX
                       "no secondary bus range in _CRS\n");
-               status = acpi_evaluate_integer(device->handle, METHOD_NAME__BBN,                                               NULL, &bus);
+               status = acpi_evaluate_integer(device->handle, METHOD_NAME__BBN,
+                                              NULL, &bus);
                if (ACPI_SUCCESS(status))
                        root->secondary.start = bus;
                else if (status == AE_NOT_FOUND)
index 79cb653..870550d 100644 (file)
@@ -244,7 +244,7 @@ processor_set_cur_state(struct thermal_cooling_device *cdev,
        return result;
 }
 
-struct thermal_cooling_device_ops processor_cooling_ops = {
+const struct thermal_cooling_device_ops processor_cooling_ops = {
        .get_max_state = processor_get_max_state,
        .get_cur_state = processor_get_cur_state,
        .set_cur_state = processor_set_cur_state,
index 50658ff..6e36d0c 100644 (file)
@@ -130,6 +130,9 @@ struct acpi_sbs {
 
 #define to_acpi_sbs(x) container_of(x, struct acpi_sbs, charger)
 
+static int acpi_sbs_remove(struct acpi_device *device, int type);
+static int acpi_battery_get_state(struct acpi_battery *battery);
+
 static inline int battery_scale(int log)
 {
        int scale = 1;
@@ -195,6 +198,8 @@ static int acpi_sbs_battery_get_property(struct power_supply *psy,
 
        if ((!battery->present) && psp != POWER_SUPPLY_PROP_PRESENT)
                return -ENODEV;
+
+       acpi_battery_get_state(battery);
        switch (psp) {
        case POWER_SUPPLY_PROP_STATUS:
                if (battery->rate_now < 0)
@@ -225,11 +230,17 @@ static int acpi_sbs_battery_get_property(struct power_supply *psy,
        case POWER_SUPPLY_PROP_POWER_NOW:
                val->intval = abs(battery->rate_now) *
                                acpi_battery_ipscale(battery) * 1000;
+               val->intval *= (acpi_battery_mode(battery)) ?
+                               (battery->voltage_now *
+                               acpi_battery_vscale(battery) / 1000) : 1;
                break;
        case POWER_SUPPLY_PROP_CURRENT_AVG:
        case POWER_SUPPLY_PROP_POWER_AVG:
                val->intval = abs(battery->rate_avg) *
                                acpi_battery_ipscale(battery) * 1000;
+               val->intval *= (acpi_battery_mode(battery)) ?
+                               (battery->voltage_now *
+                               acpi_battery_vscale(battery) / 1000) : 1;
                break;
        case POWER_SUPPLY_PROP_CAPACITY:
                val->intval = battery->state_of_charge;
@@ -903,8 +914,6 @@ static void acpi_sbs_callback(void *context)
        }
 }
 
-static int acpi_sbs_remove(struct acpi_device *device, int type);
-
 static int acpi_sbs_add(struct acpi_device *device)
 {
        struct acpi_sbs *sbs;
index 6c94960..3ed80b2 100644 (file)
@@ -428,6 +428,22 @@ static struct dmi_system_id __initdata acpisleep_dmi_table[] = {
                DMI_MATCH(DMI_PRODUCT_NAME, "1000 Series"),
                },
        },
+       {
+       .callback = init_old_suspend_ordering,
+       .ident = "Asus A8N-SLI DELUXE",
+       .matches = {
+               DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
+               DMI_MATCH(DMI_BOARD_NAME, "A8N-SLI DELUXE"),
+               },
+       },
+       {
+       .callback = init_old_suspend_ordering,
+       .ident = "Asus A8N-SLI Premium",
+       .matches = {
+               DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
+               DMI_MATCH(DMI_BOARD_NAME, "A8N-SLI Premium"),
+               },
+       },
        {},
 };
 #endif /* CONFIG_SUSPEND */
index 77255f2..c538d0e 100644 (file)
@@ -149,12 +149,12 @@ static int param_get_debug_level(char *buffer, const struct kernel_param *kp)
        return result;
 }
 
-static struct kernel_param_ops param_ops_debug_layer = {
+static const struct kernel_param_ops param_ops_debug_layer = {
        .set = param_set_uint,
        .get = param_get_debug_layer,
 };
 
-static struct kernel_param_ops param_ops_debug_level = {
+static const struct kernel_param_ops param_ops_debug_level = {
        .set = param_set_uint,
        .get = param_get_debug_level,
 };
index 2607e17..48fbc64 100644 (file)
@@ -812,7 +812,7 @@ acpi_thermal_unbind_cooling_device(struct thermal_zone_device *thermal,
                                thermal_zone_unbind_cooling_device);
 }
 
-static struct thermal_zone_device_ops acpi_thermal_zone_ops = {
+static const struct thermal_zone_device_ops acpi_thermal_zone_ops = {
        .bind = acpi_thermal_bind_cooling_device,
        .unbind = acpi_thermal_unbind_cooling_device,
        .get_temp = thermal_get_temp,
index ada4b4d..08a44b5 100644 (file)
@@ -307,7 +307,7 @@ video_set_cur_state(struct thermal_cooling_device *cooling_dev, unsigned long st
        return acpi_video_device_lcd_set_level(video, level);
 }
 
-static struct thermal_cooling_device_ops video_cooling_ops = {
+static const struct thermal_cooling_device_ops video_cooling_ops = {
        .get_max_state = video_get_max_state,
        .get_cur_state = video_get_cur_state,
        .set_cur_state = video_set_cur_state,
index e0a5b55..bb7c5f1 100644 (file)
@@ -218,12 +218,12 @@ static void ata_acpi_dev_uevent(acpi_handle handle, u32 event, void *data)
        ata_acpi_uevent(dev->link->ap, dev, event);
 }
 
-static struct acpi_dock_ops ata_acpi_dev_dock_ops = {
+static const struct acpi_dock_ops ata_acpi_dev_dock_ops = {
        .handler = ata_acpi_dev_notify_dock,
        .uevent = ata_acpi_dev_uevent,
 };
 
-static struct acpi_dock_ops ata_acpi_ap_dock_ops = {
+static const struct acpi_dock_ops ata_acpi_ap_dock_ops = {
        .handler = ata_acpi_ap_notify_dock,
        .uevent = ata_acpi_ap_uevent,
 };
index 49502bc..423fd56 100644 (file)
@@ -616,5 +616,16 @@ config MSM_SMD_PKT
          Enables userspace clients to read and write to some packet SMD
          ports via device interface for MSM chipset.
 
+config TILE_SROM
+       bool "Character-device access via hypervisor to the Tilera SPI ROM"
+       depends on TILE
+       default y
+       ---help---
+         This device provides character-level read-write access
+         to the SROM, typically via the "0", "1", and "2" devices
+         in /dev/srom/.  The Tilera hypervisor makes the flash
+         device appear much like a simple EEPROM, and knows
+         how to partition a single ROM for multiple purposes.
+
 endmenu
 
index 7a00672..32762ba 100644 (file)
@@ -63,3 +63,5 @@ obj-$(CONFIG_RAMOOPS)         += ramoops.o
 
 obj-$(CONFIG_JS_RTC)           += js-rtc.o
 js-rtc-y = rtc.o
+
+obj-$(CONFIG_TILE_SROM)                += tile-srom.o
index fca0c51..810aff9 100644 (file)
@@ -147,6 +147,14 @@ static int __init ramoops_probe(struct platform_device *pdev)
        cxt->phys_addr = pdata->mem_address;
        cxt->record_size = pdata->record_size;
        cxt->dump_oops = pdata->dump_oops;
+       /*
+        * Update the module parameter variables as well so they are visible
+        * through /sys/module/ramoops/parameters/
+        */
+       mem_size = pdata->mem_size;
+       mem_address = pdata->mem_address;
+       record_size = pdata->record_size;
+       dump_oops = pdata->dump_oops;
 
        if (!request_mem_region(cxt->phys_addr, cxt->size, "ramoops")) {
                pr_err("request mem region failed\n");
diff --git a/drivers/char/tile-srom.c b/drivers/char/tile-srom.c
new file mode 100644 (file)
index 0000000..cf3ee00
--- /dev/null
@@ -0,0 +1,481 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * SPI Flash ROM driver
+ *
+ * This source code is derived from code provided in "Linux Device
+ * Drivers, Third Edition", by Jonathan Corbet, Alessandro Rubini, and
+ * Greg Kroah-Hartman, published by O'Reilly Media, Inc.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/init.h>
+#include <linux/kernel.h>      /* printk() */
+#include <linux/slab.h>                /* kmalloc() */
+#include <linux/fs.h>          /* everything... */
+#include <linux/errno.h>       /* error codes */
+#include <linux/types.h>       /* size_t */
+#include <linux/proc_fs.h>
+#include <linux/fcntl.h>       /* O_ACCMODE */
+#include <linux/aio.h>
+#include <linux/pagemap.h>
+#include <linux/hugetlb.h>
+#include <linux/uaccess.h>
+#include <linux/platform_device.h>
+#include <hv/hypervisor.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/delay.h>
+#include <hv/drv_srom_intf.h>
+
+/*
+ * Size of our hypervisor I/O requests.  We break up large transfers
+ * so that we don't spend large uninterrupted spans of time in the
+ * hypervisor.  Erasing an SROM sector takes a significant fraction of
+ * a second, so if we allowed the user to, say, do one I/O to write the
+ * entire ROM, we'd get soft lockup timeouts, or worse.
+ */
+#define SROM_CHUNK_SIZE ((size_t)4096)
+
+/*
+ * When hypervisor is busy (e.g. erasing), poll the status periodically.
+ */
+
+/*
+ * Interval to poll the state in msec
+ */
+#define SROM_WAIT_TRY_INTERVAL 20
+
+/*
+ * Maximum times to poll the state
+ */
+#define SROM_MAX_WAIT_TRY_TIMES 1000
+
+struct srom_dev {
+       int hv_devhdl;                  /* Handle for hypervisor device */
+       u32 total_size;                 /* Size of this device */
+       u32 sector_size;                /* Size of a sector */
+       u32 page_size;                  /* Size of a page */
+       struct mutex lock;              /* Allow only one accessor at a time */
+};
+
+static int srom_major;                 /* Dynamic major by default */
+module_param(srom_major, int, 0);
+MODULE_AUTHOR("Tilera Corporation");
+MODULE_LICENSE("GPL");
+
+static int srom_devs;                  /* Number of SROM partitions */
+static struct cdev srom_cdev;
+static struct class *srom_class;
+static struct srom_dev *srom_devices;
+
+/*
+ * Handle calling the hypervisor and managing EAGAIN/EBUSY.
+ */
+
+static ssize_t _srom_read(int hv_devhdl, void *buf,
+                         loff_t off, size_t count)
+{
+       int retval, retries = SROM_MAX_WAIT_TRY_TIMES;
+       for (;;) {
+               retval = hv_dev_pread(hv_devhdl, 0, (HV_VirtAddr)buf,
+                                     count, off);
+               if (retval >= 0)
+                       return retval;
+               if (retval == HV_EAGAIN)
+                       continue;
+               if (retval == HV_EBUSY && --retries > 0) {
+                       msleep(SROM_WAIT_TRY_INTERVAL);
+                       continue;
+               }
+               pr_err("_srom_read: error %d\n", retval);
+               return -EIO;
+       }
+}
+
+static ssize_t _srom_write(int hv_devhdl, const void *buf,
+                          loff_t off, size_t count)
+{
+       int retval, retries = SROM_MAX_WAIT_TRY_TIMES;
+       for (;;) {
+               retval = hv_dev_pwrite(hv_devhdl, 0, (HV_VirtAddr)buf,
+                                      count, off);
+               if (retval >= 0)
+                       return retval;
+               if (retval == HV_EAGAIN)
+                       continue;
+               if (retval == HV_EBUSY && --retries > 0) {
+                       msleep(SROM_WAIT_TRY_INTERVAL);
+                       continue;
+               }
+               pr_err("_srom_write: error %d\n", retval);
+               return -EIO;
+       }
+}
+
+/**
+ * srom_open() - Device open routine.
+ * @inode: Inode for this device.
+ * @filp: File for this specific open of the device.
+ *
+ * Returns zero, or an error code.
+ */
+static int srom_open(struct inode *inode, struct file *filp)
+{
+       filp->private_data = &srom_devices[iminor(inode)];
+       return 0;
+}
+
+
+/**
+ * srom_release() - Device release routine.
+ * @inode: Inode for this device.
+ * @filp: File for this specific open of the device.
+ *
+ * Returns zero, or an error code.
+ */
+static int srom_release(struct inode *inode, struct file *filp)
+{
+       struct srom_dev *srom = filp->private_data;
+       char dummy;
+
+       /* Make sure we've flushed anything written to the ROM. */
+       mutex_lock(&srom->lock);
+       if (srom->hv_devhdl >= 0)
+               _srom_write(srom->hv_devhdl, &dummy, SROM_FLUSH_OFF, 1);
+       mutex_unlock(&srom->lock);
+
+       filp->private_data = NULL;
+
+       return 0;
+}
+
+
+/**
+ * srom_read() - Read data from the device.
+ * @filp: File for this specific open of the device.
+ * @buf: User's data buffer.
+ * @count: Number of bytes requested.
+ * @f_pos: File position.
+ *
+ * Returns number of bytes read, or an error code.
+ */
+static ssize_t srom_read(struct file *filp, char __user *buf,
+                        size_t count, loff_t *f_pos)
+{
+       int retval = 0;
+       void *kernbuf;
+       struct srom_dev *srom = filp->private_data;
+
+       kernbuf = kmalloc(SROM_CHUNK_SIZE, GFP_KERNEL);
+       if (!kernbuf)
+               return -ENOMEM;
+
+       if (mutex_lock_interruptible(&srom->lock)) {
+               retval = -ERESTARTSYS;
+               kfree(kernbuf);
+               return retval;
+       }
+
+       while (count) {
+               int hv_retval;
+               int bytes_this_pass = min(count, SROM_CHUNK_SIZE);
+
+               hv_retval = _srom_read(srom->hv_devhdl, kernbuf,
+                                      *f_pos, bytes_this_pass);
+               if (hv_retval > 0) {
+                       if (copy_to_user(buf, kernbuf, hv_retval) != 0) {
+                               retval = -EFAULT;
+                               break;
+                       }
+               } else if (hv_retval <= 0) {
+                       if (retval == 0)
+                               retval = hv_retval;
+                       break;
+               }
+
+               retval += hv_retval;
+               *f_pos += hv_retval;
+               buf += hv_retval;
+               count -= hv_retval;
+       }
+
+       mutex_unlock(&srom->lock);
+       kfree(kernbuf);
+
+       return retval;
+}
+
+/**
+ * srom_write() - Write data to the device.
+ * @filp: File for this specific open of the device.
+ * @buf: User's data buffer.
+ * @count: Number of bytes requested.
+ * @f_pos: File position.
+ *
+ * Returns number of bytes written, or an error code.
+ */
+static ssize_t srom_write(struct file *filp, const char __user *buf,
+                         size_t count, loff_t *f_pos)
+{
+       int retval = 0;
+       void *kernbuf;
+       struct srom_dev *srom = filp->private_data;
+
+       kernbuf = kmalloc(SROM_CHUNK_SIZE, GFP_KERNEL);
+       if (!kernbuf)
+               return -ENOMEM;
+
+       if (mutex_lock_interruptible(&srom->lock)) {
+               retval = -ERESTARTSYS;
+               kfree(kernbuf);
+               return retval;
+       }
+
+       while (count) {
+               int hv_retval;
+               int bytes_this_pass = min(count, SROM_CHUNK_SIZE);
+
+               if (copy_from_user(kernbuf, buf, bytes_this_pass) != 0) {
+                       retval = -EFAULT;
+                       break;
+               }
+
+               hv_retval = _srom_write(srom->hv_devhdl, kernbuf,
+                                       *f_pos, bytes_this_pass);
+               if (hv_retval <= 0) {
+                       if (retval == 0)
+                               retval = hv_retval;
+                       break;
+               }
+
+               retval += hv_retval;
+               *f_pos += hv_retval;
+               buf += hv_retval;
+               count -= hv_retval;
+       }
+
+       mutex_unlock(&srom->lock);
+       kfree(kernbuf);
+
+       return retval;
+}
+
+/* Provide our own implementation so we can use srom->total_size. */
+loff_t srom_llseek(struct file *filp, loff_t offset, int origin)
+{
+       struct srom_dev *srom = filp->private_data;
+
+       if (mutex_lock_interruptible(&srom->lock))
+               return -ERESTARTSYS;
+
+       switch (origin) {
+       case SEEK_END:
+               offset += srom->total_size;
+               break;
+       case SEEK_CUR:
+               offset += filp->f_pos;
+               break;
+       }
+
+       if (offset < 0 || offset > srom->total_size) {
+               offset = -EINVAL;
+       } else {
+               filp->f_pos = offset;
+               filp->f_version = 0;
+       }
+
+       mutex_unlock(&srom->lock);
+
+       return offset;
+}
+
+static ssize_t total_show(struct device *dev,
+                         struct device_attribute *attr, char *buf)
+{
+       struct srom_dev *srom = dev_get_drvdata(dev);
+       return sprintf(buf, "%u\n", srom->total_size);
+}
+
+static ssize_t sector_show(struct device *dev,
+                          struct device_attribute *attr, char *buf)
+{
+       struct srom_dev *srom = dev_get_drvdata(dev);
+       return sprintf(buf, "%u\n", srom->sector_size);
+}
+
+static ssize_t page_show(struct device *dev,
+                        struct device_attribute *attr, char *buf)
+{
+       struct srom_dev *srom = dev_get_drvdata(dev);
+       return sprintf(buf, "%u\n", srom->page_size);
+}
+
+static struct device_attribute srom_dev_attrs[] = {
+       __ATTR(total_size, S_IRUGO, total_show, NULL),
+       __ATTR(sector_size, S_IRUGO, sector_show, NULL),
+       __ATTR(page_size, S_IRUGO, page_show, NULL),
+       __ATTR_NULL
+};
+
+static char *srom_devnode(struct device *dev, mode_t *mode)
+{
+       *mode = S_IRUGO | S_IWUSR;
+       return kasprintf(GFP_KERNEL, "srom/%s", dev_name(dev));
+}
+
+/*
+ * The fops
+ */
+static const struct file_operations srom_fops = {
+       .owner =     THIS_MODULE,
+       .llseek =    srom_llseek,
+       .read =      srom_read,
+       .write =     srom_write,
+       .open =      srom_open,
+       .release =   srom_release,
+};
+
+/**
+ * srom_setup_minor() - Initialize per-minor information.
+ * @srom: Per-device SROM state.
+ * @index: Device to set up.
+ */
+static int srom_setup_minor(struct srom_dev *srom, int index)
+{
+       struct device *dev;
+       int devhdl = srom->hv_devhdl;
+
+       mutex_init(&srom->lock);
+
+       if (_srom_read(devhdl, &srom->total_size,
+                      SROM_TOTAL_SIZE_OFF, sizeof(srom->total_size)) < 0)
+               return -EIO;
+       if (_srom_read(devhdl, &srom->sector_size,
+                      SROM_SECTOR_SIZE_OFF, sizeof(srom->sector_size)) < 0)
+               return -EIO;
+       if (_srom_read(devhdl, &srom->page_size,
+                      SROM_PAGE_SIZE_OFF, sizeof(srom->page_size)) < 0)
+               return -EIO;
+
+       dev = device_create(srom_class, &platform_bus,
+                           MKDEV(srom_major, index), srom, "%d", index);
+       return IS_ERR(dev) ? PTR_ERR(dev) : 0;
+}
+
+/** srom_init() - Initialize the driver's module. */
+static int srom_init(void)
+{
+       int result, i;
+       dev_t dev = MKDEV(srom_major, 0);
+
+       /*
+        * Start with a plausible number of partitions; the krealloc() call
+        * below will yield about log(srom_devs) additional allocations.
+        */
+       srom_devices = kzalloc(4 * sizeof(struct srom_dev), GFP_KERNEL);
+
+       /* Discover the number of srom partitions. */
+       for (i = 0; ; i++) {
+               int devhdl;
+               char buf[20];
+               struct srom_dev *new_srom_devices =
+                       krealloc(srom_devices, (i+1) * sizeof(struct srom_dev),
+                                GFP_KERNEL | __GFP_ZERO);
+               if (!new_srom_devices) {
+                       result = -ENOMEM;
+                       goto fail_mem;
+               }
+               srom_devices = new_srom_devices;
+               sprintf(buf, "srom/0/%d", i);
+               devhdl = hv_dev_open((HV_VirtAddr)buf, 0);
+               if (devhdl < 0) {
+                       if (devhdl != HV_ENODEV)
+                               pr_notice("srom/%d: hv_dev_open failed: %d.\n",
+                                         i, devhdl);
+                       break;
+               }
+               srom_devices[i].hv_devhdl = devhdl;
+       }
+       srom_devs = i;
+
+       /* Bail out early if we have no partitions at all. */
+       if (srom_devs == 0) {
+               result = -ENODEV;
+               goto fail_mem;
+       }
+
+       /* Register our major, and accept a dynamic number. */
+       if (srom_major)
+               result = register_chrdev_region(dev, srom_devs, "srom");
+       else {
+               result = alloc_chrdev_region(&dev, 0, srom_devs, "srom");
+               srom_major = MAJOR(dev);
+       }
+       if (result < 0)
+               goto fail_mem;
+
+       /* Register a character device. */
+       cdev_init(&srom_cdev, &srom_fops);
+       srom_cdev.owner = THIS_MODULE;
+       srom_cdev.ops = &srom_fops;
+       result = cdev_add(&srom_cdev, dev, srom_devs);
+       if (result < 0)
+               goto fail_chrdev;
+
+       /* Create a sysfs class. */
+       srom_class = class_create(THIS_MODULE, "srom");
+       if (IS_ERR(srom_class)) {
+               result = PTR_ERR(srom_class);
+               goto fail_cdev;
+       }
+       srom_class->dev_attrs = srom_dev_attrs;
+       srom_class->devnode = srom_devnode;
+
+       /* Do per-partition initialization */
+       for (i = 0; i < srom_devs; i++) {
+               result = srom_setup_minor(srom_devices + i, i);
+               if (result < 0)
+                       goto fail_class;
+       }
+
+       return 0;
+
+fail_class:
+       for (i = 0; i < srom_devs; i++)
+               device_destroy(srom_class, MKDEV(srom_major, i));
+       class_destroy(srom_class);
+fail_cdev:
+       cdev_del(&srom_cdev);
+fail_chrdev:
+       unregister_chrdev_region(dev, srom_devs);
+fail_mem:
+       kfree(srom_devices);
+       return result;
+}
+
+/** srom_cleanup() - Clean up the driver's module. */
+static void srom_cleanup(void)
+{
+       int i;
+       for (i = 0; i < srom_devs; i++)
+               device_destroy(srom_class, MKDEV(srom_major, i));
+       class_destroy(srom_class);
+       cdev_del(&srom_cdev);
+       unregister_chrdev_region(MKDEV(srom_major, 0), srom_devs);
+       kfree(srom_devices);
+}
+
+module_init(srom_init);
+module_exit(srom_cleanup);
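A hedged user-space sketch of consuming the new driver: open the first partition (the /dev/srom/N naming follows srom_devnode() and the Kconfig help above) and stream it out. The 4096-byte buffer mirrors SROM_CHUNK_SIZE, though any size works since the driver chunks its own hypervisor I/O:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];                   /* matches SROM_CHUNK_SIZE */
        ssize_t n;
        int fd = open("/dev/srom/0", O_RDONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        while ((n = read(fd, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, n, stdout);
        close(fd);
        return 0;
}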
index 7fc2f10..3f4051a 100644 (file)
@@ -80,7 +80,7 @@ enum tis_defaults {
 static LIST_HEAD(tis_chips);
 static DEFINE_SPINLOCK(tis_lock);
 
-#ifdef CONFIG_PNP
+#if defined(CONFIG_PNP) && defined(CONFIG_ACPI)
 static int is_itpm(struct pnp_dev *dev)
 {
        struct acpi_device *acpi = pnp_acpi_device(dev);
@@ -93,6 +93,11 @@ static int is_itpm(struct pnp_dev *dev)
 
        return 0;
 }
+#else
+static inline int is_itpm(struct pnp_dev *dev)
+{
+       return 0;
+}
 #endif
 
 static int check_locality(struct tpm_chip *chip, int l)
index bf50924..d4c5423 100644 (file)
@@ -25,9 +25,19 @@ DEFINE_PER_CPU(struct cpuidle_device *, cpuidle_devices);
 
 DEFINE_MUTEX(cpuidle_lock);
 LIST_HEAD(cpuidle_detected_devices);
-static void (*pm_idle_old)(void);
 
 static int enabled_devices;
+static int off __read_mostly;
+static int initialized __read_mostly;
+
+int cpuidle_disabled(void)
+{
+       return off;
+}
+void disable_cpuidle(void)
+{
+       off = 1;
+}
 
 #if defined(CONFIG_ARCH_HAS_CPU_IDLE_WAIT)
 static void cpuidle_kick_cpus(void)
@@ -46,25 +56,23 @@ static int __cpuidle_register_device(struct cpuidle_device *dev);
  * cpuidle_idle_call - the main idle loop
  *
  * NOTE: no locks or semaphores should be used here
+ * return non-zero on failure
  */
-static void cpuidle_idle_call(void)
+int cpuidle_idle_call(void)
 {
        struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
        struct cpuidle_state *target_state;
        int next_state;
 
+       if (off)
+               return -ENODEV;
+
+       if (!initialized)
+               return -ENODEV;
+
        /* check if the device is ready */
-       if (!dev || !dev->enabled) {
-               if (pm_idle_old)
-                       pm_idle_old();
-               else
-#if defined(CONFIG_ARCH_HAS_DEFAULT_IDLE)
-                       default_idle();
-#else
-                       local_irq_enable();
-#endif
-               return;
-       }
+       if (!dev || !dev->enabled)
+               return -EBUSY;
 
 #if 0
        /* shows regressions, re-enable for 2.6.29 */
@@ -89,7 +97,7 @@ static void cpuidle_idle_call(void)
        next_state = cpuidle_curr_governor->select(dev);
        if (need_resched()) {
                local_irq_enable();
-               return;
+               return 0;
        }
 
        target_state = &dev->states[next_state];
@@ -114,6 +122,8 @@ static void cpuidle_idle_call(void)
        /* give the governor an opportunity to reflect on the outcome */
        if (cpuidle_curr_governor->reflect)
                cpuidle_curr_governor->reflect(dev);
+
+       return 0;
 }
 
 /**
@@ -121,10 +131,10 @@ static void cpuidle_idle_call(void)
  */
 void cpuidle_install_idle_handler(void)
 {
-       if (enabled_devices && (pm_idle != cpuidle_idle_call)) {
+       if (enabled_devices) {
                /* Make sure all changes finished before we switch to new idle */
                smp_wmb();
-               pm_idle = cpuidle_idle_call;
+               initialized = 1;
        }
 }
 
@@ -133,8 +143,8 @@ void cpuidle_install_idle_handler(void)
  */
 void cpuidle_uninstall_idle_handler(void)
 {
-       if (enabled_devices && pm_idle_old && (pm_idle != pm_idle_old)) {
-               pm_idle = pm_idle_old;
+       if (enabled_devices) {
+               initialized = 0;
                cpuidle_kick_cpus();
        }
 }
@@ -427,7 +437,8 @@ static int __init cpuidle_init(void)
 {
        int ret;
 
-       pm_idle_old = pm_idle;
+       if (cpuidle_disabled())
+               return -ENODEV;
 
        ret = cpuidle_add_class_sysfs(&cpu_sysdev_class);
        if (ret)
@@ -438,4 +449,5 @@ static int __init cpuidle_init(void)
        return 0;
 }
 
+module_param(off, int, 0444);
 core_initcall(cpuidle_init);
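Since pm_idle is no longer hooked, architectures are expected to call cpuidle_idle_call() from their own idle loops and fall back when it fails. A stand-alone sketch of that contract with stubbed-out routines; the real arch code differs:

#include <errno.h>
#include <stdio.h>

/* Stand-in for the real cpuidle_idle_call(); pretend cpuidle is off. */
static int cpuidle_idle_call_stub(void)
{
        return -ENODEV;
}

static void arch_default_idle(void)
{
        puts("falling back to the architecture's default idle");
}

/* The arch idle loop body under the new contract: any non-zero return
 * (off, not initialized, device not ready) means "do your own idle". */
static void idle_loop_body(void)
{
        if (cpuidle_idle_call_stub())
                arch_default_idle();
}

int main(void)
{
        idle_loop_body();
        return 0;
}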
index 33e50d5..38c3fd8 100644 (file)
@@ -13,6 +13,7 @@ extern struct list_head cpuidle_governors;
 extern struct list_head cpuidle_detected_devices;
 extern struct mutex cpuidle_lock;
 extern spinlock_t cpuidle_driver_lock;
+extern int cpuidle_disabled(void);
 
 /* idle loop */
 extern void cpuidle_install_idle_handler(void);
index fd1601e..3f7e3ce 100644 (file)
@@ -26,6 +26,9 @@ int cpuidle_register_driver(struct cpuidle_driver *drv)
        if (!drv)
                return -EINVAL;
 
+       if (cpuidle_disabled())
+               return -ENODEV;
+
        spin_lock(&cpuidle_driver_lock);
        if (cpuidle_curr_driver) {
                spin_unlock(&cpuidle_driver_lock);
index 724c164..ea2f8e7 100644 (file)
@@ -81,6 +81,9 @@ int cpuidle_register_governor(struct cpuidle_governor *gov)
        if (!gov || !gov->select)
                return -EINVAL;
 
+       if (cpuidle_disabled())
+               return -ENODEV;
+
        mutex_lock(&cpuidle_lock);
        if (__cpuidle_find_governor(gov->name) == NULL) {
                ret = 0;
index 30da70d..cdae207 100644 (file)
@@ -45,13 +45,13 @@ static int __init pci_eisa_init(struct pci_dev *pdev,
        return 0;
 }
 
-static struct pci_device_id __initdata pci_eisa_pci_tbl[] = {
+static struct pci_device_id pci_eisa_pci_tbl[] = {
        { PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
          PCI_CLASS_BRIDGE_EISA << 8, 0xffff00, 0 },
        { 0, }
 };
 
-static struct pci_driver __initdata pci_eisa_driver = {
+static struct pci_driver __refdata pci_eisa_driver = {
        .name           = "pci_eisa",
        .id_table       = pci_eisa_pci_tbl,
        .probe          = pci_eisa_init,
index eacb05e..eb80b54 100644 (file)
@@ -157,7 +157,7 @@ utf16_strnlen(efi_char16_t *s, size_t maxlength)
        return length;
 }
 
-static unsigned long
+static inline unsigned long
 utf16_strlen(efi_char16_t *s)
 {
        return utf16_strnlen(s, ~0UL);
@@ -580,8 +580,8 @@ static ssize_t efi_pstore_read(u64 *id, enum pstore_type_id *type,
        return -1;
 }
 
-static u64 efi_pstore_write(enum pstore_type_id type, int part, size_t size,
-                           struct pstore_info *psi)
+static u64 efi_pstore_write(enum pstore_type_id type, unsigned int part,
+                           size_t size, struct pstore_info *psi)
 {
        return 0;
 }
index ce281d1..67df91a 100644 (file)
@@ -483,7 +483,7 @@ static int gpio_keys_get_devtree_pdata(struct device *dev,
 
        buttons = kzalloc(pdata->nbuttons * (sizeof *buttons), GFP_KERNEL);
        if (!buttons)
-               return -ENODEV;
+               return -ENOMEM;
 
        pp = NULL;
        i = 0;
index ab0acaf..756348a 100644 (file)
@@ -754,8 +754,11 @@ fail3:
        device_remove_file(&client->dev, &dev_attr_disable_kp);
 fail2:
        while (--pwm >= 0)
-               if (lm->pwm[pwm].enabled)
+               if (lm->pwm[pwm].enabled) {
+                       device_remove_file(lm->pwm[pwm].cdev.dev,
+                                          &dev_attr_time);
                        led_classdev_unregister(&lm->pwm[pwm].cdev);
+               }
 fail1:
        input_free_device(idev);
        kfree(lm);
@@ -775,8 +778,10 @@ static int __devexit lm8323_remove(struct i2c_client *client)
        device_remove_file(&lm->client->dev, &dev_attr_disable_kp);
 
        for (i = 0; i < 3; i++)
-               if (lm->pwm[i].enabled)
+               if (lm->pwm[i].enabled) {
+                       device_remove_file(lm->pwm[i].cdev.dev, &dev_attr_time);
                        led_classdev_unregister(&lm->pwm[i].cdev);
+               }
 
        kfree(lm);
 
index da3828f..f270447 100644 (file)
@@ -19,6 +19,7 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  */
 
+#include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/input.h>
 #include <linux/platform_device.h>
@@ -37,7 +38,7 @@
 #define KBC_ROW_SCAN_DLY       5
 
 /* KBC uses a 32KHz clock so a cycle = 1/32Khz */
-#define KBC_CYCLE_USEC 32
+#define KBC_CYCLE_MS   32
 
 /* KBC Registers */
 
@@ -647,7 +648,7 @@ static int __devinit tegra_kbc_probe(struct platform_device *pdev)
        debounce_cnt = min(pdata->debounce_cnt, KBC_MAX_DEBOUNCE_CNT);
        scan_time_rows = (KBC_ROW_SCAN_TIME + debounce_cnt) * num_rows;
        kbc->repoll_dly = KBC_ROW_SCAN_DLY + scan_time_rows + pdata->repeat_cnt;
-       kbc->repoll_dly = ((kbc->repoll_dly * KBC_CYCLE_USEC) + 999) / 1000;
+       kbc->repoll_dly = DIV_ROUND_UP(kbc->repoll_dly, KBC_CYCLE_MS);
 
        input_dev->name = pdev->name;
        input_dev->id.bustype = BUS_HOST;
index c456f63..783597a 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/i2c.h>
 #include <linux/input.h>
 #include <linux/interrupt.h>
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/input/kxtj9.h>
 #include <linux/input-polldev.h>
index 20f8f92..6c76cf7 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/delay.h>
 #include <linux/i2c.h>
 #include <linux/input-polldev.h>
+#include <linux/of_device.h>
 
 #define MMA8450_DRV_NAME       "mma8450"
 
@@ -229,10 +230,17 @@ static const struct i2c_device_id mma8450_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, mma8450_id);
 
+static const struct of_device_id mma8450_dt_ids[] = {
+       { .compatible = "fsl,mma8450", },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(i2c, mma8450_dt_ids);
+
 static struct i2c_driver mma8450_driver = {
        .driver = {
                .name   = MMA8450_DRV_NAME,
                .owner  = THIS_MODULE,
+               .of_match_table = mma8450_dt_ids,
        },
        .probe          = mma8450_probe,
        .remove         = __devexit_p(mma8450_remove),
index 95577c1..4d17d9f 100644 (file)
@@ -32,6 +32,7 @@
 #define DEBUG
 #include <linux/slab.h>
 #include <linux/input.h>
+#include <linux/module.h>
 #include <linux/serio.h>
 #include <linux/libps2.h>
 #include <linux/delay.h>
index bc3b518..131f9d1 100644 (file)
@@ -249,12 +249,14 @@ static void __ad7879_enable(struct ad7879 *ts)
 
 static void __ad7879_disable(struct ad7879 *ts)
 {
+       u16 reg = (ts->cmd_crtl2 & ~AD7879_PM(-1)) |
+               AD7879_PM(AD7879_PM_SHUTDOWN);
        disable_irq(ts->irq);
 
        if (del_timer_sync(&ts->timer))
                ad7879_ts_event_release(ts);
 
-       ad7879_write(ts, AD7879_REG_CTRL2, AD7879_PM(AD7879_PM_SHUTDOWN));
+       ad7879_write(ts, AD7879_REG_CTRL2, reg);
 }
 
 
index 3ff22e3..fb28b5a 100644 (file)
  *      as published by the Free Software Foundation; either version
  *      2 of the License, or (at your option) any later version.
  */
+#include <linux/ctype.h>
 #include <linux/module.h>
 #include <linux/of.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
 
+/**
+ * struct alias_prop - Alias property in 'aliases' node
+ * @link:      List node to link the structure in aliases_lookup list
+ * @alias:     Alias property name
+ * @np:                Pointer to device_node that the alias stands for
+ * @id:                Index value from end of alias name
+ * @stem:      Alias string without the index
+ *
+ * The structure represents one alias property of 'aliases' node as
+ * an entry in aliases_lookup list.
+ */
+struct alias_prop {
+       struct list_head link;
+       const char *alias;
+       struct device_node *np;
+       int id;
+       char stem[0];
+};
+
+static LIST_HEAD(aliases_lookup);
+
 struct device_node *allnodes;
 struct device_node *of_chosen;
+struct device_node *of_aliases;
+
+static DEFINE_MUTEX(of_aliases_mutex);
 
 /* use when traversing tree through the allnext, child, sibling,
  * or parent members of struct device_node.
@@ -988,3 +1013,108 @@ out_unlock:
 }
 #endif /* defined(CONFIG_OF_DYNAMIC) */
 
+static void of_alias_add(struct alias_prop *ap, struct device_node *np,
+                        int id, const char *stem, int stem_len)
+{
+       ap->id = id;
+       ap->np = np;
+       strncpy(ap->stem, stem, stem_len);
+       ap->stem[stem_len] = 0;
+       list_add_tail(&ap->link, &aliases_lookup);
+       pr_debug("adding DT alias:%s: stem=%s id=%i node=%s\n",
+                ap->alias, ap->stem, ap->id, np ? np->full_name : NULL);
+}
+
+/**
+ * of_alias_scan() - Scan all properties of 'aliases' node
+ *
+ * The function scans all the properties of the 'aliases' node and
+ * populates the global lookup table with them.  Entries that cannot
+ * be resolved are skipped.
+ */
+__init void of_alias_scan(void)
+{
+       struct property *pp;
+
+       if (!of_aliases)
+               return;
+
+       for_each_property(pp, of_aliases->properties) {
+               const char *start = pp->name;
+               const char *end = start + strlen(start);
+               struct device_node *np;
+               struct alias_prop *ap;
+               int id, len;
+
+               /* Skip properties we do not want to process */
+               if (!strcmp(pp->name, "name") ||
+                   !strcmp(pp->name, "phandle") ||
+                   !strcmp(pp->name, "linux,phandle"))
+                       continue;
+
+               np = of_find_node_by_path(pp->value);
+               if (!np)
+                       continue;
+
+               /* walk alias backwards to extract the id and 'stem' string */
+               while (isdigit(*(end-1)) && end > start)
+                       end--;
+               len = end - start;
+               id = strlen(end) ? simple_strtoul(end, NULL, 10) : -1;
+
+               /* Allocate an alias_prop with enough space for the stem */
+               ap = early_init_dt_alloc_memory_arch(sizeof(*ap) + len + 1, 4);
+               if (!ap)
+                       continue;
+               ap->alias = start;
+               of_alias_add(ap, np, id, start, len);
+       }
+}
+
+/**
+ * of_alias_get_id() - Get alias id for the given device_node
+ * @np:                Pointer to the given device_node
+ * @stem:      Alias stem of the given device_node
+ *
+ * The function traverses the lookup table to find the alias id for the
+ * given device_node and alias stem.  It returns the alias id if one is
+ * found.  If not, it dynamically creates an entry in the lookup table
+ * and returns that id, or an error code if the entry cannot be created.
+ */
+int of_alias_get_id(struct device_node *np, const char *stem)
+{
+       struct alias_prop *app;
+       int id = 0;
+       bool found = false;
+
+       mutex_lock(&of_aliases_mutex);
+       list_for_each_entry(app, &aliases_lookup, link) {
+               if (strcmp(app->stem, stem) != 0)
+                       continue;
+
+               if (np == app->np) {
+                       found = true;
+                       id = app->id;
+                       break;
+               }
+
+               if (id <= app->id)
+                       id = app->id + 1;
+       }
+
+       /* If an id is not found, then allocate a new one */
+       if (!found) {
+               app = kzalloc(sizeof(*app) + strlen(stem) + 1, GFP_KERNEL);
+               if (!app) {
+                       id = -ENOMEM;
+                       goto out;
+               }
+               of_alias_add(app, np, id, stem, strlen(stem));
+       }
+
+ out:
+       mutex_unlock(&of_aliases_mutex);
+
+       return id;
+}
+EXPORT_SYMBOL_GPL(of_alias_get_id);
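of_alias_scan() derives a stem and an index from each alias name by walking trailing digits backwards, so "serial2" yields stem "serial" and id 2. A stand-alone version of that split (here with the bounds check placed before the dereference):

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        const char *name = "serial2";   /* sample alias property name */
        const char *end = name + strlen(name);

        /* Walk trailing digits backwards, checking the bound first. */
        while (end > name && isdigit((unsigned char)end[-1]))
                end--;

        printf("stem=%.*s id=%ld\n", (int)(end - name), name,
               *end ? strtol(end, NULL, 10) : -1L);
        return 0;
}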
index 65200af..13d6d3a 100644 (file)
@@ -707,10 +707,12 @@ void __init unflatten_device_tree(void)
        __unflatten_device_tree(initial_boot_params, &allnodes,
                                early_init_dt_alloc_memory_arch);
 
-       /* Get pointer to OF "/chosen" node for use everywhere */
+       /* Get pointer to "/chosen" and "/aliasas" nodes for use everywhere */
        of_chosen = of_find_node_by_path("/chosen");
        if (of_chosen == NULL)
                of_chosen = of_find_node_by_path("/chosen@0");
+       of_aliases = of_find_node_by_path("/aliases");
+       of_alias_scan();
 }
 
 #endif /* CONFIG_OF_EARLY_FLATTREE */
index a70fa89..2202857 100644 (file)
@@ -110,7 +110,7 @@ static int post_dock_fixups(struct notifier_block *nb, unsigned long val,
 }
 
 
-static struct acpi_dock_ops acpiphp_dock_ops = {
+static const struct acpi_dock_ops acpiphp_dock_ops = {
        .handler = handle_hotplug_event_func,
 };
 
index bcae8dd..7789002 100644 (file)
@@ -368,7 +368,7 @@ static int __init omap_rtc_probe(struct platform_device *pdev)
                pr_info("%s: already running\n", pdev->name);
 
        /* force to 24 hour mode */
-       new_ctrl = reg & ~(OMAP_RTC_CTRL_SPLIT|OMAP_RTC_CTRL_AUTO_COMP);
+       new_ctrl = reg & (OMAP_RTC_CTRL_SPLIT|OMAP_RTC_CTRL_AUTO_COMP);
        new_ctrl |= OMAP_RTC_CTRL_STOP;
 
        /* BOARD-SPECIFIC CUSTOMIZATION CAN GO HERE:
index 564ff4e..8345fb4 100644 (file)
@@ -1,5 +1,6 @@
 config ISCSI_TARGET
        tristate "Linux-iSCSI.org iSCSI Target Mode Stack"
+       depends on NET
        select CRYPTO
        select CRYPTO_CRC32C
        select CRYPTO_CRC32C_INTEL if X86
index 14c81c4..c24fb10 100644 (file)
@@ -120,7 +120,7 @@ struct iscsi_tiqn *iscsit_add_tiqn(unsigned char *buf)
        struct iscsi_tiqn *tiqn = NULL;
        int ret;
 
-       if (strlen(buf) > ISCSI_IQN_LEN) {
+       if (strlen(buf) >= ISCSI_IQN_LEN) {
                pr_err("Target IQN exceeds %d bytes\n",
                                ISCSI_IQN_LEN);
                return ERR_PTR(-EINVAL);
@@ -1857,7 +1857,7 @@ static int iscsit_handle_text_cmd(
        char *text_ptr, *text_in;
        int cmdsn_ret, niov = 0, rx_got, rx_size;
        u32 checksum = 0, data_crc = 0, payload_length;
-       u32 padding = 0, text_length = 0;
+       u32 padding = 0, pad_bytes = 0, text_length = 0;
        struct iscsi_cmd *cmd;
        struct kvec iov[3];
        struct iscsi_text *hdr;
@@ -1896,7 +1896,7 @@ static int iscsit_handle_text_cmd(
 
                padding = ((-payload_length) & 3);
                if (padding != 0) {
-                       iov[niov].iov_base = cmd->pad_bytes;
+                       iov[niov].iov_base = &pad_bytes;
                        iov[niov++].iov_len  = padding;
                        rx_size += padding;
                        pr_debug("Receiving %u additional bytes"
@@ -1917,7 +1917,7 @@ static int iscsit_handle_text_cmd(
                if (conn->conn_ops->DataDigest) {
                        iscsit_do_crypto_hash_buf(&conn->conn_rx_hash,
                                        text_in, text_length,
-                                       padding, cmd->pad_bytes,
+                                       padding, (u8 *)&pad_bytes,
                                        (u8 *)&data_crc);
 
                        if (checksum != data_crc) {
@@ -3468,7 +3468,12 @@ static inline void iscsit_thread_check_cpumask(
 }
 
 #else
-#define iscsit_thread_get_cpumask(X) ({})
+
+void iscsit_thread_get_cpumask(struct iscsi_conn *conn)
+{
+       return;
+}
+
 #define iscsit_thread_check_cpumask(X, Y, Z) ({})
 #endif /* CONFIG_SMP */
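The text-command hunk earlier in this file computes its pad length as ((-payload_length) & 3), which rounds the payload up to iSCSI's 4-byte word boundary. A quick stand-alone check of that trick:

#include <stdio.h>

int main(void)
{
        for (unsigned len = 0; len < 8; len++)
                printf("payload %u -> %u pad bytes\n", len, (-len) & 3);
        return 0;
}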
 
index 32bb92c..f095e65 100644 (file)
@@ -181,7 +181,7 @@ struct se_tpg_np *lio_target_call_addnptotpg(
                return ERR_PTR(-EOVERFLOW);
        }
        memset(buf, 0, MAX_PORTAL_LEN + 1);
-       snprintf(buf, MAX_PORTAL_LEN, "%s", name);
+       snprintf(buf, MAX_PORTAL_LEN + 1, "%s", name);
 
        memset(&sockaddr, 0, sizeof(struct __kernel_sockaddr_storage));
 
index 713a4d2..4d087ac 100644 (file)
@@ -978,7 +978,7 @@ struct iscsi_login *iscsi_target_init_negotiation(
                pr_err("Unable to allocate memory for struct iscsi_login.\n");
                iscsit_tx_login_rsp(conn, ISCSI_STATUS_CLS_TARGET_ERR,
                                ISCSI_LOGIN_STATUS_NO_RESOURCES);
-               goto out;
+               return NULL;
        }
 
        login->req = kzalloc(ISCSI_HDR_LEN, GFP_KERNEL);
index c75a01a..8976032 100644 (file)
@@ -1747,6 +1747,8 @@ int transport_generic_handle_cdb(
 }
 EXPORT_SYMBOL(transport_generic_handle_cdb);
 
+static void transport_generic_request_failure(struct se_cmd *,
+                       struct se_device *, int, int);
 /*
  * Used by fabric module frontends to queue tasks directly.
  * May only be used from process context.
@@ -1754,6 +1756,8 @@ EXPORT_SYMBOL(transport_generic_handle_cdb);
 int transport_handle_cdb_direct(
        struct se_cmd *cmd)
 {
+       int ret;
+
        if (!cmd->se_lun) {
                dump_stack();
                pr_err("cmd->se_lun is NULL\n");
@@ -1765,8 +1769,31 @@ int transport_handle_cdb_direct(
                                " from interrupt context\n");
                return -EINVAL;
        }
-
-       return transport_generic_new_cmd(cmd);
+       /*
+        * Set TRANSPORT_NEW_CMD state and cmd->t_transport_active=1 following
+        * transport_generic_handle_cdb*() -> transport_add_cmd_to_queue()
+        * in existing usage to ensure that outstanding descriptors are handled
+        * correctly during shutdown via transport_generic_wait_for_tasks()
+        *
+        * Also, we don't take cmd->t_state_lock here as we only expect
+        * this to be called for initial descriptor submission.
+        */
+       cmd->t_state = TRANSPORT_NEW_CMD;
+       atomic_set(&cmd->t_transport_active, 1);
+       /*
+        * transport_generic_new_cmd() is already handling QUEUE_FULL,
+        * so follow TRANSPORT_NEW_CMD processing thread context usage
+        * and call transport_generic_request_failure() if necessary..
+        */
+       ret = transport_generic_new_cmd(cmd);
+       if (ret == -EAGAIN)
+               return 0;
+       else if (ret < 0) {
+               cmd->transport_error_status = ret;
+               transport_generic_request_failure(cmd, NULL, 0,
+                               (cmd->data_direction != DMA_TO_DEVICE));
+       }
+       return 0;
 }
 EXPORT_SYMBOL(transport_handle_cdb_direct);
 
@@ -3324,7 +3351,7 @@ static int transport_generic_cmd_sequencer(
                        goto out_invalid_cdb_field;
                }
 
-               cmd->t_task_lba = get_unaligned_be16(&cdb[2]);
+               cmd->t_task_lba = get_unaligned_be64(&cdb[2]);
                passthrough = (dev->transport->transport_type ==
                                TRANSPORT_PLUGIN_PHBA_PDEV);
                /*
index f7fff7e..bd4fe21 100644 (file)
@@ -187,4 +187,9 @@ void ft_dump_cmd(struct ft_cmd *, const char *caller);
 
 ssize_t ft_format_wwn(char *, size_t, u64);
 
+/*
+ * Underlying HW specific helper function
+ */
+void ft_invl_hw_context(struct ft_cmd *);
+
 #endif /* __TCM_FC_H__ */
index 09df38b..5654dc2 100644 (file)
@@ -320,6 +320,7 @@ static void ft_recv_seq(struct fc_seq *sp, struct fc_frame *fp, void *arg)
        default:
                pr_debug("%s: unhandled frame r_ctl %x\n",
                       __func__, fh->fh_r_ctl);
+               ft_invl_hw_context(cmd);
                fc_frame_free(fp);
                transport_generic_free_cmd(&cmd->se_cmd, 0, 0);
                break;
index 8e2a46d..c37f4cd 100644 (file)
@@ -213,62 +213,49 @@ void ft_recv_write_data(struct ft_cmd *cmd, struct fc_frame *fp)
        if (!(ntoh24(fh->fh_f_ctl) & FC_FC_REL_OFF))
                goto drop;
 
+       f_ctl = ntoh24(fh->fh_f_ctl);
+       ep = fc_seq_exch(seq);
+       lport = ep->lp;
+       if (cmd->was_ddp_setup) {
+               BUG_ON(!ep);
+               BUG_ON(!lport);
+       }
+
        /*
-        * Doesn't expect even single byte of payload. Payload
+        * Doesn't expect payload if DDP is setup. Payload
         * is expected to be copied directly to user buffers
-        * due to DDP (Large Rx offload) feature, hence
-        * BUG_ON if BUF is non-NULL
+        * due to DDP (Large Rx offload).
         */
        buf = fc_frame_payload_get(fp, 1);
-       if (cmd->was_ddp_setup && buf) {
-               pr_debug("%s: When DDP was setup, not expected to"
-                                "receive frame with payload, Payload shall be"
-                                "copied directly to buffer instead of coming "
-                                "via. legacy receive queues\n", __func__);
-               BUG_ON(buf);
-       }
+       if (buf)
+               pr_err("%s: xid 0x%x, f_ctl 0x%x, cmd->sg %p, "
+                               "cmd->sg_cnt 0x%x. DDP was setup"
+                               " hence not expected to receive frame with "
+                               "payload, Frame will be dropped if "
+                               "'Sequence Initiative' bit in f_ctl is "
+                               "not set\n", __func__, ep->xid, f_ctl,
+                               cmd->sg, cmd->sg_cnt);
+       /*
+        * Invalidate the HW DDP context if it was set up for the
+        * respective command. Invalidation of the HW DDP context is
+        * required in both situations (success and error).
+        */
+       ft_invl_hw_context(cmd);
 
        /*
-        * If ft_cmd indicated 'ddp_setup', in that case only the last frame
-        * should come with 'TSI bit being set'. If 'TSI bit is not set and if
-        * data frame appears here, means error condition. In both the cases
-        * release the DDP context (ddp_put) and in error case, as well
-        * initiate error recovery mechanism.
+        * If "Sequence Initiative (TSI)" bit set in f_ctl, means last
+        * write data frame is received successfully where payload is
+        * posted directly to user buffer and only the last frame's
+        * header is posted in receive queue.
+        *
+        * If "Sequence Initiative (TSI)" bit is not set, means error
+        * condition w.r.t. DDP, hence drop the packet and let explict
+        * ABORTS from other end of exchange timer trigger the recovery.
         */
-       ep = fc_seq_exch(seq);
-       if (cmd->was_ddp_setup) {
-               BUG_ON(!ep);
-               lport = ep->lp;
-               BUG_ON(!lport);
-       }
-       if (cmd->was_ddp_setup && ep->xid != FC_XID_UNKNOWN) {
-               f_ctl = ntoh24(fh->fh_f_ctl);
-               /*
-                * If TSI bit set in f_ctl, means last write data frame is
-                * received successfully where payload is posted directly
-                * to user buffer and only the last frame's header is posted
-                * in legacy receive queue
-                */
-               if (f_ctl & FC_FC_SEQ_INIT) { /* TSI bit set in FC frame */
-                       cmd->write_data_len = lport->tt.ddp_done(lport,
-                                                               ep->xid);
-                       goto last_frame;
-               } else {
-                       /*
-                        * Updating the write_data_len may be meaningless at
-                        * this point, but just in case if required in future
-                        * for debugging or any other purpose
-                        */
-                       pr_err("%s: Received frame with TSI bit not"
-                                       " being SET, dropping the frame, "
-                                       "cmd->sg <%p>, cmd->sg_cnt <0x%x>\n",
-                                       __func__, cmd->sg, cmd->sg_cnt);
-                       cmd->write_data_len = lport->tt.ddp_done(lport,
-                                                             ep->xid);
-                       lport->tt.seq_exch_abort(cmd->seq, 0);
-                       goto drop;
-               }
-       }
+       if (f_ctl & FC_FC_SEQ_INIT)
+               goto last_frame;
+       else
+               goto drop;
 
        rel_off = ntohl(fh->fh_parm_offset);
        frame_len = fr_len(fp);
@@ -331,3 +318,39 @@ last_frame:
 drop:
        fc_frame_free(fp);
 }
+
+/*
+ * Handle and cleanup any HW specific resources if
+ * received ABORTS, errors, timeouts.
+ */
+void ft_invl_hw_context(struct ft_cmd *cmd)
+{
+       struct fc_seq *seq = cmd->seq;
+       struct fc_exch *ep = NULL;
+       struct fc_lport *lport = NULL;
+
+       BUG_ON(!cmd);
+
+       /* Cleanup the DDP context in HW if DDP was setup */
+       if (cmd->was_ddp_setup && seq) {
+               ep = fc_seq_exch(seq);
+               if (ep) {
+                       lport = ep->lp;
+                       if (lport && (ep->xid <= lport->lro_xid))
+                               /*
+                                * "ddp_done" trigger invalidation of HW
+                                * specific DDP context
+                                */
+                               cmd->write_data_len = lport->tt.ddp_done(lport,
+                                                                     ep->xid);
+
+                               /*
+                                * Resetting same variable to indicate HW's
+                                * DDP context has been invalidated to avoid
+                                * re_invalidation of same context (context is
+                                * identified using ep->xid)
+                                */
+                               cmd->was_ddp_setup = 0;
+               }
+       }
+}
index bf7c687..f7f71b2 100644 (file)
@@ -14,11 +14,7 @@ menuconfig THERMAL
          If you want this support, you should say Y or M here.
 
 config THERMAL_HWMON
-       bool "Hardware monitoring support"
+       bool
        depends on THERMAL
        depends on HWMON=y || HWMON=THERMAL
-       help
-         The generic thermal sysfs driver's hardware monitoring support
-         requires a 2.10.7/3.0.2 or later lm-sensors userspace.
-
-         Say Y if your user-space is new enough.
+       default y
index 0b1c82a..708f8e9 100644 (file)
@@ -420,6 +420,29 @@ thermal_cooling_device_trip_point_show(struct device *dev,
 
 /* hwmon sys I/F */
 #include <linux/hwmon.h>
+
+/* thermal zone devices with the same type share one hwmon device */
+struct thermal_hwmon_device {
+       char type[THERMAL_NAME_LENGTH];
+       struct device *device;
+       int count;
+       struct list_head tz_list;
+       struct list_head node;
+};
+
+struct thermal_hwmon_attr {
+       struct device_attribute attr;
+       char name[16];
+};
+
+/* one temperature input for each thermal zone */
+struct thermal_hwmon_temp {
+       struct list_head hwmon_node;
+       struct thermal_zone_device *tz;
+       struct thermal_hwmon_attr temp_input;   /* hwmon sys attr */
+       struct thermal_hwmon_attr temp_crit;    /* hwmon sys attr */
+};
+
 static LIST_HEAD(thermal_hwmon_list);
 
 static ssize_t
@@ -437,9 +460,10 @@ temp_input_show(struct device *dev, struct device_attribute *attr, char *buf)
        int ret;
        struct thermal_hwmon_attr *hwmon_attr
                        = container_of(attr, struct thermal_hwmon_attr, attr);
-       struct thermal_zone_device *tz
-                       = container_of(hwmon_attr, struct thermal_zone_device,
+       struct thermal_hwmon_temp *temp
+                       = container_of(hwmon_attr, struct thermal_hwmon_temp,
                                       temp_input);
+       struct thermal_zone_device *tz = temp->tz;
 
        ret = tz->ops->get_temp(tz, &temperature);
 
@@ -455,9 +479,10 @@ temp_crit_show(struct device *dev, struct device_attribute *attr,
 {
        struct thermal_hwmon_attr *hwmon_attr
                        = container_of(attr, struct thermal_hwmon_attr, attr);
-       struct thermal_zone_device *tz
-                       = container_of(hwmon_attr, struct thermal_zone_device,
+       struct thermal_hwmon_temp *temp
+                       = container_of(hwmon_attr, struct thermal_hwmon_temp,
                                       temp_crit);
+       struct thermal_zone_device *tz = temp->tz;
        long temperature;
        int ret;
 
@@ -469,22 +494,54 @@ temp_crit_show(struct device *dev, struct device_attribute *attr,
 }
 
 
-static int
-thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
+static struct thermal_hwmon_device *
+thermal_hwmon_lookup_by_type(const struct thermal_zone_device *tz)
 {
        struct thermal_hwmon_device *hwmon;
-       int new_hwmon_device = 1;
-       int result;
 
        mutex_lock(&thermal_list_lock);
        list_for_each_entry(hwmon, &thermal_hwmon_list, node)
                if (!strcmp(hwmon->type, tz->type)) {
-                       new_hwmon_device = 0;
                        mutex_unlock(&thermal_list_lock);
-                       goto register_sys_interface;
+                       return hwmon;
+               }
+       mutex_unlock(&thermal_list_lock);
+
+       return NULL;
+}
+
+/* Find the temperature input matching a given thermal zone */
+static struct thermal_hwmon_temp *
+thermal_hwmon_lookup_temp(const struct thermal_hwmon_device *hwmon,
+                         const struct thermal_zone_device *tz)
+{
+       struct thermal_hwmon_temp *temp;
+
+       mutex_lock(&thermal_list_lock);
+       list_for_each_entry(temp, &hwmon->tz_list, hwmon_node)
+               if (temp->tz == tz) {
+                       mutex_unlock(&thermal_list_lock);
+                       return temp;
                }
        mutex_unlock(&thermal_list_lock);
 
+       return NULL;
+}
+
+static int
+thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
+{
+       struct thermal_hwmon_device *hwmon;
+       struct thermal_hwmon_temp *temp;
+       int new_hwmon_device = 1;
+       int result;
+
+       hwmon = thermal_hwmon_lookup_by_type(tz);
+       if (hwmon) {
+               new_hwmon_device = 0;
+               goto register_sys_interface;
+       }
+
        hwmon = kzalloc(sizeof(struct thermal_hwmon_device), GFP_KERNEL);
        if (!hwmon)
                return -ENOMEM;
@@ -502,30 +559,36 @@ thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
                goto free_mem;
 
  register_sys_interface:
-       tz->hwmon = hwmon;
+       temp = kzalloc(sizeof(struct thermal_hwmon_temp), GFP_KERNEL);
+       if (!temp) {
+               result = -ENOMEM;
+               goto unregister_name;
+       }
+
+       temp->tz = tz;
        hwmon->count++;
 
-       snprintf(tz->temp_input.name, THERMAL_NAME_LENGTH,
+       snprintf(temp->temp_input.name, THERMAL_NAME_LENGTH,
                 "temp%d_input", hwmon->count);
-       tz->temp_input.attr.attr.name = tz->temp_input.name;
-       tz->temp_input.attr.attr.mode = 0444;
-       tz->temp_input.attr.show = temp_input_show;
-       sysfs_attr_init(&tz->temp_input.attr.attr);
-       result = device_create_file(hwmon->device, &tz->temp_input.attr);
+       temp->temp_input.attr.attr.name = temp->temp_input.name;
+       temp->temp_input.attr.attr.mode = 0444;
+       temp->temp_input.attr.show = temp_input_show;
+       sysfs_attr_init(&temp->temp_input.attr.attr);
+       result = device_create_file(hwmon->device, &temp->temp_input.attr);
        if (result)
-               goto unregister_name;
+               goto free_temp_mem;
 
        if (tz->ops->get_crit_temp) {
                unsigned long temperature;
                if (!tz->ops->get_crit_temp(tz, &temperature)) {
-                       snprintf(tz->temp_crit.name, THERMAL_NAME_LENGTH,
+                       snprintf(temp->temp_crit.name, THERMAL_NAME_LENGTH,
                                "temp%d_crit", hwmon->count);
-                       tz->temp_crit.attr.attr.name = tz->temp_crit.name;
-                       tz->temp_crit.attr.attr.mode = 0444;
-                       tz->temp_crit.attr.show = temp_crit_show;
-                       sysfs_attr_init(&tz->temp_crit.attr.attr);
+                       temp->temp_crit.attr.attr.name = temp->temp_crit.name;
+                       temp->temp_crit.attr.attr.mode = 0444;
+                       temp->temp_crit.attr.show = temp_crit_show;
+                       sysfs_attr_init(&temp->temp_crit.attr.attr);
                        result = device_create_file(hwmon->device,
-                                                   &tz->temp_crit.attr);
+                                                   &temp->temp_crit.attr);
                        if (result)
                                goto unregister_input;
                }
@@ -534,13 +597,15 @@ thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
        mutex_lock(&thermal_list_lock);
        if (new_hwmon_device)
                list_add_tail(&hwmon->node, &thermal_hwmon_list);
-       list_add_tail(&tz->hwmon_node, &hwmon->tz_list);
+       list_add_tail(&temp->hwmon_node, &hwmon->tz_list);
        mutex_unlock(&thermal_list_lock);
 
        return 0;
 
  unregister_input:
-       device_remove_file(hwmon->device, &tz->temp_input.attr);
+       device_remove_file(hwmon->device, &temp->temp_input.attr);
+ free_temp_mem:
+       kfree(temp);
  unregister_name:
        if (new_hwmon_device) {
                device_remove_file(hwmon->device, &dev_attr_name);
@@ -556,15 +621,30 @@ thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
 static void
 thermal_remove_hwmon_sysfs(struct thermal_zone_device *tz)
 {
-       struct thermal_hwmon_device *hwmon = tz->hwmon;
+       struct thermal_hwmon_device *hwmon;
+       struct thermal_hwmon_temp *temp;
+
+       hwmon = thermal_hwmon_lookup_by_type(tz);
+       if (unlikely(!hwmon)) {
+               /* Should never happen... */
+               dev_dbg(&tz->device, "hwmon device lookup failed!\n");
+               return;
+       }
+
+       temp = thermal_hwmon_lookup_temp(hwmon, tz);
+       if (unlikely(!temp)) {
+               /* Should never happen... */
+               dev_dbg(&tz->device, "temperature input lookup failed!\n");
+               return;
+       }
 
-       tz->hwmon = NULL;
-       device_remove_file(hwmon->device, &tz->temp_input.attr);
+       device_remove_file(hwmon->device, &temp->temp_input.attr);
        if (tz->ops->get_crit_temp)
-               device_remove_file(hwmon->device, &tz->temp_crit.attr);
+               device_remove_file(hwmon->device, &temp->temp_crit.attr);
 
        mutex_lock(&thermal_list_lock);
-       list_del(&tz->hwmon_node);
+       list_del(&temp->hwmon_node);
+       kfree(temp);
        if (!list_empty(&hwmon->tz_list)) {
                mutex_unlock(&thermal_list_lock);
                return;
index 69407e7..278aeaa 100644 (file)
@@ -336,7 +336,7 @@ config BACKLIGHT_PCF50633
          enable its driver.
 
 config BACKLIGHT_AAT2870
-       bool "AnalogicTech AAT2870 Backlight"
+       tristate "AnalogicTech AAT2870 Backlight"
        depends on BACKLIGHT_CLASS_DEVICE && MFD_AAT2870_CORE
        help
          If you have a AnalogicTech AAT2870 say Y to enable the
index 4952a61..331f1ef 100644 (file)
@@ -44,7 +44,7 @@ static inline int aat2870_brightness(struct aat2870_bl_driver_data *aat2870_bl,
        struct backlight_device *bd = aat2870_bl->bd;
        int val;
 
-       val = brightness * aat2870_bl->max_current;
+       val = brightness * (aat2870_bl->max_current - 1);
        val /= bd->props.max_brightness;
 
        return val;
@@ -158,10 +158,10 @@ static int aat2870_bl_probe(struct platform_device *pdev)
        props.type = BACKLIGHT_RAW;
        bd = backlight_device_register("aat2870-backlight", &pdev->dev,
                                       aat2870_bl, &aat2870_bl_ops, &props);
-       if (!bd) {
+       if (IS_ERR(bd)) {
                dev_err(&pdev->dev,
                        "Failed allocate memory for backlight device\n");
-               ret = -ENOMEM;
+               ret = PTR_ERR(bd);
                goto out_kfree;
        }
 
@@ -175,7 +175,7 @@ static int aat2870_bl_probe(struct platform_device *pdev)
        else
                aat2870_bl->channels = AAT2870_BL_CH_ALL;
 
-       if (pdata->max_brightness > 0)
+       if (pdata->max_current > 0)
                aat2870_bl->max_current = pdata->max_current;
        else
                aat2870_bl->max_current = AAT2870_CURRENT_27_9;
index 19891aa..9fe0b34 100644 (file)
@@ -127,14 +127,21 @@ config TMPFS_POSIX_ACL
        select TMPFS_XATTR
        select GENERIC_ACL
        help
-         POSIX Access Control Lists (ACLs) support permissions for users and
-         groups beyond the owner/group/world scheme.
+         POSIX Access Control Lists (ACLs) support additional access rights
+         for users and groups beyond the standard owner/group/world scheme,
+         and this option selects support for ACLs specifically for tmpfs
+         filesystems.
+
+         If you've selected TMPFS, it's possible that you'll also need
+         this option as there are a number of Linux distros that require
+         POSIX ACL support under /dev for certain features to work properly.
+         For example, some distros need this feature for ALSA-related /dev
+         files for sound to work properly.  In short, if you're not sure,
+         say Y.
 
          To learn more about Access Control Lists, visit the POSIX ACLs for
          Linux website <http://acl.bestbits.at/>.
 
-         If you don't know what Access Control Lists are, say N.
-
 config TMPFS_XATTR
        bool "Tmpfs extended attributes"
        depends on TMPFS
index 2347cdb..c83cae1 100644 (file)
@@ -795,6 +795,7 @@ relock:
 
 /**
  * prune_dcache_sb - shrink the dcache
+ * @sb: superblock
  * @nr_to_scan: number of entries to try to free
  *
  * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
index e2d88ba..4687fea 100644 (file)
@@ -124,7 +124,7 @@ void *ext4_kvzalloc(size_t size, gfp_t flags)
 {
        void *ret;
 
-       ret = kmalloc(size, flags);
+       ret = kzalloc(size, flags);
        if (!ret)
                ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
        return ret;
index 4a6f7f4..b4f2ab4 100644 (file)
@@ -29,10 +29,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
         *
         * We don't actually know what locking is used at the lower level;
         * but if it's a filesystem that supports quotas, it will be using
-        * i_lock as in inode_add_bytes().  tmpfs uses other locking, and
-        * its 32-bit is (just) able to exceed 2TB i_size with the aid of
-        * holes; but its i_blocks cannot carry into the upper long without
-        * almost 2TB swap - let's ignore that case.
+        * i_lock as in inode_add_bytes().
         */
        if (sizeof(i_blocks) > sizeof(long))
                spin_lock(&src->i_lock);
index 3090471..e49c36d 100644 (file)
@@ -128,7 +128,7 @@ extern int is_dock_device(acpi_handle handle);
 extern int register_dock_notifier(struct notifier_block *nb);
 extern void unregister_dock_notifier(struct notifier_block *nb);
 extern int register_hotplug_dock_device(acpi_handle handle,
-                                       struct acpi_dock_ops *ops,
+                                       const struct acpi_dock_ops *ops,
                                        void *context);
 extern void unregister_hotplug_dock_device(acpi_handle handle);
 #else
index 2ed0a84..f554a93 100644 (file)
@@ -47,7 +47,7 @@
 
 /* Current ACPICA subsystem version in YYYYMMDD format */
 
-#define ACPI_CA_VERSION                 0x20110413
+#define ACPI_CA_VERSION                 0x20110623
 
 #include "actypes.h"
 #include "actbl.h"
@@ -69,6 +69,7 @@ extern u32 acpi_gbl_trace_flags;
 extern u32 acpi_gbl_enable_aml_debug_object;
 extern u8 acpi_gbl_copy_dsdt_locally;
 extern u8 acpi_gbl_truncate_io_addresses;
+extern u8 acpi_gbl_disable_auto_repair;
 
 extern u32 acpi_current_gpe_count;
 extern struct acpi_table_fadt acpi_gbl_FADT;
index e67b523..51a527d 100644 (file)
 
 extern int hest_disable;
 extern int erst_disable;
+#ifdef CONFIG_ACPI_APEI_GHES
+extern int ghes_disable;
+#else
+#define ghes_disable 1
+#endif
 
 #ifdef CONFIG_ACPI_APEI
 void __init acpi_hest_init(void);
index ba4928c..67055f1 100644 (file)
@@ -337,7 +337,7 @@ extern struct cpuidle_driver acpi_idle_driver;
 
 /* in processor_thermal.c */
 int acpi_processor_get_limit_info(struct acpi_processor *pr);
-extern struct thermal_cooling_device_ops processor_cooling_ops;
+extern const struct thermal_cooling_device_ops processor_cooling_ops;
 #ifdef CONFIG_CPU_FREQ
 void acpi_thermal_cpufreq_init(void);
 void acpi_thermal_cpufreq_exit(void);
index 1deb2a7..6001b4d 100644 (file)
@@ -238,7 +238,6 @@ extern int acpi_paddr_to_node(u64 start_addr, u64 size);
 extern int pnpacpi_disabled;
 
 #define PXM_INVAL      (-1)
-#define NID_INVAL      (-1)
 
 int acpi_check_resource_conflict(const struct resource *res);
 
@@ -280,6 +279,8 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context);
 #define OSC_SB_CPUHP_OST_SUPPORT       8
 #define OSC_SB_APEI_SUPPORT            16
 
+extern bool osc_sb_apei_support_acked;
+
 /* PCI defined _OSC bits */
 /* _OSC DW1 Definition (OS Support Fields) */
 #define OSC_EXT_PCI_CONFIG_SUPPORT             1
index 3bac44c..7ad6345 100644 (file)
@@ -146,6 +146,7 @@ extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
 extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits);
 extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits);
 
+#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))
 #define BITMAP_LAST_WORD_MASK(nbits)                                   \
 (                                                                      \
        ((nbits) % BITS_PER_LONG) ?                                     \
index 36719ea..b51629e 100644 (file)
@@ -122,6 +122,8 @@ struct cpuidle_driver {
 };
 
 #ifdef CONFIG_CPU_IDLE
+extern void disable_cpuidle(void);
+extern int cpuidle_idle_call(void);
 
 extern int cpuidle_register_driver(struct cpuidle_driver *drv);
 struct cpuidle_driver *cpuidle_get_driver(void);
@@ -135,6 +137,8 @@ extern int cpuidle_enable_device(struct cpuidle_device *dev);
 extern void cpuidle_disable_device(struct cpuidle_device *dev);
 
 #else
+static inline void disable_cpuidle(void) { }
+static inline int cpuidle_idle_call(void) { return -ENODEV; }
 
 static inline int cpuidle_register_driver(struct cpuidle_driver *drv)
 {return -ENODEV; }
index 3ff060a..c6f996f 100644 (file)
@@ -25,10 +25,6 @@ struct fault_attr {
        unsigned long reject_end;
 
        unsigned long count;
-
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-       struct dentry *dir;
-#endif
 };
 
 #define FAULT_ATTR_INITIALIZER {                               \
@@ -45,19 +41,15 @@ bool should_fail(struct fault_attr *attr, ssize_t size);
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
 
-int init_fault_attr_dentries(struct fault_attr *attr, const char *name);
-void cleanup_fault_attr_dentries(struct fault_attr *attr);
+struct dentry *fault_create_debugfs_attr(const char *name,
+                       struct dentry *parent, struct fault_attr *attr);
 
 #else /* CONFIG_FAULT_INJECTION_DEBUG_FS */
 
-static inline int init_fault_attr_dentries(struct fault_attr *attr,
-                                         const char *name)
-{
-       return -ENODEV;
-}
-
-static inline void cleanup_fault_attr_dentries(struct fault_attr *attr)
+static inline struct dentry *fault_create_debugfs_attr(const char *name,
+                       struct dentry *parent, struct fault_attr *attr)
 {
+       return ERR_PTR(-ENODEV);
 }
 
 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
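
A minimal sketch of a caller converting to the new interface; the
"fail_foo" module bits below are hypothetical, not part of this patch:

    /* assumes <linux/fault-inject.h>, <linux/debugfs.h>, <linux/err.h> */
    static struct fault_attr fail_foo = FAULT_ATTR_INITIALIZER;
    static struct dentry *fail_foo_dir;

    static int __init foo_debugfs_init(void)
    {
            /* was: init_fault_attr_dentries(&fail_foo, "fail_foo") */
            fail_foo_dir = fault_create_debugfs_attr("fail_foo", NULL,
                                                     &fail_foo);
            if (IS_ERR(fail_foo_dir))
                    return PTR_ERR(fail_foo_dir);
            return 0;
    }

    static void __exit foo_debugfs_exit(void)
    {
            /* was: cleanup_fault_attr_dentries(&fail_foo) */
            debugfs_remove_recursive(fail_foo_dir);
    }
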
index 5bbebda..5e98eeb 100644 (file)
@@ -1,8 +1,26 @@
 /*
- * Basic general purpose allocator for managing special purpose memory
- * not managed by the regular kmalloc/kfree interface.
- * Uses for this includes on-device special memory, uncached memory
- * etc.
+ * Basic general purpose allocator for managing special purpose
+ * memory, for example, memory that is not managed by the regular
+ * kmalloc/kfree interface.  Uses for this include on-device special
+ * memory, uncached memory, etc.
+ *
+ * It is safe to use the allocator in NMI handlers and other special
+ * unblockable contexts that could otherwise deadlock on locks.  This
+ * is implemented by using atomic operations and retries on any
+ * conflicts.  The disadvantage is that there may be livelocks in
+ * extreme cases.  For better scalability, one allocator can be used
+ * for each CPU.
+ *
+ * The lockless operation only works if there is enough memory
+ * available.  If new memory is added to the pool, a lock still has
+ * to be taken.  So any user relying on locklessness has to ensure
+ * that sufficient memory is preallocated.
+ *
+ * The basic atomic operation of this allocator is cmpxchg on long.
+ * On architectures that don't have an NMI-safe cmpxchg
+ * implementation, the allocator can NOT be used in NMI handlers.
+ * So code that uses the allocator in an NMI handler should depend
+ * on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
  *
  * This source code is licensed under the GNU General Public License,
  * Version 2.  See the file COPYING for more details.
@@ -15,7 +33,7 @@
  *  General purpose special memory pool descriptor.
  */
 struct gen_pool {
-       rwlock_t lock;
+       spinlock_t lock;
        struct list_head chunks;        /* list of chunks in this pool */
        int min_alloc_order;            /* minimum allocation order */
 };
@@ -24,8 +42,8 @@ struct gen_pool {
  *  General purpose special memory pool chunk descriptor.
  */
 struct gen_pool_chunk {
-       spinlock_t lock;
        struct list_head next_chunk;    /* next chunk in pool */
+       atomic_t avail;
        phys_addr_t phys_addr;          /* physical starting address of memory chunk */
        unsigned long start_addr;       /* starting address of memory chunk */
        unsigned long end_addr;         /* ending address of memory chunk */
@@ -56,4 +74,8 @@ static inline int gen_pool_add(struct gen_pool *pool, unsigned long addr,
 extern void gen_pool_destroy(struct gen_pool *);
 extern unsigned long gen_pool_alloc(struct gen_pool *, size_t);
 extern void gen_pool_free(struct gen_pool *, unsigned long, size_t);
+extern void gen_pool_for_each_chunk(struct gen_pool *,
+       void (*)(struct gen_pool *, struct gen_pool_chunk *, void *), void *);
+extern size_t gen_pool_avail(struct gen_pool *);
+extern size_t gen_pool_size(struct gen_pool *);
 #endif /* __GENALLOC_H__ */
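
A rough usage sketch (the chunk address and sizes are illustrative
only): only gen_pool_alloc()/gen_pool_free() take the lockless fast
path; growing the pool via gen_pool_add() still takes pool->lock.

    /* assumes <linux/genalloc.h>; vaddr is a hypothetical chunk base */
    struct gen_pool *pool = gen_pool_create(5, -1); /* 32-byte granules */
    unsigned long addr;

    gen_pool_add(pool, (unsigned long)vaddr, 4096, -1); /* takes the lock */

    addr = gen_pool_alloc(pool, 128);       /* lockless fast path */
    if (addr)
            gen_pool_free(pool, addr, 128); /* lockless as well */
    gen_pool_destroy(pool);
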
index cb40892..3a76faf 100644 (file)
@@ -92,7 +92,7 @@ struct vm_area_struct;
  */
 #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
 
-#define __GFP_BITS_SHIFT 23    /* Room for 23 __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 24    /* Room for N __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /* This equals 0, but use constants in case they ever change */
index 13a801f..255491c 100644 (file)
@@ -146,6 +146,10 @@ void ida_remove(struct ida *ida, int id);
 void ida_destroy(struct ida *ida);
 void ida_init(struct ida *ida);
 
+int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
+                  gfp_t gfp_mask);
+void ida_simple_remove(struct ida *ida, unsigned int id);
+
 void __init idr_init_cache(void);
 
 #endif /* __IDR_H__ */
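
The new ida_simple_* helpers wrap the usual preload/get/retry dance
behind a single call; a sketch of typical use (the ida instance below
is hypothetical):

    static DEFINE_IDA(foo_ida);

    int id = ida_simple_get(&foo_ida, 0, 0, GFP_KERNEL); /* end==0: no limit */
    if (id < 0)
            return id;
    /* ... use id ... */
    ida_simple_remove(&foo_ida, id);
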
diff --git a/include/linux/llist.h b/include/linux/llist.h
new file mode 100644 (file)
index 0000000..aa0c8b5
--- /dev/null
@@ -0,0 +1,126 @@
+#ifndef LLIST_H
+#define LLIST_H
+/*
+ * Lock-less NULL terminated single linked list
+ *
+ * If there are multiple producers and multiple consumers, llist_add
+ * can be used in producers and llist_del_all can be used in
+ * consumers.  They can work simultaneously without a lock.  But
+ * llist_del_first cannot be used here, because llist_del_first
+ * depends on list->first->next not changing while list->first is
+ * unchanged during its operation, and a llist_del_first, llist_add,
+ * llist_add (or llist_del_all, llist_add, llist_add) sequence in
+ * another consumer may violate that.
+ *
+ * If there are multiple producers and one consumer, llist_add can be
+ * used in producers and llist_del_all or llist_del_first can be used
+ * in the consumer.
+ *
+ * This can be summarized as follows:
+ *
+ *           |   add    | del_first |  del_all
+ * add       |    -     |     -     |     -
+ * del_first |          |     L     |     L
+ * del_all   |          |           |     -
+ *
+ * Where "-" stands for no lock is needed, while "L" stands for lock
+ * is needed.
+ *
+ * The list entries deleted via llist_del_all can be traversed with
+ * traversal functions such as llist_for_each.  But the list entries
+ * cannot be traversed safely before being deleted from the list.
+ * The order of deleted entries is from the newest to the oldest added
+ * one.  If you want to traverse from the oldest to the newest, you
+ * must reverse the order by yourself before traversing.
+ *
+ * The basic atomic operation of this list is cmpxchg on long.  On
+ * architectures that don't have an NMI-safe cmpxchg implementation,
+ * the list can NOT be used in NMI handlers.  So code that uses the
+ * list in an NMI handler should depend on
+ * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
+ */
+
+struct llist_head {
+       struct llist_node *first;
+};
+
+struct llist_node {
+       struct llist_node *next;
+};
+
+#define LLIST_HEAD_INIT(name)  { NULL }
+#define LLIST_HEAD(name)       struct llist_head name = LLIST_HEAD_INIT(name)
+
+/**
+ * init_llist_head - initialize lock-less list head
+ * @head:      the head for your lock-less list
+ */
+static inline void init_llist_head(struct llist_head *list)
+{
+       list->first = NULL;
+}
+
+/**
+ * llist_entry - get the struct of this entry
+ * @ptr:       the &struct llist_node pointer.
+ * @type:      the type of the struct this is embedded in.
+ * @member:    the name of the llist_node within the struct.
+ */
+#define llist_entry(ptr, type, member)         \
+       container_of(ptr, type, member)
+
+/**
+ * llist_for_each - iterate over some deleted entries of a lock-less list
+ * @pos:       the &struct llist_node to use as a loop cursor
+ * @node:      the first entry of deleted list entries
+ *
+ * In general, some entries of the lock-less list can be traversed
+ * safely only after being deleted from the list, so start with an
+ * entry instead of the list head.
+ *
+ * If used on entries deleted from the lock-less list directly, the
+ * traversal order is from the newest to the oldest added entry.  If
+ * you want to traverse from the oldest to the newest, you must
+ * reverse the order by yourself before traversing.
+ */
+#define llist_for_each(pos, node)                      \
+       for ((pos) = (node); pos; (pos) = (pos)->next)
+
+/**
+ * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type
+ * @pos:       the type * to use as a loop cursor.
+ * @node:      the first entry of deleted list entries.
+ * @member:    the name of the llist_node within the struct.
+ *
+ * In general, some entries of the lock-less list can be traversed
+ * safely only after being removed from the list, so start with an
+ * entry instead of the list head.
+ *
+ * If used on entries deleted from the lock-less list directly, the
+ * traversal order is from the newest to the oldest added entry.  If
+ * you want to traverse from the oldest to the newest, you must
+ * reverse the order by yourself before traversing.
+ */
+#define llist_for_each_entry(pos, node, member)                                \
+       for ((pos) = llist_entry((node), typeof(*(pos)), member);       \
+            &(pos)->member != NULL;                                    \
+            (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member))
+
+/**
+ * llist_empty - tests whether a lock-less list is empty
+ * @head:      the list to test
+ *
+ * Not guaranteed to be accurate or up to date.  Just a quick way to
+ * test whether the list is empty without deleting something from the
+ * list.
+ */
+static inline int llist_empty(const struct llist_head *head)
+{
+       return ACCESS_ONCE(head->first) == NULL;
+}
+
+void llist_add(struct llist_node *new, struct llist_head *head);
+void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
+                    struct llist_head *head);
+struct llist_node *llist_del_first(struct llist_head *head);
+struct llist_node *llist_del_all(struct llist_head *head);
+#endif /* LLIST_H */
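
A minimal sketch of the multiple-producer/single-consumer pattern
from the table above; the embedding struct and process() below are
hypothetical:

    struct foo {
            int data;
            struct llist_node node;
    };

    static LLIST_HEAD(foo_list);

    /* any number of producers, lock-free (even from IRQ context) */
    void foo_produce(struct foo *f)
    {
            llist_add(&f->node, &foo_list);
    }

    /* single consumer: detach the whole list, walk newest to oldest */
    void foo_consume(void)
    {
            struct llist_node *first = llist_del_all(&foo_list);
            struct foo *pos;

            llist_for_each_entry(pos, first, node)
                    process(pos->data);     /* process() is hypothetical */
    }
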
index b966007..3b535db 100644 (file)
@@ -86,8 +86,6 @@ extern void mem_cgroup_uncharge_end(void);
 
 extern void mem_cgroup_uncharge_page(struct page *page);
 extern void mem_cgroup_uncharge_cache_page(struct page *page);
-extern int mem_cgroup_shmem_charge_fallback(struct page *page,
-                       struct mm_struct *mm, gfp_t gfp_mask);
 
 extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
@@ -225,12 +223,6 @@ static inline void mem_cgroup_uncharge_cache_page(struct page *page)
 {
 }
 
-static inline int mem_cgroup_shmem_charge_fallback(struct page *page,
-                       struct mm_struct *mm, gfp_t gfp_mask)
-{
-       return 0;
-}
-
 static inline void mem_cgroup_add_lru_list(struct page *page, int lru)
 {
 }
index 89212df..f7316c2 100644 (file)
@@ -89,7 +89,7 @@ enum aat2870_id {
 
 /* Backlight current magnitude (mA) */
 enum aat2870_current {
-       AAT2870_CURRENT_0_45,
+       AAT2870_CURRENT_0_45 = 1,
        AAT2870_CURRENT_0_90,
        AAT2870_CURRENT_1_80,
        AAT2870_CURRENT_2_70,
index 3172a1c..f2690cf 100644 (file)
@@ -1600,6 +1600,7 @@ enum mf_flags {
 };
 extern void memory_failure(unsigned long pfn, int trapno);
 extern int __memory_failure(unsigned long pfn, int trapno, int flags);
+extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
 extern int unpoison_memory(unsigned long pfn);
 extern int sysctl_memory_failure_early_kill;
 extern int sysctl_memory_failure_recovery;
index 0085bb0..bc3dc63 100644 (file)
@@ -68,6 +68,7 @@ struct device_node {
 /* Pointer for first entry in chain of all nodes. */
 extern struct device_node *allnodes;
 extern struct device_node *of_chosen;
+extern struct device_node *of_aliases;
 extern rwlock_t devtree_lock;
 
 static inline bool of_have_populated_dt(void)
@@ -209,6 +210,9 @@ extern int of_device_is_available(const struct device_node *device);
 extern const void *of_get_property(const struct device_node *node,
                                const char *name,
                                int *lenp);
+#define for_each_property(pp, properties) \
+       for (pp = properties; pp != NULL; pp = pp->next)
+
 extern int of_n_addr_cells(struct device_node *np);
 extern int of_n_size_cells(struct device_node *np);
 extern const struct of_device_id *of_match_node(
@@ -221,6 +225,10 @@ extern int of_parse_phandles_with_args(struct device_node *np,
        const char *list_name, const char *cells_name, int index,
        struct device_node **out_node, const void **out_args);
 
+extern void *early_init_dt_alloc_memory_arch(u64 size, u64 align);
+extern void of_alias_scan(void);
+extern int of_alias_get_id(struct device_node *np, const char *stem);
+
 extern int of_machine_is_compatible(const char *compat);
 
 extern int prom_add_property(struct device_node* np, struct property* prop);
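
For of_alias_get_id(), a sketch of the intended driver usage (the
device node np is hypothetical): resolve e.g. a "serial2" alias
pointing at np to index 2, for stable device numbering.

    int id = of_alias_get_id(np, "serial");
    if (id < 0)
            id = 0;     /* no alias found: fall back to a default */
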
index c84d900..b74b74f 100644 (file)
@@ -97,7 +97,6 @@ extern void early_init_dt_check_for_initrd(unsigned long node);
 extern int early_init_dt_scan_memory(unsigned long node, const char *uname,
                                     int depth, void *data);
 extern void early_init_dt_add_memory_arch(u64 base, u64 size);
-extern void * early_init_dt_alloc_memory_arch(u64 size, u64 align);
 extern u64 dt_mem_next_cell(int s, __be32 **cellp);
 
 /*
index 23241c2..9d4539c 100644 (file)
  * when it is shrunk, before we rcu free the node. See shrink code for
  * details.
  */
-#define RADIX_TREE_INDIRECT_PTR        1
+#define RADIX_TREE_INDIRECT_PTR                1
+/*
+ * A common use of the radix tree is to store pointers to struct pages;
+ * but shmem/tmpfs also needs to store swap entries in the same tree:
+ * those are marked as exceptional entries to distinguish them.
+ * EXCEPTIONAL_ENTRY tests the bit, EXCEPTIONAL_SHIFT shifts content past it.
+ */
+#define RADIX_TREE_EXCEPTIONAL_ENTRY   2
+#define RADIX_TREE_EXCEPTIONAL_SHIFT   2
 
 #define radix_tree_indirect_to_ptr(ptr) \
        radix_tree_indirect_to_ptr((void __force *)(ptr))
@@ -173,6 +181,28 @@ static inline int radix_tree_deref_retry(void *arg)
        return unlikely((unsigned long)arg & RADIX_TREE_INDIRECT_PTR);
 }
 
+/**
+ * radix_tree_exceptional_entry        - radix_tree_deref_slot gave exceptional entry?
+ * @arg:       value returned by radix_tree_deref_slot
+ * Returns:    0 if well-aligned pointer, non-0 if exceptional entry.
+ */
+static inline int radix_tree_exceptional_entry(void *arg)
+{
+       /* Not unlikely because radix_tree_exception often tested first */
+       return (unsigned long)arg & RADIX_TREE_EXCEPTIONAL_ENTRY;
+}
+
+/**
+ * radix_tree_exception        - radix_tree_deref_slot returned either kind of exception?
+ * @arg:       value returned by radix_tree_deref_slot
+ * Returns:    0 if well-aligned pointer, non-0 if either kind of exception.
+ */
+static inline int radix_tree_exception(void *arg)
+{
+       return unlikely((unsigned long)arg &
+               (RADIX_TREE_INDIRECT_PTR | RADIX_TREE_EXCEPTIONAL_ENTRY));
+}
+
 /**
  * radix_tree_replace_slot     - replace item in a slot
  * @pslot:     pointer to slot, returned by radix_tree_lookup_slot
@@ -194,8 +224,8 @@ void *radix_tree_delete(struct radix_tree_root *, unsigned long);
 unsigned int
 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
                        unsigned long first_index, unsigned int max_items);
-unsigned int
-radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
+unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
+                       void ***results, unsigned long *indices,
                        unsigned long first_index, unsigned int max_items);
 unsigned long radix_tree_next_hole(struct radix_tree_root *root,
                                unsigned long index, unsigned long max_scan);
@@ -222,6 +252,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
                unsigned long nr_to_tag,
                unsigned int fromtag, unsigned int totag);
 int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
+unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
 
 static inline void radix_tree_preload_end(void)
 {
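
With these predicates, a lookup can cheaply distinguish ordinary
well-aligned pointers from exceptional entries; a hedged sketch (the
mapping/index and handlers are illustrative; the swap helpers are the
ones added to swapops.h below):

    void *entry = radix_tree_lookup(&mapping->page_tree, index);

    if (radix_tree_exceptional_entry(entry)) {
            /* low bit set: a shmem swap entry, not a struct page */
            swp_entry_t swap = radix_to_swp_entry(entry);
            /* ... handle the swap entry ... */
    } else {
            struct page *page = entry; /* ordinary pointer (or NULL) */
            /* ... normal page handling ... */
    }
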
index aa08fa8..9291ac3 100644 (file)
@@ -8,22 +8,15 @@
 
 /* inode in-kernel data */
 
-#define SHMEM_NR_DIRECT 16
-
-#define SHMEM_SYMLINK_INLINE_LEN (SHMEM_NR_DIRECT * sizeof(swp_entry_t))
-
 struct shmem_inode_info {
        spinlock_t              lock;
        unsigned long           flags;
        unsigned long           alloced;        /* data pages alloced to file */
-       unsigned long           swapped;        /* subtotal assigned to swap */
-       unsigned long           next_index;     /* highest alloced index + 1 */
-       struct shared_policy    policy;         /* NUMA memory alloc policy */
-       struct page             *i_indirect;    /* top indirect blocks page */
        union {
-               swp_entry_t     i_direct[SHMEM_NR_DIRECT]; /* first blocks */
-               char            inline_symlink[SHMEM_SYMLINK_INLINE_LEN];
+               unsigned long   swapped;        /* subtotal assigned to swap */
+               char            *symlink;       /* unswappable short symlink */
        };
+       struct shared_policy    policy;         /* NUMA memory alloc policy */
        struct list_head        swaplist;       /* chain of maybes on swap */
        struct list_head        xattr_list;     /* list of shmem_xattr */
        struct inode            vfs_inode;
@@ -49,7 +42,7 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
 /*
  * Functions in mm/shmem.c called directly from elsewhere:
  */
-extern int init_tmpfs(void);
+extern int shmem_init(void);
 extern int shmem_fill_super(struct super_block *sb, void *data, int silent);
 extern struct file *shmem_file_setup(const char *name,
                                        loff_t size, unsigned long flags);
@@ -59,8 +52,6 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
                                        pgoff_t index, gfp_t gfp_mask);
 extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
-extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
-                                       struct page **pagep, swp_entry_t *ent);
 
 static inline struct page *shmem_read_mapping_page(
                                struct address_space *mapping, pgoff_t index)
index cd42e30..2189d3f 100644 (file)
@@ -1,3 +1,8 @@
+#ifndef _LINUX_SWAPOPS_H
+#define _LINUX_SWAPOPS_H
+
+#include <linux/radix-tree.h>
+
 /*
  * swapcache pages are stored in the swapper_space radix tree.  We want to
  * get good packing density in that tree, so the index should be dense in
@@ -76,6 +81,22 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry)
        return __swp_entry_to_pte(arch_entry);
 }
 
+static inline swp_entry_t radix_to_swp_entry(void *arg)
+{
+       swp_entry_t entry;
+
+       entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
+       return entry;
+}
+
+static inline void *swp_to_radix_entry(swp_entry_t entry)
+{
+       unsigned long value;
+
+       value = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT;
+       return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY);
+}
+
 #ifdef CONFIG_MIGRATION
 static inline swp_entry_t make_migration_entry(struct page *page, int write)
 {
@@ -169,3 +190,5 @@ static inline int non_swap_entry(swp_entry_t entry)
        return 0;
 }
 #endif
+
+#endif /* _LINUX_SWAPOPS_H */
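
The encoding shifts the swap entry up by RADIX_TREE_EXCEPTIONAL_SHIFT
and sets the low tag bit, so the round trip is lossless as long as
the top bits of .val are free; a quick illustration:

    swp_entry_t e = { .val = 0x1234 };
    void *slot = swp_to_radix_entry(e);          /* (0x1234 << 2) | 2 */
    swp_entry_t back = radix_to_swp_entry(slot); /* back.val == 0x1234 */
    /* radix_tree_exceptional_entry(slot) is non-zero */
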
index d3ec89f..47b4a27 100644 (file)
@@ -85,22 +85,6 @@ struct thermal_cooling_device {
                                ((long)t-2732+5)/10 : ((long)t-2732-5)/10)
 #define CELSIUS_TO_KELVIN(t)   ((t)*10+2732)
 
-#if defined(CONFIG_THERMAL_HWMON)
-/* thermal zone devices with the same type share one hwmon device */
-struct thermal_hwmon_device {
-       char type[THERMAL_NAME_LENGTH];
-       struct device *device;
-       int count;
-       struct list_head tz_list;
-       struct list_head node;
-};
-
-struct thermal_hwmon_attr {
-       struct device_attribute attr;
-       char name[16];
-};
-#endif
-
 struct thermal_zone_device {
        int id;
        char type[THERMAL_NAME_LENGTH];
@@ -120,12 +104,6 @@ struct thermal_zone_device {
        struct mutex lock;      /* protect cooling devices list */
        struct list_head node;
        struct delayed_work poll_queue;
-#if defined(CONFIG_THERMAL_HWMON)
-       struct list_head hwmon_node;
-       struct thermal_hwmon_device *hwmon;
-       struct thermal_hwmon_attr temp_input;   /* hwmon sys attr */
-       struct thermal_hwmon_attr temp_crit;    /* hwmon sys attr */
-#endif
 };
 /* Adding event notification support elements */
 #define THERMAL_GENL_FAMILY_NAME                "thermal_event"
index d7211fa..9c51ee7 100644 (file)
@@ -369,9 +369,12 @@ static noinline void __init_refok rest_init(void)
        init_idle_bootup_task(current);
        preempt_enable_no_resched();
        schedule();
-       preempt_disable();
+
+       /* At this point, we can enable user mode helper functionality */
+       usermodehelper_enable();
 
        /* Call into cpu_idle with preempt disabled */
+       preempt_disable();
        cpu_idle();
 }
 
@@ -715,7 +718,7 @@ static void __init do_basic_setup(void)
 {
        cpuset_init_smp();
        usermodehelper_init();
-       init_tmpfs();
+       shmem_init();
        driver_init();
        init_irq_proc();
        do_ctors();
index 9fb044f..b5bae9d 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -294,7 +294,7 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data)
 void shm_destroy_orphaned(struct ipc_namespace *ns)
 {
        down_write(&shm_ids(ns).rw_mutex);
-       if (&shm_ids(ns).in_use)
+       if (shm_ids(ns).in_use)
                idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
        up_write(&shm_ids(ns).rw_mutex);
 }
@@ -304,9 +304,12 @@ void exit_shm(struct task_struct *task)
 {
        struct ipc_namespace *ns = task->nsproxy->ipc_ns;
 
+       if (shm_ids(ns).in_use == 0)
+               return;
+
        /* Destroy all already created segments, but not mapped yet */
        down_write(&shm_ids(ns).rw_mutex);
-       if (&shm_ids(ns).in_use)
+       if (shm_ids(ns).in_use)
                idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns);
        up_write(&shm_ids(ns).rw_mutex);
 }
index 47613df..ddc7644 100644 (file)
@@ -274,7 +274,7 @@ static void __call_usermodehelper(struct work_struct *work)
  * (used for preventing user land processes from being created after the user
  * land has been frozen during a system-wide hibernation or suspend operation).
  */
-static int usermodehelper_disabled;
+static int usermodehelper_disabled = 1;
 
 /* Number of helpers running */
 static atomic_t running_helpers = ATOMIC_INIT(0);
index d1db288..e19ce14 100644 (file)
@@ -291,30 +291,28 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
        if (!cpumask_subset(mask, cpu_possible_mask))
                return -EINVAL;
 
-       s = NULL;
        if (isadd == REGISTER) {
                for_each_cpu(cpu, mask) {
-                       if (!s)
-                               s = kmalloc_node(sizeof(struct listener),
-                                                GFP_KERNEL, cpu_to_node(cpu));
+                       s = kmalloc_node(sizeof(struct listener),
+                                       GFP_KERNEL, cpu_to_node(cpu));
                        if (!s)
                                goto cleanup;
+
                        s->pid = pid;
-                       INIT_LIST_HEAD(&s->list);
                        s->valid = 1;
 
                        listeners = &per_cpu(listener_array, cpu);
                        down_write(&listeners->sem);
-                       list_for_each_entry_safe(s2, tmp, &listeners->list, list) {
-                               if (s2->pid == pid)
-                                       goto next_cpu;
+                       list_for_each_entry(s2, &listeners->list, list) {
+                               if (s2->pid == pid && s2->valid)
+                                       goto exists;
                        }
                        list_add(&s->list, &listeners->list);
                        s = NULL;
-next_cpu:
+exists:
                        up_write(&listeners->sem);
+                       kfree(s); /* nop if NULL */
                }
-               kfree(s);
                return 0;
        }
 
index 32f3e5a..6c695ff 100644 (file)
@@ -276,4 +276,7 @@ config CORDIC
          so its calculations are in fixed point. Modules can select this
          when they require this function. Module will be called cordic.
 
+config LLIST
+       bool
+
 endmenu
index 892f4e2..6457af4 100644 (file)
@@ -115,6 +115,8 @@ obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o
 
 obj-$(CONFIG_CORDIC) += cordic.o
 
+obj-$(CONFIG_LLIST) += llist.o
+
 hostprogs-y    := gen_crc32table
 clean-files    := crc32table.h
 
index 37ef4b0..2f4412e 100644 (file)
@@ -271,8 +271,6 @@ int __bitmap_weight(const unsigned long *bitmap, int bits)
 }
 EXPORT_SYMBOL(__bitmap_weight);
 
-#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))
-
 void bitmap_set(unsigned long *map, int start, int nr)
 {
        unsigned long *p = map + BIT_WORD(start);
index 2577b12..f193b77 100644 (file)
@@ -197,21 +197,15 @@ static struct dentry *debugfs_create_atomic_t(const char *name, mode_t mode,
        return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
 }
 
-void cleanup_fault_attr_dentries(struct fault_attr *attr)
-{
-       debugfs_remove_recursive(attr->dir);
-}
-
-int init_fault_attr_dentries(struct fault_attr *attr, const char *name)
+struct dentry *fault_create_debugfs_attr(const char *name,
+                       struct dentry *parent, struct fault_attr *attr)
 {
        mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
        struct dentry *dir;
 
-       dir = debugfs_create_dir(name, NULL);
+       dir = debugfs_create_dir(name, parent);
        if (!dir)
-               return -ENOMEM;
-
-       attr->dir = dir;
+               return ERR_PTR(-ENOMEM);
 
        if (!debugfs_create_ul("probability", mode, dir, &attr->probability))
                goto fail;
@@ -243,11 +237,11 @@ int init_fault_attr_dentries(struct fault_attr *attr, const char *name)
 
 #endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */
 
-       return 0;
+       return dir;
 fail:
-       debugfs_remove_recursive(attr->dir);
+       debugfs_remove_recursive(dir);
 
-       return -ENOMEM;
+       return ERR_PTR(-ENOMEM);
 }
 
 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
index 577ddf8..f352cc4 100644 (file)
@@ -1,8 +1,26 @@
 /*
- * Basic general purpose allocator for managing special purpose memory
- * not managed by the regular kmalloc/kfree interface.
- * Uses for this includes on-device special memory, uncached memory
- * etc.
+ * Basic general purpose allocator for managing special purpose
+ * memory, for example, memory that is not managed by the regular
+ * kmalloc/kfree interface.  Uses for this include on-device special
+ * memory, uncached memory, etc.
+ *
+ * It is safe to use the allocator in NMI handlers and other special
+ * unblockable contexts that could otherwise deadlock on locks.  This
+ * is implemented by using atomic operations and retries on any
+ * conflicts.  The disadvantage is that there may be livelocks in
+ * extreme cases.  For better scalability, one allocator can be used
+ * for each CPU.
+ *
+ * The lockless operation only works if there is enough memory
+ * available.  If new memory is added to the pool, a lock still has
+ * to be taken.  So any user relying on locklessness has to ensure
+ * that sufficient memory is preallocated.
+ *
+ * The basic atomic operation of this allocator is cmpxchg on long.
+ * On architectures that don't have an NMI-safe cmpxchg
+ * implementation, the allocator can NOT be used in NMI handlers.
+ * So code that uses the allocator in an NMI handler should depend
+ * on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
  *
  * Copyright 2005 (C) Jes Sorensen <jes@trained-monkey.org>
  *
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/bitmap.h>
+#include <linux/rculist.h>
+#include <linux/interrupt.h>
 #include <linux/genalloc.h>
 
+static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set)
+{
+       unsigned long val, nval;
+
+       nval = *addr;
+       do {
+               val = nval;
+               if (val & mask_to_set)
+                       return -EBUSY;
+               cpu_relax();
+       } while ((nval = cmpxchg(addr, val, val | mask_to_set)) != val);
+
+       return 0;
+}
+
+static int clear_bits_ll(unsigned long *addr, unsigned long mask_to_clear)
+{
+       unsigned long val, nval;
+
+       nval = *addr;
+       do {
+               val = nval;
+               if ((val & mask_to_clear) != mask_to_clear)
+                       return -EBUSY;
+               cpu_relax();
+       } while ((nval = cmpxchg(addr, val, val & ~mask_to_clear)) != val);
+
+       return 0;
+}
+
+/*
+ * bitmap_set_ll - set the specified number of bits at the specified position
+ * @map: pointer to a bitmap
+ * @start: a bit position in @map
+ * @nr: number of bits to set
+ *
+ * Set @nr bits starting from @start in @map lock-lessly. Several
+ * users can set/clear the same bitmap simultaneously without a lock.
+ * If two users set the same bit, one user will return the number of
+ * remaining bits, otherwise 0 is returned.
+ */
+static int bitmap_set_ll(unsigned long *map, int start, int nr)
+{
+       unsigned long *p = map + BIT_WORD(start);
+       const int size = start + nr;
+       int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
+       unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
+
+       while (nr - bits_to_set >= 0) {
+               if (set_bits_ll(p, mask_to_set))
+                       return nr;
+               nr -= bits_to_set;
+               bits_to_set = BITS_PER_LONG;
+               mask_to_set = ~0UL;
+               p++;
+       }
+       if (nr) {
+               mask_to_set &= BITMAP_LAST_WORD_MASK(size);
+               if (set_bits_ll(p, mask_to_set))
+                       return nr;
+       }
+
+       return 0;
+}
+
+/*
+ * bitmap_clear_ll - clear the specified number of bits at the specified position
+ * @map: pointer to a bitmap
+ * @start: a bit position in @map
+ * @nr: number of bits to clear
+ *
+ * Clear @nr bits starting from @start in @map lock-lessly. Several
+ * users can set/clear the same bitmap simultaneously without a lock.
+ * If two users clear the same bit, one user will return the number
+ * of remaining bits, otherwise 0 is returned.
+ */
+static int bitmap_clear_ll(unsigned long *map, int start, int nr)
+{
+       unsigned long *p = map + BIT_WORD(start);
+       const int size = start + nr;
+       int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+       unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+
+       while (nr - bits_to_clear >= 0) {
+               if (clear_bits_ll(p, mask_to_clear))
+                       return nr;
+               nr -= bits_to_clear;
+               bits_to_clear = BITS_PER_LONG;
+               mask_to_clear = ~0UL;
+               p++;
+       }
+       if (nr) {
+               mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+               if (clear_bits_ll(p, mask_to_clear))
+                       return nr;
+       }
+
+       return 0;
+}
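
The non-zero return value is the rollback convention: a sketch of how
a caller undoes a partially successful lockless update (this mirrors
the gen_pool_alloc() retry path further below):

    remain = bitmap_set_ll(chunk->bits, start_bit, nbits);
    if (remain) {
            /* lost a race: clear the bits that were actually set */
            bitmap_clear_ll(chunk->bits, start_bit, nbits - remain);
            /* then retry the zero-area search from start_bit */
    }
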
 
 /**
  * gen_pool_create - create a new special memory pool
@@ -30,7 +149,7 @@ struct gen_pool *gen_pool_create(int min_alloc_order, int nid)
 
        pool = kmalloc_node(sizeof(struct gen_pool), GFP_KERNEL, nid);
        if (pool != NULL) {
-               rwlock_init(&pool->lock);
+               spin_lock_init(&pool->lock);
                INIT_LIST_HEAD(&pool->chunks);
                pool->min_alloc_order = min_alloc_order;
        }
@@ -63,14 +182,14 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy
        if (unlikely(chunk == NULL))
                return -ENOMEM;
 
-       spin_lock_init(&chunk->lock);
        chunk->phys_addr = phys;
        chunk->start_addr = virt;
        chunk->end_addr = virt + size;
+       atomic_set(&chunk->avail, size);
 
-       write_lock(&pool->lock);
-       list_add(&chunk->next_chunk, &pool->chunks);
-       write_unlock(&pool->lock);
+       spin_lock(&pool->lock);
+       list_add_rcu(&chunk->next_chunk, &pool->chunks);
+       spin_unlock(&pool->lock);
 
        return 0;
 }
@@ -85,19 +204,19 @@ EXPORT_SYMBOL(gen_pool_add_virt);
  */
 phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr)
 {
-       struct list_head *_chunk;
        struct gen_pool_chunk *chunk;
+       phys_addr_t paddr = -1;
 
-       read_lock(&pool->lock);
-       list_for_each(_chunk, &pool->chunks) {
-               chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
-
-               if (addr >= chunk->start_addr && addr < chunk->end_addr)
-                       return chunk->phys_addr + addr - chunk->start_addr;
+       rcu_read_lock();
+       list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
+               if (addr >= chunk->start_addr && addr < chunk->end_addr) {
+                       paddr = chunk->phys_addr + (addr - chunk->start_addr);
+                       break;
+               }
        }
-       read_unlock(&pool->lock);
+       rcu_read_unlock();
 
-       return -1;
+       return paddr;
 }
 EXPORT_SYMBOL(gen_pool_virt_to_phys);
 
@@ -115,7 +234,6 @@ void gen_pool_destroy(struct gen_pool *pool)
        int order = pool->min_alloc_order;
        int bit, end_bit;
 
-
        list_for_each_safe(_chunk, _next_chunk, &pool->chunks) {
                chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
                list_del(&chunk->next_chunk);
@@ -137,44 +255,50 @@ EXPORT_SYMBOL(gen_pool_destroy);
  * @size: number of bytes to allocate from the pool
  *
  * Allocate the requested number of bytes from the specified pool.
- * Uses a first-fit algorithm.
+ * Uses a first-fit algorithm. Cannot be used in an NMI handler on
+ * architectures without an NMI-safe cmpxchg implementation.
  */
 unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size)
 {
-       struct list_head *_chunk;
        struct gen_pool_chunk *chunk;
-       unsigned long addr, flags;
+       unsigned long addr = 0;
        int order = pool->min_alloc_order;
-       int nbits, start_bit, end_bit;
+       int nbits, start_bit = 0, end_bit, remain;
+
+#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+       BUG_ON(in_nmi());
+#endif
 
        if (size == 0)
                return 0;
 
        nbits = (size + (1UL << order) - 1) >> order;
-
-       read_lock(&pool->lock);
-       list_for_each(_chunk, &pool->chunks) {
-               chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
+       rcu_read_lock();
+       list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
+               if (size > atomic_read(&chunk->avail))
+                       continue;
 
                end_bit = (chunk->end_addr - chunk->start_addr) >> order;
-
-               spin_lock_irqsave(&chunk->lock, flags);
-               start_bit = bitmap_find_next_zero_area(chunk->bits, end_bit, 0,
-                                               nbits, 0);
-               if (start_bit >= end_bit) {
-                       spin_unlock_irqrestore(&chunk->lock, flags);
+retry:
+               start_bit = bitmap_find_next_zero_area(chunk->bits, end_bit,
+                                                      start_bit, nbits, 0);
+               if (start_bit >= end_bit)
                        continue;
+               remain = bitmap_set_ll(chunk->bits, start_bit, nbits);
+               if (remain) {
+                       remain = bitmap_clear_ll(chunk->bits, start_bit,
+                                                nbits - remain);
+                       BUG_ON(remain);
+                       goto retry;
                }
 
                addr = chunk->start_addr + ((unsigned long)start_bit << order);
-
-               bitmap_set(chunk->bits, start_bit, nbits);
-               spin_unlock_irqrestore(&chunk->lock, flags);
-               read_unlock(&pool->lock);
-               return addr;
+               size = nbits << order;
+               atomic_sub(size, &chunk->avail);
+               break;
        }
-       read_unlock(&pool->lock);
-       return 0;
+       rcu_read_unlock();
+       return addr;
 }
 EXPORT_SYMBOL(gen_pool_alloc);
 
@@ -184,33 +308,95 @@ EXPORT_SYMBOL(gen_pool_alloc);
  * @addr: starting address of memory to free back to pool
  * @size: size in bytes of memory to free
  *
- * Free previously allocated special memory back to the specified pool.
+ * Free previously allocated special memory back to the specified
+ * pool.  Cannot be used in an NMI handler on architectures without
+ * an NMI-safe cmpxchg implementation.
  */
 void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size)
 {
-       struct list_head *_chunk;
        struct gen_pool_chunk *chunk;
-       unsigned long flags;
        int order = pool->min_alloc_order;
-       int bit, nbits;
+       int start_bit, nbits, remain;
 
-       nbits = (size + (1UL << order) - 1) >> order;
-
-       read_lock(&pool->lock);
-       list_for_each(_chunk, &pool->chunks) {
-               chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
+#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+       BUG_ON(in_nmi());
+#endif
 
+       nbits = (size + (1UL << order) - 1) >> order;
+       rcu_read_lock();
+       list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
                if (addr >= chunk->start_addr && addr < chunk->end_addr) {
                        BUG_ON(addr + size > chunk->end_addr);
-                       spin_lock_irqsave(&chunk->lock, flags);
-                       bit = (addr - chunk->start_addr) >> order;
-                       while (nbits--)
-                               __clear_bit(bit++, chunk->bits);
-                       spin_unlock_irqrestore(&chunk->lock, flags);
-                       break;
+                       start_bit = (addr - chunk->start_addr) >> order;
+                       remain = bitmap_clear_ll(chunk->bits, start_bit, nbits);
+                       BUG_ON(remain);
+                       size = nbits << order;
+                       atomic_add(size, &chunk->avail);
+                       rcu_read_unlock();
+                       return;
                }
        }
-       BUG_ON(nbits > 0);
-       read_unlock(&pool->lock);
+       rcu_read_unlock();
+       BUG();
 }
 EXPORT_SYMBOL(gen_pool_free);
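
A sketch of typical client usage of this API, assuming a hypothetical on-chip SRAM region already mapped by the caller; sram_pool, sram_virt/sram_phys and the order-8 granularity are illustrative, not part of this patch:

#include <linux/errno.h>
#include <linux/genalloc.h>

static struct gen_pool *sram_pool;

static int sram_pool_init(unsigned long sram_virt, phys_addr_t sram_phys,
                          size_t sram_size)
{
        /* order 8: 256-byte minimum allocation granularity; nid -1: any node */
        sram_pool = gen_pool_create(8, -1);
        if (!sram_pool)
                return -ENOMEM;

        /* record virt and phys so gen_pool_virt_to_phys() works for DMA users */
        if (gen_pool_add_virt(sram_pool, sram_virt, sram_phys, sram_size, -1)) {
                gen_pool_destroy(sram_pool);
                return -ENOMEM;
        }
        return 0;
}

static unsigned long sram_alloc(size_t size)
{
        /* with the lockless rework above, this is safe from atomic context */
        return gen_pool_alloc(sram_pool, size);
}

static void sram_free(unsigned long addr, size_t size)
{
        gen_pool_free(sram_pool, addr, size);
}
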
+
+/**
+ * gen_pool_for_each_chunk - call func for every chunk of generic memory pool
+ * @pool:      the generic memory pool
+ * @func:      function to call for each chunk
+ * @data:      additional data passed to @func
+ *
+ * Call @func for every chunk of the generic memory pool.  @func is
+ * called with rcu_read_lock held, so it must not sleep.
+ */
+void gen_pool_for_each_chunk(struct gen_pool *pool,
+       void (*func)(struct gen_pool *pool, struct gen_pool_chunk *chunk, void *data),
+       void *data)
+{
+       struct gen_pool_chunk *chunk;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(chunk, &(pool)->chunks, next_chunk)
+               func(pool, chunk, data);
+       rcu_read_unlock();
+}
+EXPORT_SYMBOL(gen_pool_for_each_chunk);
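
A small sketch of a conforming callback (count_chunk and nchunks are illustrative); since it runs under rcu_read_lock, it must not sleep:

static void count_chunk(struct gen_pool *pool,
                        struct gen_pool_chunk *chunk, void *data)
{
        unsigned int *count = data;

        (*count)++;
}

/* caller: */
        unsigned int nchunks = 0;

        gen_pool_for_each_chunk(pool, count_chunk, &nchunks);
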
+
+/**
+ * gen_pool_avail - get available free space of the pool
+ * @pool: pool to get available free space
+ *
+ * Return available free space of the specified pool.
+ */
+size_t gen_pool_avail(struct gen_pool *pool)
+{
+       struct gen_pool_chunk *chunk;
+       size_t avail = 0;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk)
+               avail += atomic_read(&chunk->avail);
+       rcu_read_unlock();
+       return avail;
+}
+EXPORT_SYMBOL_GPL(gen_pool_avail);
+
+/**
+ * gen_pool_size - get size in bytes of memory managed by the pool
+ * @pool: pool to get size
+ *
+ * Return size in bytes of memory managed by the pool.
+ */
+size_t gen_pool_size(struct gen_pool *pool)
+{
+       struct gen_pool_chunk *chunk;
+       size_t size = 0;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk)
+               size += chunk->end_addr - chunk->start_addr;
+       rcu_read_unlock();
+       return size;
+}
+EXPORT_SYMBOL_GPL(gen_pool_size);
index e15502e..db040ce 100644
--- a/lib/idr.c
+++ b/lib/idr.c
 #include <linux/err.h>
 #include <linux/string.h>
 #include <linux/idr.h>
+#include <linux/spinlock.h>
 
 static struct kmem_cache *idr_layer_cache;
+static DEFINE_SPINLOCK(simple_ida_lock);
 
 static struct idr_layer *get_from_free_list(struct idr *idp)
 {
@@ -925,6 +927,71 @@ void ida_destroy(struct ida *ida)
 }
 EXPORT_SYMBOL(ida_destroy);
 
+/**
+ * ida_simple_get - get a new id.
+ * @ida: the (initialized) ida.
+ * @start: the minimum id (inclusive, < 0x80000000)
+ * @end: the maximum id (exclusive, < 0x80000000, or 0 for no maximum)
+ * @gfp_mask: memory allocation flags
+ *
+ * Allocates an id in the range start <= id < end, or returns -ENOSPC.
+ * On memory allocation failure, returns -ENOMEM.
+ *
+ * Use ida_simple_remove() to get rid of an id.
+ */
+int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
+                  gfp_t gfp_mask)
+{
+       int ret, id;
+       unsigned int max;
+
+       BUG_ON((int)start < 0);
+       BUG_ON((int)end < 0);
+
+       if (end == 0)
+               max = 0x80000000;
+       else {
+               BUG_ON(end < start);
+               max = end - 1;
+       }
+
+again:
+       if (!ida_pre_get(ida, gfp_mask))
+               return -ENOMEM;
+
+       spin_lock(&simple_ida_lock);
+       ret = ida_get_new_above(ida, start, &id);
+       if (!ret) {
+               if (id > max) {
+                       ida_remove(ida, id);
+                       ret = -ENOSPC;
+               } else {
+                       ret = id;
+               }
+       }
+       spin_unlock(&simple_ida_lock);
+
+       if (unlikely(ret == -EAGAIN))
+               goto again;
+
+       return ret;
+}
+EXPORT_SYMBOL(ida_simple_get);
+
+/**
+ * ida_simple_remove - remove an allocated id.
+ * @ida: the (initialized) ida.
+ * @id: the id returned by ida_simple_get.
+ */
+void ida_simple_remove(struct ida *ida, unsigned int id)
+{
+       BUG_ON((int)id < 0);
+       spin_lock(&simple_ida_lock);
+       ida_remove(ida, id);
+       spin_unlock(&simple_ida_lock);
+}
+EXPORT_SYMBOL(ida_simple_remove);
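
A sketch of how a driver might use this pair to hand out minor numbers; my_minor_ida and the range of 256 are illustrative:

#include <linux/gfp.h>
#include <linux/idr.h>

static DEFINE_IDA(my_minor_ida);

static int my_device_register(void)
{
        int minor;

        /* lowest free id in [0, 256); may sleep with GFP_KERNEL */
        minor = ida_simple_get(&my_minor_ida, 0, 256, GFP_KERNEL);
        if (minor < 0)
                return minor;   /* -ENOMEM or -ENOSPC */
        /* ... create the device using 'minor' ... */
        return 0;
}

static void my_device_unregister(int minor)
{
        ida_simple_remove(&my_minor_ida, minor);
}
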
+
 /**
  * ida_init - initialize ida handle
  * @ida:       ida handle
diff --git a/lib/llist.c b/lib/llist.c
new file mode 100644
index 0000000..da44572
--- /dev/null
+++ b/lib/llist.c
@@ -0,0 +1,129 @@
+/*
+ * Lock-less NULL-terminated singly linked list
+ *
+ * The basic atomic operation of this list is cmpxchg on long.  On
+ * architectures that don't have an NMI-safe cmpxchg implementation,
+ * the list can NOT be used in NMI handlers.  So code that uses the
+ * list in an NMI handler should depend on
+ * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
+ *
+ * Copyright 2010,2011 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/llist.h>
+
+#include <asm/system.h>
+
+/**
+ * llist_add - add a new entry
+ * @new:       new entry to be added
+ * @head:      the head for your lock-less list
+ */
+void llist_add(struct llist_node *new, struct llist_head *head)
+{
+       struct llist_node *entry, *old_entry;
+
+#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+       BUG_ON(in_nmi());
+#endif
+
+       entry = head->first;
+       do {
+               old_entry = entry;
+               new->next = entry;
+               cpu_relax();
+       } while ((entry = cmpxchg(&head->first, old_entry, new)) != old_entry);
+}
+EXPORT_SYMBOL_GPL(llist_add);
+
+/**
+ * llist_add_batch - add several linked entries in batch
+ * @new_first: first entry in batch to be added
+ * @new_last:  last entry in batch to be added
+ * @head:      the head for your lock-less list
+ */
+void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
+                    struct llist_head *head)
+{
+       struct llist_node *entry, *old_entry;
+
+#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+       BUG_ON(in_nmi());
+#endif
+
+       entry = head->first;
+       do {
+               old_entry = entry;
+               new_last->next = entry;
+               cpu_relax();
+       } while ((entry = cmpxchg(&head->first, old_entry, new_first)) != old_entry);
+}
+EXPORT_SYMBOL_GPL(llist_add_batch);
+
+/**
+ * llist_del_first - delete the first entry of lock-less list
+ * @head:      the head for your lock-less list
+ *
+ * If the list is empty, return NULL; otherwise return the first entry
+ * deleted, which is the most recently added one.
+ *
+ * Only one llist_del_first consumer may run concurrently with multiple
+ * llist_add producers without a lock.  Otherwise an llist_del_first,
+ * llist_add, llist_add (or llist_del_all, llist_add, llist_add)
+ * sequence in another user may change @head->first->next while leaving
+ * @head->first unchanged, so the cmpxchg would wrongly succeed (the
+ * classic ABA problem).  If multiple consumers are needed, use
+ * llist_del_all or a lock between consumers.
+ */
+struct llist_node *llist_del_first(struct llist_head *head)
+{
+       struct llist_node *entry, *old_entry, *next;
+
+#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+       BUG_ON(in_nmi());
+#endif
+
+       entry = head->first;
+       do {
+               if (entry == NULL)
+                       return NULL;
+               old_entry = entry;
+               next = entry->next;
+               cpu_relax();
+       } while ((entry = cmpxchg(&head->first, old_entry, next)) != old_entry);
+
+       return entry;
+}
+EXPORT_SYMBOL_GPL(llist_del_first);
+
+/**
+ * llist_del_all - delete all entries from lock-less list
+ * @head:      the head of lock-less list to delete all entries
+ *
+ * If the list is empty, return NULL; otherwise delete all entries and
+ * return a pointer to the first entry.  Entries are returned in order
+ * from the newest to the oldest added.
+ */
+struct llist_node *llist_del_all(struct llist_head *head)
+{
+#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+       BUG_ON(in_nmi());
+#endif
+
+       return xchg(&head->first, NULL);
+}
+EXPORT_SYMBOL_GPL(llist_del_all);
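
A sketch of the intended single-consumer pattern (the struct and function names are illustrative): producers in IRQ context push with llist_add(), one worker drains with llist_del_all() and walks ->next directly:

#include <linux/kernel.h>
#include <linux/llist.h>
#include <linux/slab.h>

struct my_event {
        struct llist_node node;
        int payload;
};

static LLIST_HEAD(pending_events);

/* producer: lock-less, usable from IRQ context */
static void my_event_post(struct my_event *ev)
{
        llist_add(&ev->node, &pending_events);
}

/* single consumer: atomically take the whole list, newest entry first */
static void my_event_drain(void)
{
        struct llist_node *n = llist_del_all(&pending_events);

        while (n) {
                struct my_event *ev = container_of(n, struct my_event, node);

                n = n->next;    /* save before the entry is freed */
                /* ... process ev->payload ... */
                kfree(ev);
        }
}
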
index 7ea2e03..a2f9da5 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -823,8 +823,8 @@ unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
 EXPORT_SYMBOL(radix_tree_prev_hole);
 
 static unsigned int
-__lookup(struct radix_tree_node *slot, void ***results, unsigned long index,
-       unsigned int max_items, unsigned long *next_index)
+__lookup(struct radix_tree_node *slot, void ***results, unsigned long *indices,
+       unsigned long index, unsigned int max_items, unsigned long *next_index)
 {
        unsigned int nr_found = 0;
        unsigned int shift, height;
@@ -857,12 +857,16 @@ __lookup(struct radix_tree_node *slot, void ***results, unsigned long index,
 
        /* Bottom level: grab some items */
        for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
-               index++;
                if (slot->slots[i]) {
-                       results[nr_found++] = &(slot->slots[i]);
-                       if (nr_found == max_items)
+                       results[nr_found] = &(slot->slots[i]);
+                       if (indices)
+                               indices[nr_found] = index;
+                       if (++nr_found == max_items) {
+                               index++;
                                goto out;
+                       }
                }
+               index++;
        }
 out:
        *next_index = index;
@@ -918,8 +922,8 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 
                if (cur_index > max_index)
                        break;
-               slots_found = __lookup(node, (void ***)results + ret, cur_index,
-                                       max_items - ret, &next_index);
+               slots_found = __lookup(node, (void ***)results + ret, NULL,
+                               cur_index, max_items - ret, &next_index);
                nr_found = 0;
                for (i = 0; i < slots_found; i++) {
                        struct radix_tree_node *slot;
@@ -944,6 +948,7 @@ EXPORT_SYMBOL(radix_tree_gang_lookup);
  *     radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree
  *     @root:          radix tree root
  *     @results:       where the results of the lookup are placed
+ *     @indices:       where their indices should be placed (but usually NULL)
  *     @first_index:   start the lookup from this key
  *     @max_items:     place up to this many items at *results
  *
@@ -958,7 +963,8 @@ EXPORT_SYMBOL(radix_tree_gang_lookup);
  *     protection, radix_tree_deref_slot may fail requiring a retry.
  */
 unsigned int
-radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
+radix_tree_gang_lookup_slot(struct radix_tree_root *root,
+                       void ***results, unsigned long *indices,
                        unsigned long first_index, unsigned int max_items)
 {
        unsigned long max_index;
@@ -974,6 +980,8 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
                if (first_index > 0)
                        return 0;
                results[0] = (void **)&root->rnode;
+               if (indices)
+                       indices[0] = 0;
                return 1;
        }
        node = indirect_to_ptr(node);
@@ -987,8 +995,9 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
 
                if (cur_index > max_index)
                        break;
-               slots_found = __lookup(node, results + ret, cur_index,
-                                       max_items - ret, &next_index);
+               slots_found = __lookup(node, results + ret,
+                               indices ? indices + ret : NULL,
+                               cur_index, max_items - ret, &next_index);
                ret += slots_found;
                if (next_index == 0)
                        break;
@@ -1194,6 +1203,98 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
 }
 EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
 
+#if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP)
+#include <linux/sched.h> /* for cond_resched() */
+
+/*
+ * This linear search is at present only useful to shmem_unuse_inode().
+ */
+static unsigned long __locate(struct radix_tree_node *slot, void *item,
+                             unsigned long index, unsigned long *found_index)
+{
+       unsigned int shift, height;
+       unsigned long i;
+
+       height = slot->height;
+       shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+
+       for ( ; height > 1; height--) {
+               i = (index >> shift) & RADIX_TREE_MAP_MASK;
+               for (;;) {
+                       if (slot->slots[i] != NULL)
+                               break;
+                       index &= ~((1UL << shift) - 1);
+                       index += 1UL << shift;
+                       if (index == 0)
+                               goto out;       /* 32-bit wraparound */
+                       i++;
+                       if (i == RADIX_TREE_MAP_SIZE)
+                               goto out;
+               }
+
+               shift -= RADIX_TREE_MAP_SHIFT;
+               slot = rcu_dereference_raw(slot->slots[i]);
+               if (slot == NULL)
+                       goto out;
+       }
+
+       /* Bottom level: check items */
+       for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
+               if (slot->slots[i] == item) {
+                       *found_index = index + i;
+                       index = 0;
+                       goto out;
+               }
+       }
+       index += RADIX_TREE_MAP_SIZE;
+out:
+       return index;
+}
+
+/**
+ *     radix_tree_locate_item - search through radix tree for item
+ *     @root:          radix tree root
+ *     @item:          item to be found
+ *
+ *     Returns index where item was found, or -1 if not found.
+ *     Caller must hold no lock (since this time-consuming function needs
+ *     to be preemptible), and must check afterwards if item is still there.
+ */
+unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
+{
+       struct radix_tree_node *node;
+       unsigned long max_index;
+       unsigned long cur_index = 0;
+       unsigned long found_index = -1;
+
+       do {
+               rcu_read_lock();
+               node = rcu_dereference_raw(root->rnode);
+               if (!radix_tree_is_indirect_ptr(node)) {
+                       rcu_read_unlock();
+                       if (node == item)
+                               found_index = 0;
+                       break;
+               }
+
+               node = indirect_to_ptr(node);
+               max_index = radix_tree_maxindex(node->height);
+               if (cur_index > max_index)
+                       break;
+
+               cur_index = __locate(node, item, cur_index, &found_index);
+               rcu_read_unlock();
+               cond_resched();
+       } while (cur_index != 0 && cur_index <= max_index);
+
+       return found_index;
+}
+#else
+unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
+{
+       return -1;
+}
+#endif /* CONFIG_SHMEM && CONFIG_SWAP */
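
A sketch of the intended calling pattern, modelled on its one user (shmem's swapoff path): search without locks, then revalidate under the tree lock before acting, since the item may have moved meanwhile; locate_and_check is a hypothetical helper:

static void *locate_and_check(struct address_space *mapping, void *item)
{
        void *found = NULL;
        unsigned long index;

        index = radix_tree_locate_item(&mapping->page_tree, item);
        if (index == (unsigned long)-1)
                return NULL;            /* not (or no longer) present */

        spin_lock_irq(&mapping->tree_lock);
        if (radix_tree_lookup(&mapping->page_tree, index) == item)
                found = item;           /* still there: safe to act on */
        spin_unlock_irq(&mapping->tree_lock);
        return found;
}
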
 
 /**
  *     radix_tree_shrink    -    shrink height of a radix tree to minimal
index 1ce58c2..0dd7b8f 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -34,23 +34,23 @@ __setup("failslab=", setup_failslab);
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
 static int __init failslab_debugfs_init(void)
 {
+       struct dentry *dir;
        mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
-       int err;
 
-       err = init_fault_attr_dentries(&failslab.attr, "failslab");
-       if (err)
-               return err;
+       dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
+       if (IS_ERR(dir))
+               return PTR_ERR(dir);
 
-       if (!debugfs_create_bool("ignore-gfp-wait", mode, failslab.attr.dir,
+       if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
                                &failslab.ignore_gfp_wait))
                goto fail;
-       if (!debugfs_create_bool("cache-filter", mode, failslab.attr.dir,
+       if (!debugfs_create_bool("cache-filter", mode, dir,
                                &failslab.cache_filter))
                goto fail;
 
        return 0;
 fail:
-       cleanup_fault_attr_dentries(&failslab.attr);
+       debugfs_remove_recursive(dir);
 
        return -ENOMEM;
 }
index 867d402..645a080 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,7 +33,6 @@
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
-#include <linux/mm_inline.h> /* for page_is_file_cache() */
 #include <linux/cleancache.h>
 #include "internal.h"
 
@@ -462,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
        int error;
 
        VM_BUG_ON(!PageLocked(page));
+       VM_BUG_ON(PageSwapBacked(page));
 
        error = mem_cgroup_cache_charge(page, current->mm,
                                        gfp_mask & GFP_RECLAIM_MASK);
@@ -479,8 +479,6 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                if (likely(!error)) {
                        mapping->nrpages++;
                        __inc_zone_page_state(page, NR_FILE_PAGES);
-                       if (PageSwapBacked(page))
-                               __inc_zone_page_state(page, NR_SHMEM);
                        spin_unlock_irq(&mapping->tree_lock);
                } else {
                        page->mapping = NULL;
@@ -502,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 {
        int ret;
 
-       /*
-        * Splice_read and readahead add shmem/tmpfs pages into the page cache
-        * before shmem_readpage has a chance to mark them as SwapBacked: they
-        * need to go on the anon lru below, and mem_cgroup_cache_charge
-        * (called in add_to_page_cache) needs to know where they're going too.
-        */
-       if (mapping_cap_swap_backed(mapping))
-               SetPageSwapBacked(page);
-
        ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-       if (ret == 0) {
-               if (page_is_file_cache(page))
-                       lru_cache_add_file(page);
-               else
-                       lru_cache_add_anon(page);
-       }
+       if (ret == 0)
+               lru_cache_add_file(page);
        return ret;
 }
 EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
@@ -714,9 +699,16 @@ repeat:
                page = radix_tree_deref_slot(pagep);
                if (unlikely(!page))
                        goto out;
-               if (radix_tree_deref_retry(page))
-                       goto repeat;
-
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page))
+                               goto repeat;
+                       /*
+                        * Otherwise, shmem/tmpfs must be storing a swap entry
+                        * here as an exceptional entry: so return it without
+                        * attempting to raise page count.
+                        */
+                       goto out;
+               }
                if (!page_cache_get_speculative(page))
                        goto repeat;
 
@@ -753,7 +745,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
 
 repeat:
        page = find_get_page(mapping, offset);
-       if (page) {
+       if (page && !radix_tree_exception(page)) {
                lock_page(page);
                /* Has the page been truncated? */
                if (unlikely(page->mapping != mapping)) {
@@ -840,7 +832,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
        rcu_read_lock();
 restart:
        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-                               (void ***)pages, start, nr_pages);
+                               (void ***)pages, NULL, start, nr_pages);
        ret = 0;
        for (i = 0; i < nr_found; i++) {
                struct page *page;
@@ -849,13 +841,22 @@ repeat:
                if (unlikely(!page))
                        continue;
 
-               /*
-                * This can only trigger when the entry at index 0 moves out
-                * of or back to the root: none yet gotten, safe to restart.
-                */
-               if (radix_tree_deref_retry(page)) {
-                       WARN_ON(start | i);
-                       goto restart;
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page)) {
+                               /*
+                                * Transient condition which can only trigger
+                                * when entry at index 0 moves out of or back
+                                * to root: none yet gotten, safe to restart.
+                                */
+                               WARN_ON(start | i);
+                               goto restart;
+                       }
+                       /*
+                        * Otherwise, shmem/tmpfs must be storing a swap entry
+                        * here as an exceptional entry: so skip over it -
+                        * we only reach this from invalidate_mapping_pages().
+                        */
+                       continue;
                }
 
                if (!page_cache_get_speculative(page))
@@ -903,7 +904,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
        rcu_read_lock();
 restart:
        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-                               (void ***)pages, index, nr_pages);
+                               (void ***)pages, NULL, index, nr_pages);
        ret = 0;
        for (i = 0; i < nr_found; i++) {
                struct page *page;
@@ -912,12 +913,22 @@ repeat:
                if (unlikely(!page))
                        continue;
 
-               /*
-                * This can only trigger when the entry at index 0 moves out
-                * of or back to the root: none yet gotten, safe to restart.
-                */
-               if (radix_tree_deref_retry(page))
-                       goto restart;
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page)) {
+                               /*
+                                * Transient condition which can only trigger
+                                * when entry at index 0 moves out of or back
+                                * to root: none yet gotten, safe to restart.
+                                */
+                               goto restart;
+                       }
+                       /*
+                        * Otherwise, shmem/tmpfs must be storing a swap entry
+                        * here as an exceptional entry: so stop looking for
+                        * contiguous pages.
+                        */
+                       break;
+               }
 
                if (!page_cache_get_speculative(page))
                        goto repeat;
@@ -977,12 +988,21 @@ repeat:
                if (unlikely(!page))
                        continue;
 
-               /*
-                * This can only trigger when the entry at index 0 moves out
-                * of or back to the root: none yet gotten, safe to restart.
-                */
-               if (radix_tree_deref_retry(page))
-                       goto restart;
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page)) {
+                               /*
+                                * Transient condition which can only trigger
+                                * when entry at index 0 moves out of or back
+                                * to root: none yet gotten, safe to restart.
+                                */
+                               goto restart;
+                       }
+                       /*
+                        * This function is never used on a shmem/tmpfs
+                        * mapping, so a swap entry won't be found here.
+                        */
+                       BUG();
+               }
 
                if (!page_cache_get_speculative(page))
                        goto repeat;
index 5f84d23..f4ec4e7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,7 +35,6 @@
 #include <linux/limits.h>
 #include <linux/mutex.h>
 #include <linux/rbtree.h>
-#include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -2873,30 +2872,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
                return 0;
        if (PageCompound(page))
                return 0;
-       /*
-        * Corner case handling. This is called from add_to_page_cache()
-        * in usual. But some FS (shmem) precharges this page before calling it
-        * and call add_to_page_cache() with GFP_NOWAIT.
-        *
-        * For GFP_NOWAIT case, the page may be pre-charged before calling
-        * add_to_page_cache(). (See shmem.c) check it here and avoid to call
-        * charge twice. (It works but has to pay a bit larger cost.)
-        * And when the page is SwapCache, it should take swap information
-        * into account. This is under lock_page() now.
-        */
-       if (!(gfp_mask & __GFP_WAIT)) {
-               struct page_cgroup *pc;
-
-               pc = lookup_page_cgroup(page);
-               if (!pc)
-                       return 0;
-               lock_page_cgroup(pc);
-               if (PageCgroupUsed(pc)) {
-                       unlock_page_cgroup(pc);
-                       return 0;
-               }
-               unlock_page_cgroup(pc);
-       }
 
        if (unlikely(!mm))
                mm = &init_mm;
@@ -3486,31 +3461,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
        cgroup_release_and_wakeup_rmdir(&mem->css);
 }
 
-/*
- * A call to try to shrink memory usage on charge failure at shmem's swapin.
- * Calling hierarchical_reclaim is not enough because we should update
- * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
- * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
- * not from the memcg which this page would be charged to.
- * try_charge_swapin does all of these works properly.
- */
-int mem_cgroup_shmem_charge_fallback(struct page *page,
-                           struct mm_struct *mm,
-                           gfp_t gfp_mask)
-{
-       struct mem_cgroup *mem;
-       int ret;
-
-       if (mem_cgroup_disabled())
-               return 0;
-
-       ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
-       if (!ret)
-               mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
-
-       return ret;
-}
-
 #ifdef CONFIG_DEBUG_VM
 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
 {
@@ -5330,15 +5280,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
                pgoff = pte_to_pgoff(ptent);
 
        /* page is moved even if it's not RSS of this task(page-faulted). */
-       if (!mapping_cap_swap_backed(mapping)) { /* normal file */
-               page = find_get_page(mapping, pgoff);
-       } else { /* shmem/tmpfs file. we should take account of swap too. */
-               swp_entry_t ent;
-               mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
+       page = find_get_page(mapping, pgoff);
+
+#ifdef CONFIG_SWAP
+       /* shmem/tmpfs may report page out on swap: account for that too. */
+       if (radix_tree_exceptional_entry(page)) {
+               swp_entry_t swap = radix_to_swp_entry(page);
                if (do_swap_account)
-                       entry->val = ent.val;
+                       *entry = swap;
+               page = find_get_page(&swapper_space, swap.val);
        }
-
+#endif
        return page;
 }
 
index 740c4f5..2b43ba0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -53,6 +53,7 @@
 #include <linux/hugetlb.h>
 #include <linux/memory_hotplug.h>
 #include <linux/mm_inline.h>
+#include <linux/kfifo.h>
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1178,6 +1179,97 @@ void memory_failure(unsigned long pfn, int trapno)
        __memory_failure(pfn, trapno, 0);
 }
 
+#define MEMORY_FAILURE_FIFO_ORDER      4
+#define MEMORY_FAILURE_FIFO_SIZE       (1 << MEMORY_FAILURE_FIFO_ORDER)
+
+struct memory_failure_entry {
+       unsigned long pfn;
+       int trapno;
+       int flags;
+};
+
+struct memory_failure_cpu {
+       DECLARE_KFIFO(fifo, struct memory_failure_entry,
+                     MEMORY_FAILURE_FIFO_SIZE);
+       spinlock_t lock;
+       struct work_struct work;
+};
+
+static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
+
+/**
+ * memory_failure_queue - Schedule handling memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @trapno: Trap number reported in the signal to user space.
+ * @flags: Flags for memory failure handling
+ *
+ * This function is called by the low level hardware error handler
+ * when it detects hardware memory corruption of a page. It schedules
+ * recovery of the error page, including dropping pages, killing
+ * processes, etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Can run in IRQ context.
+ */
+void memory_failure_queue(unsigned long pfn, int trapno, int flags)
+{
+       struct memory_failure_cpu *mf_cpu;
+       unsigned long proc_flags;
+       struct memory_failure_entry entry = {
+               .pfn =          pfn,
+               .trapno =       trapno,
+               .flags =        flags,
+       };
+
+       mf_cpu = &get_cpu_var(memory_failure_cpu);
+       spin_lock_irqsave(&mf_cpu->lock, proc_flags);
+       if (kfifo_put(&mf_cpu->fifo, &entry))
+               schedule_work_on(smp_processor_id(), &mf_cpu->work);
+       else
+               pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
+                      pfn);
+       spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
+       put_cpu_var(memory_failure_cpu);
+}
+EXPORT_SYMBOL_GPL(memory_failure_queue);
+
+static void memory_failure_work_func(struct work_struct *work)
+{
+       struct memory_failure_cpu *mf_cpu;
+       struct memory_failure_entry entry = { 0, };
+       unsigned long proc_flags;
+       int gotten;
+
+       mf_cpu = &__get_cpu_var(memory_failure_cpu);
+       for (;;) {
+               spin_lock_irqsave(&mf_cpu->lock, proc_flags);
+               gotten = kfifo_get(&mf_cpu->fifo, &entry);
+               spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
+               if (!gotten)
+                       break;
+               __memory_failure(entry.pfn, entry.trapno, entry.flags);
+       }
+}
+
+static int __init memory_failure_init(void)
+{
+       struct memory_failure_cpu *mf_cpu;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               mf_cpu = &per_cpu(memory_failure_cpu, cpu);
+               spin_lock_init(&mf_cpu->lock);
+               INIT_KFIFO(mf_cpu->fifo);
+               INIT_WORK(&mf_cpu->work, memory_failure_work_func);
+       }
+
+       return 0;
+}
+core_initcall(memory_failure_init);
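
A sketch of the intended caller (the handler name is hypothetical): an interrupt-context hardware error handler cannot run __memory_failure() directly, since recovery may sleep, so it queues the pfn for the per-cpu work item instead:

/* hypothetical low-level error handler, running in IRQ context */
static void my_hwerr_report_page(unsigned long pfn, int trapno)
{
        /* recovery (page dropping, process killing) runs later from the
         * per-cpu work item scheduled by memory_failure_queue() */
        memory_failure_queue(pfn, trapno, 0);
}
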
+
 /**
  * unpoison_memory - Unpoison a previously poisoned page
  * @pfn: Page number of the to be unpoisoned page
index a4e6b9d..636a868 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -69,12 +69,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
         * file will not get a swp_entry_t in its pte, but rather it is like
         * any other file mapping (ie. marked !present and faulted in with
         * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
-        *
-        * However when tmpfs moves the page from pagecache and into swapcache,
-        * it is still in core, but the find_get_page below won't find it.
-        * No big deal, but make a note of it.
         */
        page = find_get_page(mapping, pgoff);
+#ifdef CONFIG_SWAP
+       /* shmem/tmpfs may return swap: account for swapcache page too. */
+       if (radix_tree_exceptional_entry(page)) {
+               swp_entry_t swap = radix_to_swp_entry(page);
+               page = find_get_page(&swapper_space, swap.val);
+       }
+#endif
        if (page) {
                present = PageUptodate(page);
                page_cache_release(page);
index 1dbcf88..6e8ecb6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1409,14 +1409,11 @@ static int __init fail_page_alloc_debugfs(void)
 {
        mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
        struct dentry *dir;
-       int err;
 
-       err = init_fault_attr_dentries(&fail_page_alloc.attr,
-                                      "fail_page_alloc");
-       if (err)
-               return err;
-
-       dir = fail_page_alloc.attr.dir;
+       dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
+                                       &fail_page_alloc.attr);
+       if (IS_ERR(dir))
+               return PTR_ERR(dir);
 
        if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
                                &fail_page_alloc.ignore_gfp_wait))
@@ -1430,7 +1427,7 @@ static int __init fail_page_alloc_debugfs(void)
 
        return 0;
 fail:
-       cleanup_fault_attr_dentries(&fail_page_alloc.attr);
+       debugfs_remove_recursive(dir);
 
        return -ENOMEM;
 }
index 5cc21f8..32f6763 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -6,7 +6,8 @@
  *              2000-2001 Christoph Rohland
  *              2000-2001 SAP AG
  *              2002 Red Hat Inc.
- * Copyright (C) 2002-2005 Hugh Dickins.
+ * Copyright (C) 2002-2011 Hugh Dickins.
+ * Copyright (C) 2011 Google Inc.
  * Copyright (C) 2002-2005 VERITAS Software Corporation.
  * Copyright (C) 2004 Andi Kleen, SuSE Labs
  *
@@ -28,7 +29,6 @@
 #include <linux/file.h>
 #include <linux/mm.h>
 #include <linux/module.h>
-#include <linux/percpu_counter.h>
 #include <linux/swap.h>
 
 static struct vfsmount *shm_mnt;
@@ -51,6 +51,8 @@ static struct vfsmount *shm_mnt;
 #include <linux/shmem_fs.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
+#include <linux/pagevec.h>
+#include <linux/percpu_counter.h>
 #include <linux/splice.h>
 #include <linux/security.h>
 #include <linux/swapops.h>
@@ -63,43 +65,17 @@ static struct vfsmount *shm_mnt;
 #include <linux/magic.h>
 
 #include <asm/uaccess.h>
-#include <asm/div64.h>
 #include <asm/pgtable.h>
 
-/*
- * The maximum size of a shmem/tmpfs file is limited by the maximum size of
- * its triple-indirect swap vector - see illustration at shmem_swp_entry().
- *
- * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
- * but one eighth of that on a 64-bit kernel.  With 8kB page size, maximum
- * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
- * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
- *
- * We use / and * instead of shifts in the definitions below, so that the swap
- * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
- */
-#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
-#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
-
-#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
-#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)
-
-#define SHMEM_MAX_BYTES  min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
-#define SHMEM_MAX_INDEX  ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
-
 #define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
 #define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
 
-/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
-#define SHMEM_PAGEIN    VM_READ
-#define SHMEM_TRUNCATE  VM_WRITE
-
-/* Definition to limit shmem_truncate's steps between cond_rescheds */
-#define LATENCY_LIMIT   64
-
 /* Pretend that each entry is of this size in directory's i_size */
 #define BOGO_DIRENT_SIZE 20
 
+/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
+#define SHORT_SYMLINK_LEN 128
+
 struct shmem_xattr {
        struct list_head list;  /* anchored by shmem_inode_info->xattr_list */
        char *name;             /* xattr name */
@@ -107,7 +83,7 @@ struct shmem_xattr {
        char value[0];
 };
 
-/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
+/* Flag allocation requirements to shmem_getpage */
 enum sgp_type {
        SGP_READ,       /* don't exceed i_size, don't allocate page */
        SGP_CACHE,      /* don't exceed i_size, may allocate page */
@@ -137,56 +113,6 @@ static inline int shmem_getpage(struct inode *inode, pgoff_t index,
                        mapping_gfp_mask(inode->i_mapping), fault_type);
 }
 
-static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
-{
-       /*
-        * The above definition of ENTRIES_PER_PAGE, and the use of
-        * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
-        * might be reconsidered if it ever diverges from PAGE_SIZE.
-        *
-        * Mobility flags are masked out as swap vectors cannot move
-        */
-       return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
-                               PAGE_CACHE_SHIFT-PAGE_SHIFT);
-}
-
-static inline void shmem_dir_free(struct page *page)
-{
-       __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
-}
-
-static struct page **shmem_dir_map(struct page *page)
-{
-       return (struct page **)kmap_atomic(page, KM_USER0);
-}
-
-static inline void shmem_dir_unmap(struct page **dir)
-{
-       kunmap_atomic(dir, KM_USER0);
-}
-
-static swp_entry_t *shmem_swp_map(struct page *page)
-{
-       return (swp_entry_t *)kmap_atomic(page, KM_USER1);
-}
-
-static inline void shmem_swp_balance_unmap(void)
-{
-       /*
-        * When passing a pointer to an i_direct entry, to code which
-        * also handles indirect entries and so will shmem_swp_unmap,
-        * we must arrange for the preempt count to remain in balance.
-        * What kmap_atomic of a lowmem page does depends on config
-        * and architecture, so pretend to kmap_atomic some lowmem page.
-        */
-       (void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
-}
-
-static inline void shmem_swp_unmap(swp_entry_t *entry)
-{
-       kunmap_atomic(entry, KM_USER1);
-}
-
 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
 {
        return sb->s_fs_info;
@@ -244,15 +170,6 @@ static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
 static LIST_HEAD(shmem_swaplist);
 static DEFINE_MUTEX(shmem_swaplist_mutex);
 
-static void shmem_free_blocks(struct inode *inode, long pages)
-{
-       struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
-       if (sbinfo->max_blocks) {
-               percpu_counter_add(&sbinfo->used_blocks, -pages);
-               inode->i_blocks -= pages*BLOCKS_PER_PAGE;
-       }
-}
-
 static int shmem_reserve_inode(struct super_block *sb)
 {
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
@@ -279,7 +196,7 @@ static void shmem_free_inode(struct super_block *sb)
 }
 
 /**
- * shmem_recalc_inode - recalculate the size of an inode
+ * shmem_recalc_inode - recalculate the block usage of an inode
  * @inode: inode to recalc
  *
  * We have to calculate the free blocks since the mm can drop
@@ -297,474 +214,297 @@ static void shmem_recalc_inode(struct inode *inode)
 
        freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
        if (freed > 0) {
+               struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+               if (sbinfo->max_blocks)
+                       percpu_counter_add(&sbinfo->used_blocks, -freed);
                info->alloced -= freed;
+               inode->i_blocks -= freed * BLOCKS_PER_PAGE;
                shmem_unacct_blocks(info->flags, freed);
-               shmem_free_blocks(inode, freed);
        }
 }
 
-/**
- * shmem_swp_entry - find the swap vector position in the info structure
- * @info:  info structure for the inode
- * @index: index of the page to find
- * @page:  optional page to add to the structure. Has to be preset to
- *         all zeros
- *
- * If there is no space allocated yet it will return NULL when
- * page is NULL, else it will use the page for the needed block,
- * setting it to NULL on return to indicate that it has been used.
- *
- * The swap vector is organized the following way:
- *
- * There are SHMEM_NR_DIRECT entries directly stored in the
- * shmem_inode_info structure. So small files do not need an addional
- * allocation.
- *
- * For pages with index > SHMEM_NR_DIRECT there is the pointer
- * i_indirect which points to a page which holds in the first half
- * doubly indirect blocks, in the second half triple indirect blocks:
- *
- * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
- * following layout (for SHMEM_NR_DIRECT == 16):
- *
- * i_indirect -> dir --> 16-19
- *           |      +-> 20-23
- *           |
- *           +-->dir2 --> 24-27
- *           |        +-> 28-31
- *           |        +-> 32-35
- *           |        +-> 36-39
- *           |
- *           +-->dir3 --> 40-43
- *                    +-> 44-47
- *                    +-> 48-51
- *                    +-> 52-55
+/*
+ * Replace item expected in radix tree by a new item, while holding tree lock.
  */
-static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
-{
-       unsigned long offset;
-       struct page **dir;
-       struct page *subdir;
-
-       if (index < SHMEM_NR_DIRECT) {
-               shmem_swp_balance_unmap();
-               return info->i_direct+index;
-       }
-       if (!info->i_indirect) {
-               if (page) {
-                       info->i_indirect = *page;
-                       *page = NULL;
-               }
-               return NULL;                    /* need another page */
-       }
-
-       index -= SHMEM_NR_DIRECT;
-       offset = index % ENTRIES_PER_PAGE;
-       index /= ENTRIES_PER_PAGE;
-       dir = shmem_dir_map(info->i_indirect);
-
-       if (index >= ENTRIES_PER_PAGE/2) {
-               index -= ENTRIES_PER_PAGE/2;
-               dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
-               index %= ENTRIES_PER_PAGE;
-               subdir = *dir;
-               if (!subdir) {
-                       if (page) {
-                               *dir = *page;
-                               *page = NULL;
-                       }
-                       shmem_dir_unmap(dir);
-                       return NULL;            /* need another page */
-               }
-               shmem_dir_unmap(dir);
-               dir = shmem_dir_map(subdir);
-       }
+static int shmem_radix_tree_replace(struct address_space *mapping,
+                       pgoff_t index, void *expected, void *replacement)
+{
+       void **pslot;
+       void *item = NULL;
+
+       VM_BUG_ON(!expected);
+       pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
+       if (pslot)
+               item = radix_tree_deref_slot_protected(pslot,
+                                                       &mapping->tree_lock);
+       if (item != expected)
+               return -ENOENT;
+       if (replacement)
+               radix_tree_replace_slot(pslot, replacement);
+       else
+               radix_tree_delete(&mapping->page_tree, index);
+       return 0;
+}
 
-       dir += index;
-       subdir = *dir;
-       if (!subdir) {
-               if (!page || !(subdir = *page)) {
-                       shmem_dir_unmap(dir);
-                       return NULL;            /* need a page */
+/*
+ * Like add_to_page_cache_locked, but fails with -ENOENT if the expected
+ * item has gone.
+ */
+static int shmem_add_to_page_cache(struct page *page,
+                                  struct address_space *mapping,
+                                  pgoff_t index, gfp_t gfp, void *expected)
+{
+       int error = 0;
+
+       VM_BUG_ON(!PageLocked(page));
+       VM_BUG_ON(!PageSwapBacked(page));
+
+       if (!expected)
+               error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+       if (!error) {
+               page_cache_get(page);
+               page->mapping = mapping;
+               page->index = index;
+
+               spin_lock_irq(&mapping->tree_lock);
+               if (!expected)
+                       error = radix_tree_insert(&mapping->page_tree,
+                                                       index, page);
+               else
+                       error = shmem_radix_tree_replace(mapping, index,
+                                                       expected, page);
+               if (!error) {
+                       mapping->nrpages++;
+                       __inc_zone_page_state(page, NR_FILE_PAGES);
+                       __inc_zone_page_state(page, NR_SHMEM);
+                       spin_unlock_irq(&mapping->tree_lock);
+               } else {
+                       page->mapping = NULL;
+                       spin_unlock_irq(&mapping->tree_lock);
+                       page_cache_release(page);
                }
-               *dir = subdir;
-               *page = NULL;
+               if (!expected)
+                       radix_tree_preload_end();
        }
-       shmem_dir_unmap(dir);
-       return shmem_swp_map(subdir) + offset;
+       if (error)
+               mem_cgroup_uncharge_cache_page(page);
+       return error;
 }
 
-static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
+/*
+ * Like delete_from_page_cache, but substitutes swap for page.
+ */
+static void shmem_delete_from_page_cache(struct page *page, void *radswap)
 {
-       long incdec = value? 1: -1;
+       struct address_space *mapping = page->mapping;
+       int error;
 
-       entry->val = value;
-       info->swapped += incdec;
-       if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
-               struct page *page = kmap_atomic_to_page(entry);
-               set_page_private(page, page_private(page) + incdec);
-       }
+       spin_lock_irq(&mapping->tree_lock);
+       error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
+       page->mapping = NULL;
+       mapping->nrpages--;
+       __dec_zone_page_state(page, NR_FILE_PAGES);
+       __dec_zone_page_state(page, NR_SHMEM);
+       spin_unlock_irq(&mapping->tree_lock);
+       page_cache_release(page);
+       BUG_ON(error);
 }
 
-/**
- * shmem_swp_alloc - get the position of the swap entry for the page.
- * @info:      info structure for the inode
- * @index:     index of the page to find
- * @sgp:       check and recheck i_size? skip allocation?
- * @gfp:       gfp mask to use for any page allocation
- *
- * If the entry does not exist, allocate it.
+/*
+ * Like find_get_pages, but collecting swap entries as well as pages.
  */
-static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info,
-                       unsigned long index, enum sgp_type sgp, gfp_t gfp)
-{
-       struct inode *inode = &info->vfs_inode;
-       struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
-       struct page *page = NULL;
-       swp_entry_t *entry;
-
-       if (sgp != SGP_WRITE &&
-           ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
-               return ERR_PTR(-EINVAL);
-
-       while (!(entry = shmem_swp_entry(info, index, &page))) {
-               if (sgp == SGP_READ)
-                       return shmem_swp_map(ZERO_PAGE(0));
-               /*
-                * Test used_blocks against 1 less max_blocks, since we have 1 data
-                * page (and perhaps indirect index pages) yet to allocate:
-                * a waste to allocate index if we cannot allocate data.
-                */
-               if (sbinfo->max_blocks) {
-                       if (percpu_counter_compare(&sbinfo->used_blocks,
-                                               sbinfo->max_blocks - 1) >= 0)
-                               return ERR_PTR(-ENOSPC);
-                       percpu_counter_inc(&sbinfo->used_blocks);
-                       inode->i_blocks += BLOCKS_PER_PAGE;
+static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
+                                       pgoff_t start, unsigned int nr_pages,
+                                       struct page **pages, pgoff_t *indices)
+{
+       unsigned int i;
+       unsigned int ret;
+       unsigned int nr_found;
+
+       rcu_read_lock();
+restart:
+       nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+                               (void ***)pages, indices, start, nr_pages);
+       ret = 0;
+       for (i = 0; i < nr_found; i++) {
+               struct page *page;
+repeat:
+               page = radix_tree_deref_slot((void **)pages[i]);
+               if (unlikely(!page))
+                       continue;
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page))
+                               goto restart;
+                       /*
+                        * Otherwise, we must be storing a swap entry
+                        * here as an exceptional entry: so return it
+                        * without attempting to raise page count.
+                        */
+                       goto export;
                }
+               if (!page_cache_get_speculative(page))
+                       goto repeat;
 
-               spin_unlock(&info->lock);
-               page = shmem_dir_alloc(gfp);
-               spin_lock(&info->lock);
-
-               if (!page) {
-                       shmem_free_blocks(inode, 1);
-                       return ERR_PTR(-ENOMEM);
-               }
-               if (sgp != SGP_WRITE &&
-                   ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
-                       entry = ERR_PTR(-EINVAL);
-                       break;
+               /* Has the page moved? */
+               if (unlikely(page != *((void **)pages[i]))) {
+                       page_cache_release(page);
+                       goto repeat;
                }
-               if (info->next_index <= index)
-                       info->next_index = index + 1;
-       }
-       if (page) {
-               /* another task gave its page, or truncated the file */
-               shmem_free_blocks(inode, 1);
-               shmem_dir_free(page);
-       }
-       if (info->next_index <= index && !IS_ERR(entry))
-               info->next_index = index + 1;
-       return entry;
+export:
+               indices[ret] = indices[i];
+               pages[ret] = page;
+               ret++;
+       }
+       if (unlikely(!ret && nr_found))
+               goto restart;
+       rcu_read_unlock();
+       return ret;
 }
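
Everything above hinges on one trick: a radix-tree slot can hold either a page pointer or a swap entry, told apart by a low tag bit, which is why one gang lookup can hand back both kinds in a single pass (retrying under RCU when a slot moves). Below is a minimal user-space sketch of that tagging; the constants and names (swap_to_entry and friends) are illustrative stand-ins, not the kernel's swp_to_radix_entry()/radix_to_swp_entry() themselves.

#include <stdio.h>
#include <stdint.h>

/*
 * A slot holds either a real pointer (aligned, low bits clear) or a
 * small integer shifted up and tagged in a low bit.  The shift and tag
 * values here are illustrative.
 */
#define TAG_EXCEPTIONAL 0x2UL

static void *swap_to_entry(unsigned long swp_val)
{
        return (void *)((swp_val << 2) | TAG_EXCEPTIONAL);
}

static int entry_is_swap(const void *entry)
{
        return ((uintptr_t)entry & TAG_EXCEPTIONAL) != 0;
}

static unsigned long entry_to_swap(const void *entry)
{
        return (uintptr_t)entry >> 2;
}

int main(void)
{
        int page = 0;                   /* stand-in for a struct page */
        void *slots[2] = { &page, swap_to_entry(42) };

        for (int i = 0; i < 2; i++) {
                if (entry_is_swap(slots[i]))
                        printf("slot %d: swap entry %lu\n",
                               i, entry_to_swap(slots[i]));
                else
                        printf("slot %d: page pointer\n", i);
        }
        return 0;
}
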
 
-/**
- * shmem_free_swp - free some swap entries in a directory
- * @dir:        pointer to the directory
- * @edir:       pointer after last entry of the directory
- * @punch_lock: pointer to spinlock when needed for the holepunch case
+/*
+ * Remove swap entry from radix tree, free the swap and its page cache.
  */
-static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir,
-                                               spinlock_t *punch_lock)
-{
-       spinlock_t *punch_unlock = NULL;
-       swp_entry_t *ptr;
-       int freed = 0;
-
-       for (ptr = dir; ptr < edir; ptr++) {
-               if (ptr->val) {
-                       if (unlikely(punch_lock)) {
-                               punch_unlock = punch_lock;
-                               punch_lock = NULL;
-                               spin_lock(punch_unlock);
-                               if (!ptr->val)
-                                       continue;
-                       }
-                       free_swap_and_cache(*ptr);
-                       *ptr = (swp_entry_t){0};
-                       freed++;
-               }
-       }
-       if (punch_unlock)
-               spin_unlock(punch_unlock);
-       return freed;
-}
-
-static int shmem_map_and_free_swp(struct page *subdir, int offset,
-               int limit, struct page ***dir, spinlock_t *punch_lock)
-{
-       swp_entry_t *ptr;
-       int freed = 0;
-
-       ptr = shmem_swp_map(subdir);
-       for (; offset < limit; offset += LATENCY_LIMIT) {
-               int size = limit - offset;
-               if (size > LATENCY_LIMIT)
-                       size = LATENCY_LIMIT;
-               freed += shmem_free_swp(ptr+offset, ptr+offset+size,
-                                                       punch_lock);
-               if (need_resched()) {
-                       shmem_swp_unmap(ptr);
-                       if (*dir) {
-                               shmem_dir_unmap(*dir);
-                               *dir = NULL;
-                       }
-                       cond_resched();
-                       ptr = shmem_swp_map(subdir);
-               }
-       }
-       shmem_swp_unmap(ptr);
-       return freed;
+static int shmem_free_swap(struct address_space *mapping,
+                          pgoff_t index, void *radswap)
+{
+       int error;
+
+       spin_lock_irq(&mapping->tree_lock);
+       error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
+       spin_unlock_irq(&mapping->tree_lock);
+       if (!error)
+               free_swap_and_cache(radix_to_swp_entry(radswap));
+       return error;
 }
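
shmem_free_swap() only frees the swap entry when shmem_radix_tree_replace() confirms the slot still held radswap, so a racing truncation cannot free it twice. A sketch of that replace-if-unchanged discipline, with a pthread mutex standing in for mapping->tree_lock and hypothetical names throughout:

#include <stddef.h>
#include <errno.h>
#include <pthread.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

/* Replace *slot with new only if it still holds expected. */
static int replace_if_unchanged(void **slot, void *expected, void *new)
{
        int error = 0;

        pthread_mutex_lock(&tree_lock);
        if (*slot == expected)
                *slot = new;
        else
                error = -ENOENT;        /* someone else got there first */
        pthread_mutex_unlock(&tree_lock);
        return error;
}

int main(void)
{
        void *slot = (void *)0x1;
        /* succeeds: slot unchanged, so the caller may free the resource */
        return replace_if_unchanged(&slot, (void *)0x1, NULL);
}
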
 
-static void shmem_free_pages(struct list_head *next)
+/*
+ * Pagevec may contain swap entries, so shuffle up pages before releasing.
+ */
+static void shmem_pagevec_release(struct pagevec *pvec)
 {
-       struct page *page;
-       int freed = 0;
-
-       do {
-               page = container_of(next, struct page, lru);
-               next = next->next;
-               shmem_dir_free(page);
-               freed++;
-               if (freed >= LATENCY_LIMIT) {
-                       cond_resched();
-                       freed = 0;
-               }
-       } while (next);
+       int i, j;
+
+       for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
+               struct page *page = pvec->pages[i];
+               if (!radix_tree_exceptional_entry(page))
+                       pvec->pages[j++] = page;
+       }
+       pvec->nr = j;
+       pagevec_release(pvec);
 }
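
The shuffle in shmem_pagevec_release() is plain in-place compaction: walk with one index, write with another, then shrink the count. The same pattern over an int array, for illustration:

#include <stdio.h>

/* stand-in for !radix_tree_exceptional_entry() */
static int keep(int v)
{
        return v % 2 == 0;
}

/* Keep only accepted entries, preserving order; return the new count. */
static int compact(int *a, int n)
{
        int i, j;

        for (i = 0, j = 0; i < n; i++)
                if (keep(a[i]))
                        a[j++] = a[i];
        return j;               /* like pvec->nr after the shuffle */
}

int main(void)
{
        int a[] = { 2, 3, 4, 5, 6 };
        int n = compact(a, 5);

        for (int i = 0; i < n; i++)
                printf("%d ", a[i]);    /* prints: 2 4 6 */
        printf("\n");
        return 0;
}
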
 
-void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
+/*
+ * Remove range of pages and swap entries from radix tree, and free them.
+ */
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 {
+       struct address_space *mapping = inode->i_mapping;
        struct shmem_inode_info *info = SHMEM_I(inode);
-       unsigned long idx;
-       unsigned long size;
-       unsigned long limit;
-       unsigned long stage;
-       unsigned long diroff;
-       struct page **dir;
-       struct page *topdir;
-       struct page *middir;
-       struct page *subdir;
-       swp_entry_t *ptr;
-       LIST_HEAD(pages_to_free);
-       long nr_pages_to_free = 0;
+       pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
+       pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
+       struct pagevec pvec;
+       pgoff_t indices[PAGEVEC_SIZE];
        long nr_swaps_freed = 0;
-       int offset;
-       int freed;
-       int punch_hole;
-       spinlock_t *needs_lock;
-       spinlock_t *punch_lock;
-       unsigned long upper_limit;
+       pgoff_t index;
+       int i;
 
-       truncate_inode_pages_range(inode->i_mapping, start, end);
+       BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
 
-       inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-       idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-       if (idx >= info->next_index)
-               return;
+       pagevec_init(&pvec, 0);
+       index = start;
+       while (index <= end) {
+               pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+                       min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+                                                       pvec.pages, indices);
+               if (!pvec.nr)
+                       break;
+               mem_cgroup_uncharge_start();
+               for (i = 0; i < pagevec_count(&pvec); i++) {
+                       struct page *page = pvec.pages[i];
 
-       spin_lock(&info->lock);
-       info->flags |= SHMEM_TRUNCATE;
-       if (likely(end == (loff_t) -1)) {
-               limit = info->next_index;
-               upper_limit = SHMEM_MAX_INDEX;
-               info->next_index = idx;
-               needs_lock = NULL;
-               punch_hole = 0;
-       } else {
-               if (end + 1 >= inode->i_size) { /* we may free a little more */
-                       limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
-                                                       PAGE_CACHE_SHIFT;
-                       upper_limit = SHMEM_MAX_INDEX;
-               } else {
-                       limit = (end + 1) >> PAGE_CACHE_SHIFT;
-                       upper_limit = limit;
-               }
-               needs_lock = &info->lock;
-               punch_hole = 1;
-       }
+                       index = indices[i];
+                       if (index > end)
+                               break;
+
+                       if (radix_tree_exceptional_entry(page)) {
+                               nr_swaps_freed += !shmem_free_swap(mapping,
+                                                               index, page);
+                               continue;
+                       }
 
-       topdir = info->i_indirect;
-       if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
-               info->i_indirect = NULL;
-               nr_pages_to_free++;
-               list_add(&topdir->lru, &pages_to_free);
+                       if (!trylock_page(page))
+                               continue;
+                       if (page->mapping == mapping) {
+                               VM_BUG_ON(PageWriteback(page));
+                               truncate_inode_page(mapping, page);
+                       }
+                       unlock_page(page);
+               }
+               shmem_pagevec_release(&pvec);
+               mem_cgroup_uncharge_end();
+               cond_resched();
+               index++;
        }
-       spin_unlock(&info->lock);
 
-       if (info->swapped && idx < SHMEM_NR_DIRECT) {
-               ptr = info->i_direct;
-               size = limit;
-               if (size > SHMEM_NR_DIRECT)
-                       size = SHMEM_NR_DIRECT;
-               nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock);
+       if (partial) {
+               struct page *page = NULL;
+               shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
+               if (page) {
+                       zero_user_segment(page, partial, PAGE_CACHE_SIZE);
+                       set_page_dirty(page);
+                       unlock_page(page);
+                       page_cache_release(page);
+               }
        }
 
-       /*
-        * If there are no indirect blocks or we are punching a hole
-        * below indirect blocks, nothing to be done.
-        */
-       if (!topdir || limit <= SHMEM_NR_DIRECT)
-               goto done2;
+       index = start;
+       for ( ; ; ) {
+               cond_resched();
+               pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+                       min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+                                                       pvec.pages, indices);
+               if (!pvec.nr) {
+                       if (index == start)
+                               break;
+                       index = start;
+                       continue;
+               }
+               if (index == start && indices[0] > end) {
+                       shmem_pagevec_release(&pvec);
+                       break;
+               }
+               mem_cgroup_uncharge_start();
+               for (i = 0; i < pagevec_count(&pvec); i++) {
+                       struct page *page = pvec.pages[i];
 
-       /*
-        * The truncation case has already dropped info->lock, and we're safe
-        * because i_size and next_index have already been lowered, preventing
-        * access beyond.  But in the punch_hole case, we still need to take
-        * the lock when updating the swap directory, because there might be
-        * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
-        * shmem_writepage.  However, whenever we find we can remove a whole
-        * directory page (not at the misaligned start or end of the range),
-        * we first NULLify its pointer in the level above, and then have no
-        * need to take the lock when updating its contents: needs_lock and
-        * punch_lock (either pointing to info->lock or NULL) manage this.
-        */
+                       index = indices[i];
+                       if (index > end)
+                               break;
 
-       upper_limit -= SHMEM_NR_DIRECT;
-       limit -= SHMEM_NR_DIRECT;
-       idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
-       offset = idx % ENTRIES_PER_PAGE;
-       idx -= offset;
-
-       dir = shmem_dir_map(topdir);
-       stage = ENTRIES_PER_PAGEPAGE/2;
-       if (idx < ENTRIES_PER_PAGEPAGE/2) {
-               middir = topdir;
-               diroff = idx/ENTRIES_PER_PAGE;
-       } else {
-               dir += ENTRIES_PER_PAGE/2;
-               dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
-               while (stage <= idx)
-                       stage += ENTRIES_PER_PAGEPAGE;
-               middir = *dir;
-               if (*dir) {
-                       diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
-                               ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
-                       if (!diroff && !offset && upper_limit >= stage) {
-                               if (needs_lock) {
-                                       spin_lock(needs_lock);
-                                       *dir = NULL;
-                                       spin_unlock(needs_lock);
-                                       needs_lock = NULL;
-                               } else
-                                       *dir = NULL;
-                               nr_pages_to_free++;
-                               list_add(&middir->lru, &pages_to_free);
+                       if (radix_tree_exceptional_entry(page)) {
+                               nr_swaps_freed += !shmem_free_swap(mapping,
+                                                               index, page);
+                               continue;
                        }
-                       shmem_dir_unmap(dir);
-                       dir = shmem_dir_map(middir);
-               } else {
-                       diroff = 0;
-                       offset = 0;
-                       idx = stage;
-               }
-       }
 
-       for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
-               if (unlikely(idx == stage)) {
-                       shmem_dir_unmap(dir);
-                       dir = shmem_dir_map(topdir) +
-                           ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
-                       while (!*dir) {
-                               dir++;
-                               idx += ENTRIES_PER_PAGEPAGE;
-                               if (idx >= limit)
-                                       goto done1;
-                       }
-                       stage = idx + ENTRIES_PER_PAGEPAGE;
-                       middir = *dir;
-                       if (punch_hole)
-                               needs_lock = &info->lock;
-                       if (upper_limit >= stage) {
-                               if (needs_lock) {
-                                       spin_lock(needs_lock);
-                                       *dir = NULL;
-                                       spin_unlock(needs_lock);
-                                       needs_lock = NULL;
-                               } else
-                                       *dir = NULL;
-                               nr_pages_to_free++;
-                               list_add(&middir->lru, &pages_to_free);
+                       lock_page(page);
+                       if (page->mapping == mapping) {
+                               VM_BUG_ON(PageWriteback(page));
+                               truncate_inode_page(mapping, page);
                        }
-                       shmem_dir_unmap(dir);
-                       cond_resched();
-                       dir = shmem_dir_map(middir);
-                       diroff = 0;
-               }
-               punch_lock = needs_lock;
-               subdir = dir[diroff];
-               if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
-                       if (needs_lock) {
-                               spin_lock(needs_lock);
-                               dir[diroff] = NULL;
-                               spin_unlock(needs_lock);
-                               punch_lock = NULL;
-                       } else
-                               dir[diroff] = NULL;
-                       nr_pages_to_free++;
-                       list_add(&subdir->lru, &pages_to_free);
-               }
-               if (subdir && page_private(subdir) /* has swap entries */) {
-                       size = limit - idx;
-                       if (size > ENTRIES_PER_PAGE)
-                               size = ENTRIES_PER_PAGE;
-                       freed = shmem_map_and_free_swp(subdir,
-                                       offset, size, &dir, punch_lock);
-                       if (!dir)
-                               dir = shmem_dir_map(middir);
-                       nr_swaps_freed += freed;
-                       if (offset || punch_lock) {
-                               spin_lock(&info->lock);
-                               set_page_private(subdir,
-                                       page_private(subdir) - freed);
-                               spin_unlock(&info->lock);
-                       } else
-                               BUG_ON(page_private(subdir) != freed);
+                       unlock_page(page);
                }
-               offset = 0;
-       }
-done1:
-       shmem_dir_unmap(dir);
-done2:
-       if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
-               /*
-                * Call truncate_inode_pages again: racing shmem_unuse_inode
-                * may have swizzled a page in from swap since
-                * truncate_pagecache or generic_delete_inode did it, before we
-                * lowered next_index.  Also, though shmem_getpage checks
-                * i_size before adding to cache, no recheck after: so fix the
-                * narrow window there too.
-                */
-               truncate_inode_pages_range(inode->i_mapping, start, end);
+               shmem_pagevec_release(&pvec);
+               mem_cgroup_uncharge_end();
+               index++;
        }
 
        spin_lock(&info->lock);
-       info->flags &= ~SHMEM_TRUNCATE;
        info->swapped -= nr_swaps_freed;
-       if (nr_pages_to_free)
-               shmem_free_blocks(inode, nr_pages_to_free);
        shmem_recalc_inode(inode);
        spin_unlock(&info->lock);
 
-       /*
-        * Empty swap vector directory pages to be freed?
-        */
-       if (!list_empty(&pages_to_free)) {
-               pages_to_free.prev->next = NULL;
-               shmem_free_pages(pages_to_free.next);
-       }
+       inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 }
 EXPORT_SYMBOL_GPL(shmem_truncate_range);
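
The byte-to-page arithmetic at the top of shmem_truncate_range() is worth pinning down with numbers. Assuming 4 KiB pages, truncating the inclusive byte range [5000, 20479] removes whole pages 2..4 and zeroes the tail of page 1 from in-page offset 904:

#include <stdio.h>

#define PAGE_SHIFT 12                   /* assume 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
        /* lend is inclusive; the BUG_ON insists it is one below a page boundary */
        unsigned long lstart = 5000, lend = 20479;

        unsigned long start   = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; /* 2 */
        unsigned long partial = lstart & (PAGE_SIZE - 1);               /* 904 */
        unsigned long end     = lend >> PAGE_SHIFT;                     /* 4 */

        printf("whole pages %lu..%lu, zero page %lu from offset %lu\n",
               start, end, start - 1, partial);
        return 0;
}
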
 
@@ -780,37 +520,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
                loff_t oldsize = inode->i_size;
                loff_t newsize = attr->ia_size;
-               struct page *page = NULL;
 
-               if (newsize < oldsize) {
-                       /*
-                        * If truncating down to a partial page, then
-                        * if that page is already allocated, hold it
-                        * in memory until the truncation is over, so
-                        * truncate_partial_page cannot miss it were
-                        * it assigned to swap.
-                        */
-                       if (newsize & (PAGE_CACHE_SIZE-1)) {
-                               (void) shmem_getpage(inode,
-                                       newsize >> PAGE_CACHE_SHIFT,
-                                               &page, SGP_READ, NULL);
-                               if (page)
-                                       unlock_page(page);
-                       }
-                       /*
-                        * Reset SHMEM_PAGEIN flag so that shmem_truncate can
-                        * detect if any pages might have been added to cache
-                        * after truncate_inode_pages.  But we needn't bother
-                        * if it's being fully truncated to zero-length: the
-                        * nrpages check is efficient enough in that case.
-                        */
-                       if (newsize) {
-                               struct shmem_inode_info *info = SHMEM_I(inode);
-                               spin_lock(&info->lock);
-                               info->flags &= ~SHMEM_PAGEIN;
-                               spin_unlock(&info->lock);
-                       }
-               }
                if (newsize != oldsize) {
                        i_size_write(inode, newsize);
                        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -822,8 +532,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
                        /* unmap again to remove racily COWed private pages */
                        unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
                }
-               if (page)
-                       page_cache_release(page);
        }
 
        setattr_copy(inode, attr);
@@ -848,7 +556,8 @@ static void shmem_evict_inode(struct inode *inode)
                        list_del_init(&info->swaplist);
                        mutex_unlock(&shmem_swaplist_mutex);
                }
-       }
+       } else
+               kfree(info->symlink);
 
        list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
                kfree(xattr->name);
@@ -859,106 +568,27 @@ static void shmem_evict_inode(struct inode *inode)
        end_writeback(inode);
 }
 
-static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
-{
-       swp_entry_t *ptr;
-
-       for (ptr = dir; ptr < edir; ptr++) {
-               if (ptr->val == entry.val)
-                       return ptr - dir;
-       }
-       return -1;
-}
-
-static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
+/*
+ * If swap found in inode, free it and move page from swapcache to filecache.
+ */
+static int shmem_unuse_inode(struct shmem_inode_info *info,
+                            swp_entry_t swap, struct page *page)
 {
-       struct address_space *mapping;
-       unsigned long idx;
-       unsigned long size;
-       unsigned long limit;
-       unsigned long stage;
-       struct page **dir;
-       struct page *subdir;
-       swp_entry_t *ptr;
-       int offset;
+       struct address_space *mapping = info->vfs_inode.i_mapping;
+       void *radswap;
+       pgoff_t index;
        int error;
 
-       idx = 0;
-       ptr = info->i_direct;
-       spin_lock(&info->lock);
-       if (!info->swapped) {
-               list_del_init(&info->swaplist);
-               goto lost2;
-       }
-       limit = info->next_index;
-       size = limit;
-       if (size > SHMEM_NR_DIRECT)
-               size = SHMEM_NR_DIRECT;
-       offset = shmem_find_swp(entry, ptr, ptr+size);
-       if (offset >= 0) {
-               shmem_swp_balance_unmap();
-               goto found;
-       }
-       if (!info->i_indirect)
-               goto lost2;
-
-       dir = shmem_dir_map(info->i_indirect);
-       stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
-
-       for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
-               if (unlikely(idx == stage)) {
-                       shmem_dir_unmap(dir-1);
-                       if (cond_resched_lock(&info->lock)) {
-                               /* check it has not been truncated */
-                               if (limit > info->next_index) {
-                                       limit = info->next_index;
-                                       if (idx >= limit)
-                                               goto lost2;
-                               }
-                       }
-                       dir = shmem_dir_map(info->i_indirect) +
-                           ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
-                       while (!*dir) {
-                               dir++;
-                               idx += ENTRIES_PER_PAGEPAGE;
-                               if (idx >= limit)
-                                       goto lost1;
-                       }
-                       stage = idx + ENTRIES_PER_PAGEPAGE;
-                       subdir = *dir;
-                       shmem_dir_unmap(dir);
-                       dir = shmem_dir_map(subdir);
-               }
-               subdir = *dir;
-               if (subdir && page_private(subdir)) {
-                       ptr = shmem_swp_map(subdir);
-                       size = limit - idx;
-                       if (size > ENTRIES_PER_PAGE)
-                               size = ENTRIES_PER_PAGE;
-                       offset = shmem_find_swp(entry, ptr, ptr+size);
-                       shmem_swp_unmap(ptr);
-                       if (offset >= 0) {
-                               shmem_dir_unmap(dir);
-                               ptr = shmem_swp_map(subdir);
-                               goto found;
-                       }
-               }
-       }
-lost1:
-       shmem_dir_unmap(dir-1);
-lost2:
-       spin_unlock(&info->lock);
-       return 0;
-found:
-       idx += offset;
-       ptr += offset;
+       radswap = swp_to_radix_entry(swap);
+       index = radix_tree_locate_item(&mapping->page_tree, radswap);
+       if (index == -1)
+               return 0;
 
        /*
         * Move _head_ to start search for next from here.
         * But be careful: shmem_evict_inode checks list_empty without taking
         * mutex, and there's an instant in list_move_tail when info->swaplist
-        * would appear empty, if it were the only one on shmem_swaplist.  We
-        * could avoid doing it if inode NULL; or use this minor optimization.
+        * would appear empty, if it were the only one on shmem_swaplist.
         */
        if (shmem_swaplist.next != &info->swaplist)
                list_move_tail(&shmem_swaplist, &info->swaplist);
@@ -968,29 +598,34 @@ found:
         * but also to hold up shmem_evict_inode(): so inode cannot be freed
         * beneath us (pagelock doesn't help until the page is in pagecache).
         */
-       mapping = info->vfs_inode.i_mapping;
-       error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
+       error = shmem_add_to_page_cache(page, mapping, index,
+                                               GFP_NOWAIT, radswap);
        /* which does mem_cgroup_uncharge_cache_page on error */
 
        if (error != -ENOMEM) {
+               /*
+                * Truncation and eviction use free_swap_and_cache(), which
+                * only does trylock page: if we raced, best clean up here.
+                */
                delete_from_swap_cache(page);
                set_page_dirty(page);
-               info->flags |= SHMEM_PAGEIN;
-               shmem_swp_set(info, ptr, 0);
-               swap_free(entry);
+               if (!error) {
+                       spin_lock(&info->lock);
+                       info->swapped--;
+                       spin_unlock(&info->lock);
+                       swap_free(swap);
+               }
                error = 1;      /* not an error, but entry was found */
        }
-       shmem_swp_unmap(ptr);
-       spin_unlock(&info->lock);
        return error;
 }
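
shmem_unuse_inode() now asks the tree directly which index holds the swap entry: radix_tree_locate_item() returns the index, or -1 if the entry is gone. A linear-array model of that locate-then-replace flow (the real lookup walks the tree, and the real replacement happens via shmem_add_to_page_cache()):

#include <stdio.h>

/* Find which slot holds a given entry, -1 if none. */
static long locate_item(void **slots, long nr, void *item)
{
        for (long i = 0; i < nr; i++)
                if (slots[i] == item)
                        return i;
        return -1;
}

int main(void)
{
        int page;
        void *swap_entry = (void *)((42UL << 2) | 0x2); /* tagged, as earlier */
        void *slots[4] = { NULL, &page, swap_entry, NULL };

        long index = locate_item(slots, 4, swap_entry);
        if (index != -1) {
                slots[index] = &page;   /* swap entry replaced by the page */
                printf("restored page at index %ld\n", index);
        }
        return 0;
}
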
 
 /*
- * shmem_unuse() search for an eventually swapped out shmem page.
+ * Search through swapped inodes to find and replace swap by page.
  */
-int shmem_unuse(swp_entry_t entry, struct page *page)
+int shmem_unuse(swp_entry_t swap, struct page *page)
 {
-       struct list_head *p, *next;
+       struct list_head *this, *next;
        struct shmem_inode_info *info;
        int found = 0;
        int error;
@@ -999,32 +634,25 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
         * Charge page using GFP_KERNEL while we can wait, before taking
         * the shmem_swaplist_mutex which might hold up shmem_writepage().
         * Charged back to the user (not to caller) when swap account is used.
-        * add_to_page_cache() will be called with GFP_NOWAIT.
         */
        error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
        if (error)
                goto out;
-       /*
-        * Try to preload while we can wait, to not make a habit of
-        * draining atomic reserves; but don't latch on to this cpu,
-        * it's okay if sometimes we get rescheduled after this.
-        */
-       error = radix_tree_preload(GFP_KERNEL);
-       if (error)
-               goto uncharge;
-       radix_tree_preload_end();
+       /* No radix_tree_preload: swap entry keeps a place for page in tree */
 
        mutex_lock(&shmem_swaplist_mutex);
-       list_for_each_safe(p, next, &shmem_swaplist) {
-               info = list_entry(p, struct shmem_inode_info, swaplist);
-               found = shmem_unuse_inode(info, entry, page);
+       list_for_each_safe(this, next, &shmem_swaplist) {
+               info = list_entry(this, struct shmem_inode_info, swaplist);
+               if (info->swapped)
+                       found = shmem_unuse_inode(info, swap, page);
+               else
+                       list_del_init(&info->swaplist);
                cond_resched();
                if (found)
                        break;
        }
        mutex_unlock(&shmem_swaplist_mutex);
 
-uncharge:
        if (!found)
                mem_cgroup_uncharge_cache_page(page);
        if (found < 0)
@@ -1041,10 +669,10 @@ out:
 static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 {
        struct shmem_inode_info *info;
-       swp_entry_t *entry, swap;
        struct address_space *mapping;
-       unsigned long index;
        struct inode *inode;
+       swp_entry_t swap;
+       pgoff_t index;
 
        BUG_ON(!PageLocked(page));
        mapping = page->mapping;
@@ -1073,50 +701,32 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 
        /*
         * Add inode to shmem_unuse()'s list of swapped-out inodes,
-        * if it's not already there.  Do it now because we cannot take
-        * mutex while holding spinlock, and must do so before the page
-        * is moved to swap cache, when its pagelock no longer protects
+        * if it's not already there.  Do it now before the page is
+        * moved to swap cache, when its pagelock no longer protects
         * the inode from eviction.  But don't unlock the mutex until
-        * we've taken the spinlock, because shmem_unuse_inode() will
-        * prune a !swapped inode from the swaplist under both locks.
+        * we've incremented swapped, because shmem_unuse_inode() will
+        * prune a !swapped inode from the swaplist under this mutex.
         */
        mutex_lock(&shmem_swaplist_mutex);
        if (list_empty(&info->swaplist))
                list_add_tail(&info->swaplist, &shmem_swaplist);
 
-       spin_lock(&info->lock);
-       mutex_unlock(&shmem_swaplist_mutex);
-
-       if (index >= info->next_index) {
-               BUG_ON(!(info->flags & SHMEM_TRUNCATE));
-               goto unlock;
-       }
-       entry = shmem_swp_entry(info, index, NULL);
-       if (entry->val) {
-               WARN_ON_ONCE(1);        /* Still happens? Tell us about it! */
-               free_swap_and_cache(*entry);
-               shmem_swp_set(info, entry, 0);
-       }
-       shmem_recalc_inode(inode);
-
        if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
-               delete_from_page_cache(page);
-               shmem_swp_set(info, entry, swap.val);
-               shmem_swp_unmap(entry);
                swap_shmem_alloc(swap);
+               shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
+
+               spin_lock(&info->lock);
+               info->swapped++;
+               shmem_recalc_inode(inode);
                spin_unlock(&info->lock);
+
+               mutex_unlock(&shmem_swaplist_mutex);
                BUG_ON(page_mapped(page));
                swap_writepage(page, wbc);
                return 0;
        }
 
-       shmem_swp_unmap(entry);
-unlock:
-       spin_unlock(&info->lock);
-       /*
-        * add_to_swap_cache() doesn't return -EEXIST, so we can safely
-        * clear SWAP_HAS_CACHE flag.
-        */
+       mutex_unlock(&shmem_swaplist_mutex);
        swapcache_free(swap, NULL);
 redirty:
        set_page_dirty(page);
@@ -1153,35 +763,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 }
 #endif /* CONFIG_TMPFS */
 
-static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
-                       struct shmem_inode_info *info, unsigned long idx)
+static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+                       struct shmem_inode_info *info, pgoff_t index)
 {
        struct mempolicy mpol, *spol;
        struct vm_area_struct pvma;
-       struct page *page;
 
        spol = mpol_cond_copy(&mpol,
-                               mpol_shared_policy_lookup(&info->policy, idx));
+                       mpol_shared_policy_lookup(&info->policy, index));
 
        /* Create a pseudo vma that just contains the policy */
        pvma.vm_start = 0;
-       pvma.vm_pgoff = idx;
+       pvma.vm_pgoff = index;
        pvma.vm_ops = NULL;
        pvma.vm_policy = spol;
-       page = swapin_readahead(entry, gfp, &pvma, 0);
-       return page;
+       return swapin_readahead(swap, gfp, &pvma, 0);
 }
 
 static struct page *shmem_alloc_page(gfp_t gfp,
-                       struct shmem_inode_info *info, unsigned long idx)
+                       struct shmem_inode_info *info, pgoff_t index)
 {
        struct vm_area_struct pvma;
 
        /* Create a pseudo vma that just contains the policy */
        pvma.vm_start = 0;
-       pvma.vm_pgoff = idx;
+       pvma.vm_pgoff = index;
        pvma.vm_ops = NULL;
-       pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
+       pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
 
        /*
         * alloc_page_vma() will drop the shared policy reference
@@ -1190,19 +798,19 @@ static struct page *shmem_alloc_page(gfp_t gfp,
 }
 #else /* !CONFIG_NUMA */
 #ifdef CONFIG_TMPFS
-static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p)
+static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
 {
 }
 #endif /* CONFIG_TMPFS */
 
-static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
-                       struct shmem_inode_info *info, unsigned long idx)
+static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+                       struct shmem_inode_info *info, pgoff_t index)
 {
-       return swapin_readahead(entry, gfp, NULL, 0);
+       return swapin_readahead(swap, gfp, NULL, 0);
 }
 
 static inline struct page *shmem_alloc_page(gfp_t gfp,
-                       struct shmem_inode_info *info, unsigned long idx)
+                       struct shmem_inode_info *info, pgoff_t index)
 {
        return alloc_page(gfp);
 }
@@ -1222,243 +830,190 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
  * vm. If we swap it in we mark it dirty since we also free the swap
  * entry since a page cannot live in both the swap and page cache
  */
-static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx,
+static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
        struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
 {
        struct address_space *mapping = inode->i_mapping;
-       struct shmem_inode_info *info = SHMEM_I(inode);
+       struct shmem_inode_info *info;
        struct shmem_sb_info *sbinfo;
        struct page *page;
-       struct page *prealloc_page = NULL;
-       swp_entry_t *entry;
        swp_entry_t swap;
        int error;
-       int ret;
+       int once = 0;
 
-       if (idx >= SHMEM_MAX_INDEX)
+       if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
                return -EFBIG;
 repeat:
-       page = find_lock_page(mapping, idx);
-       if (page) {
+       swap.val = 0;
+       page = find_lock_page(mapping, index);
+       if (radix_tree_exceptional_entry(page)) {
+               swap = radix_to_swp_entry(page);
+               page = NULL;
+       }
+
+       if (sgp != SGP_WRITE &&
+           ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+               error = -EINVAL;
+               goto failed;
+       }
+
+       if (page || (sgp == SGP_READ && !swap.val)) {
                /*
                 * Once we can get the page lock, it must be uptodate:
                 * if there were an error in reading back from swap,
                 * the page would not be inserted into the filecache.
                 */
-               BUG_ON(!PageUptodate(page));
-               goto done;
+               BUG_ON(page && !PageUptodate(page));
+               *pagep = page;
+               return 0;
        }
 
        /*
-        * Try to preload while we can wait, to not make a habit of
-        * draining atomic reserves; but don't latch on to this cpu.
+        * Fast cache lookup did not find it:
+        * bring it back from swap or allocate.
         */
-       error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
-       if (error)
-               goto out;
-       radix_tree_preload_end();
-
-       if (sgp != SGP_READ && !prealloc_page) {
-               prealloc_page = shmem_alloc_page(gfp, info, idx);
-               if (prealloc_page) {
-                       SetPageSwapBacked(prealloc_page);
-                       if (mem_cgroup_cache_charge(prealloc_page,
-                                       current->mm, GFP_KERNEL)) {
-                               page_cache_release(prealloc_page);
-                               prealloc_page = NULL;
-                       }
-               }
-       }
-
-       spin_lock(&info->lock);
-       shmem_recalc_inode(inode);
-       entry = shmem_swp_alloc(info, idx, sgp, gfp);
-       if (IS_ERR(entry)) {
-               spin_unlock(&info->lock);
-               error = PTR_ERR(entry);
-               goto out;
-       }
-       swap = *entry;
+       info = SHMEM_I(inode);
+       sbinfo = SHMEM_SB(inode->i_sb);
 
        if (swap.val) {
                /* Look it up and read it in.. */
                page = lookup_swap_cache(swap);
                if (!page) {
-                       shmem_swp_unmap(entry);
-                       spin_unlock(&info->lock);
                        /* here we actually do the io */
                        if (fault_type)
                                *fault_type |= VM_FAULT_MAJOR;
-                       page = shmem_swapin(swap, gfp, info, idx);
+                       page = shmem_swapin(swap, gfp, info, index);
                        if (!page) {
-                               spin_lock(&info->lock);
-                               entry = shmem_swp_alloc(info, idx, sgp, gfp);
-                               if (IS_ERR(entry))
-                                       error = PTR_ERR(entry);
-                               else {
-                                       if (entry->val == swap.val)
-                                               error = -ENOMEM;
-                                       shmem_swp_unmap(entry);
-                               }
-                               spin_unlock(&info->lock);
-                               if (error)
-                                       goto out;
-                               goto repeat;
+                               error = -ENOMEM;
+                               goto failed;
                        }
-                       wait_on_page_locked(page);
-                       page_cache_release(page);
-                       goto repeat;
                }
 
                /* We have to do this with page locked to prevent races */
-               if (!trylock_page(page)) {
-                       shmem_swp_unmap(entry);
-                       spin_unlock(&info->lock);
-                       wait_on_page_locked(page);
-                       page_cache_release(page);
-                       goto repeat;
-               }
-               if (PageWriteback(page)) {
-                       shmem_swp_unmap(entry);
-                       spin_unlock(&info->lock);
-                       wait_on_page_writeback(page);
-                       unlock_page(page);
-                       page_cache_release(page);
-                       goto repeat;
-               }
+               lock_page(page);
                if (!PageUptodate(page)) {
-                       shmem_swp_unmap(entry);
-                       spin_unlock(&info->lock);
-                       unlock_page(page);
-                       page_cache_release(page);
                        error = -EIO;
-                       goto out;
+                       goto failed;
                }
-
-               error = add_to_page_cache_locked(page, mapping,
-                                                idx, GFP_NOWAIT);
-               if (error) {
-                       shmem_swp_unmap(entry);
-                       spin_unlock(&info->lock);
-                       if (error == -ENOMEM) {
-                               /*
-                                * reclaim from proper memory cgroup and
-                                * call memcg's OOM if needed.
-                                */
-                               error = mem_cgroup_shmem_charge_fallback(
-                                               page, current->mm, gfp);
-                               if (error) {
-                                       unlock_page(page);
-                                       page_cache_release(page);
-                                       goto out;
-                               }
-                       }
-                       unlock_page(page);
-                       page_cache_release(page);
-                       goto repeat;
+               wait_on_page_writeback(page);
+
+               /* Someone may have already done it for us */
+               if (page->mapping) {
+                       if (page->mapping == mapping &&
+                           page->index == index)
+                               goto done;
+                       error = -EEXIST;
+                       goto failed;
                }
 
-               info->flags |= SHMEM_PAGEIN;
-               shmem_swp_set(info, entry, 0);
-               shmem_swp_unmap(entry);
-               delete_from_swap_cache(page);
+               error = mem_cgroup_cache_charge(page, current->mm,
+                                               gfp & GFP_RECLAIM_MASK);
+               if (!error)
+                       error = shmem_add_to_page_cache(page, mapping, index,
+                                               gfp, swp_to_radix_entry(swap));
+               if (error)
+                       goto failed;
+
+               spin_lock(&info->lock);
+               info->swapped--;
+               shmem_recalc_inode(inode);
                spin_unlock(&info->lock);
+
+               delete_from_swap_cache(page);
                set_page_dirty(page);
                swap_free(swap);
 
-       } else if (sgp == SGP_READ) {
-               shmem_swp_unmap(entry);
-               page = find_get_page(mapping, idx);
-               if (page && !trylock_page(page)) {
-                       spin_unlock(&info->lock);
-                       wait_on_page_locked(page);
-                       page_cache_release(page);
-                       goto repeat;
+       } else {
+               if (shmem_acct_block(info->flags)) {
+                       error = -ENOSPC;
+                       goto failed;
                }
-               spin_unlock(&info->lock);
-
-       } else if (prealloc_page) {
-               shmem_swp_unmap(entry);
-               sbinfo = SHMEM_SB(inode->i_sb);
                if (sbinfo->max_blocks) {
                        if (percpu_counter_compare(&sbinfo->used_blocks,
-                                               sbinfo->max_blocks) >= 0 ||
-                           shmem_acct_block(info->flags))
-                               goto nospace;
+                                               sbinfo->max_blocks) >= 0) {
+                               error = -ENOSPC;
+                               goto unacct;
+                       }
                        percpu_counter_inc(&sbinfo->used_blocks);
-                       inode->i_blocks += BLOCKS_PER_PAGE;
-               } else if (shmem_acct_block(info->flags))
-                       goto nospace;
-
-               page = prealloc_page;
-               prealloc_page = NULL;
-
-               entry = shmem_swp_alloc(info, idx, sgp, gfp);
-               if (IS_ERR(entry))
-                       error = PTR_ERR(entry);
-               else {
-                       swap = *entry;
-                       shmem_swp_unmap(entry);
                }
-               ret = error || swap.val;
-               if (ret)
-                       mem_cgroup_uncharge_cache_page(page);
-               else
-                       ret = add_to_page_cache_lru(page, mapping,
-                                               idx, GFP_NOWAIT);
-               /*
-                * At add_to_page_cache_lru() failure,
-                * uncharge will be done automatically.
-                */
-               if (ret) {
-                       shmem_unacct_blocks(info->flags, 1);
-                       shmem_free_blocks(inode, 1);
-                       spin_unlock(&info->lock);
-                       page_cache_release(page);
-                       if (error)
-                               goto out;
-                       goto repeat;
+
+               page = shmem_alloc_page(gfp, info, index);
+               if (!page) {
+                       error = -ENOMEM;
+                       goto decused;
                }
 
-               info->flags |= SHMEM_PAGEIN;
+               SetPageSwapBacked(page);
+               __set_page_locked(page);
+               error = mem_cgroup_cache_charge(page, current->mm,
+                                               gfp & GFP_RECLAIM_MASK);
+               if (!error)
+                       error = shmem_add_to_page_cache(page, mapping, index,
+                                               gfp, NULL);
+               if (error)
+                       goto decused;
+               lru_cache_add_anon(page);
+
+               spin_lock(&info->lock);
                info->alloced++;
+               inode->i_blocks += BLOCKS_PER_PAGE;
+               shmem_recalc_inode(inode);
                spin_unlock(&info->lock);
+
                clear_highpage(page);
                flush_dcache_page(page);
                SetPageUptodate(page);
                if (sgp == SGP_DIRTY)
                        set_page_dirty(page);
-
-       } else {
-               spin_unlock(&info->lock);
-               error = -ENOMEM;
-               goto out;
        }
 done:
-       *pagep = page;
-       error = 0;
-out:
-       if (prealloc_page) {
-               mem_cgroup_uncharge_cache_page(prealloc_page);
-               page_cache_release(prealloc_page);
+       /* Perhaps the file has been truncated since we checked */
+       if (sgp != SGP_WRITE &&
+           ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+               error = -EINVAL;
+               goto trunc;
        }
-       return error;
+       *pagep = page;
+       return 0;
 
-nospace:
        /*
-        * Perhaps the page was brought in from swap between find_lock_page
-        * and taking info->lock?  We allow for that at add_to_page_cache_lru,
-        * but must also avoid reporting a spurious ENOSPC while working on a
-        * full tmpfs.
+        * Error recovery.
         */
-       page = find_get_page(mapping, idx);
+trunc:
+       ClearPageDirty(page);
+       delete_from_page_cache(page);
+       spin_lock(&info->lock);
+       info->alloced--;
+       inode->i_blocks -= BLOCKS_PER_PAGE;
        spin_unlock(&info->lock);
+decused:
+       if (sbinfo->max_blocks)
+               percpu_counter_add(&sbinfo->used_blocks, -1);
+unacct:
+       shmem_unacct_blocks(info->flags, 1);
+failed:
+       if (swap.val && error != -EINVAL) {
+               struct page *test = find_get_page(mapping, index);
+               if (test && !radix_tree_exceptional_entry(test))
+                       page_cache_release(test);
+               /* Have another try if the entry has changed */
+               if (test != swp_to_radix_entry(swap))
+                       error = -EEXIST;
+       }
        if (page) {
+               unlock_page(page);
                page_cache_release(page);
+       }
+       if (error == -ENOSPC && !once++) {
+               info = SHMEM_I(inode);
+               spin_lock(&info->lock);
+               shmem_recalc_inode(inode);
+               spin_unlock(&info->lock);
                goto repeat;
        }
-       error = -ENOSPC;
-       goto out;
+       if (error == -EEXIST)
+               goto repeat;
+       return error;
 }
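
The error-recovery tail above encodes a two-tier retry policy: -EEXIST (another task changed the slot under us) always loops back to repeat, while -ENOSPC gets exactly one more try after shmem_recalc_inode() releases any stale accounting. The same control flow in a standalone form, with hypothetical stubs for the kernel calls:

#include <errno.h>
#include <stdio.h>

static int attempts;

static int try_get(void)                /* hypothetical stand-in */
{
        return attempts++ ? 0 : -ENOSPC;
}

static void recalc_accounting(void)     /* shmem_recalc_inode() analogue */
{
}

static int get_with_retry(void)
{
        int once = 0;
        int error;
repeat:
        error = try_get();
        if (error == -ENOSPC && !once++) {
                recalc_accounting();    /* may have freed some blocks */
                goto repeat;            /* one extra try, no more */
        }
        if (error == -EEXIST)           /* raced: the slot changed */
                goto repeat;
        return error;
}

int main(void)
{
        printf("%d\n", get_with_retry());       /* prints 0 */
        return 0;
}
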
 
 static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -1467,9 +1022,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        int error;
        int ret = VM_FAULT_LOCKED;
 
-       if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
-               return VM_FAULT_SIGBUS;
-
        error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
        if (error)
                return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -1482,20 +1034,20 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 
 #ifdef CONFIG_NUMA
-static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
+static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
 {
-       struct inode *i = vma->vm_file->f_path.dentry->d_inode;
-       return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
+       struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+       return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
 }
 
 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
                                          unsigned long addr)
 {
-       struct inode *i = vma->vm_file->f_path.dentry->d_inode;
-       unsigned long idx;
+       struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+       pgoff_t index;
 
-       idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-       return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
+       index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+       return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
 }
 #endif
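
shmem_get_policy() turns a faulting address into a file index with the usual mmap arithmetic: the page offset within the mapping plus the mapping's starting pgoff. A worked example, assuming 4 KiB pages and illustrative values:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4 KiB pages */

int main(void)
{
        unsigned long vm_start = 0x10000000UL;  /* illustrative mapping base */
        unsigned long vm_pgoff = 16;            /* mapping begins at file page 16 */
        unsigned long addr     = vm_start + 3 * 4096 + 123;

        unsigned long index = ((addr - vm_start) >> PAGE_SHIFT) + vm_pgoff;
        printf("index %lu\n", index);           /* 19: third page after 16 */
        return 0;
}
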
 
@@ -1593,7 +1145,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 
 #ifdef CONFIG_TMPFS
 static const struct inode_operations shmem_symlink_inode_operations;
-static const struct inode_operations shmem_symlink_inline_operations;
+static const struct inode_operations shmem_short_symlink_operations;
 
 static int
 shmem_write_begin(struct file *file, struct address_space *mapping,
@@ -1626,7 +1178,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
 {
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
-       unsigned long index, offset;
+       pgoff_t index;
+       unsigned long offset;
        enum sgp_type sgp = SGP_READ;
 
        /*
@@ -1642,7 +1195,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
 
        for (;;) {
                struct page *page = NULL;
-               unsigned long end_index, nr, ret;
+               pgoff_t end_index;
+               unsigned long nr, ret;
                loff_t i_size = i_size_read(inode);
 
                end_index = i_size >> PAGE_CACHE_SHIFT;
@@ -1880,8 +1434,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_namelen = NAME_MAX;
        if (sbinfo->max_blocks) {
                buf->f_blocks = sbinfo->max_blocks;
-               buf->f_bavail = buf->f_bfree =
-                               sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
+               buf->f_bavail =
+               buf->f_bfree  = sbinfo->max_blocks -
+                               percpu_counter_sum(&sbinfo->used_blocks);
        }
        if (sbinfo->max_inodes) {
                buf->f_files = sbinfo->max_inodes;
@@ -2055,10 +1610,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 
        info = SHMEM_I(inode);
        inode->i_size = len-1;
-       if (len <= SHMEM_SYMLINK_INLINE_LEN) {
-               /* do it inline */
-               memcpy(info->inline_symlink, symname, len);
-               inode->i_op = &shmem_symlink_inline_operations;
+       if (len <= SHORT_SYMLINK_LEN) {
+               info->symlink = kmemdup(symname, len, GFP_KERNEL);
+               if (!info->symlink) {
+                       iput(inode);
+                       return -ENOMEM;
+               }
+               inode->i_op = &shmem_short_symlink_operations;
        } else {
                error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
                if (error) {
@@ -2081,17 +1639,17 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
        return 0;
 }
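
Short symlink targets now live in a kmemdup()'d buffer hung off info->symlink, freed in shmem_evict_inode() above, rather than inline in the inode. In user space kmemdup() is just allocate-and-copy; a sketch under a hypothetical name, where len includes the trailing NUL to match i_size = len - 1 above:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

/* User-space equivalent of kmemdup(): allocate and copy len bytes. */
static void *memdup(const void *src, size_t len)
{
        void *p = malloc(len);

        if (p)
                memcpy(p, src, len);
        return p;
}

int main(void)
{
        const char *symname = "target";
        size_t len = strlen(symname) + 1;
        char *symlink = memdup(symname, len);

        if (!symlink)
                return 1;               /* the -ENOMEM path */
        printf("%s\n", symlink);
        free(symlink);
        return 0;
}
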
 
-static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
+static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
 {
-       nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink);
+       nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
        return NULL;
 }
 
 static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
        struct page *page = NULL;
-       int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
-       nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
+       int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
+       nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
        if (page)
                unlock_page(page);
        return page;
@@ -2202,7 +1760,6 @@ out:
        return err;
 }
 
-
 static const struct xattr_handler *shmem_xattr_handlers[] = {
 #ifdef CONFIG_TMPFS_POSIX_ACL
        &generic_acl_access_handler,
@@ -2332,9 +1889,9 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
 }
 #endif /* CONFIG_TMPFS_XATTR */
 
-static const struct inode_operations shmem_symlink_inline_operations = {
+static const struct inode_operations shmem_short_symlink_operations = {
        .readlink       = generic_readlink,
-       .follow_link    = shmem_follow_link_inline,
+       .follow_link    = shmem_follow_short_symlink,
 #ifdef CONFIG_TMPFS_XATTR
        .setxattr       = shmem_setxattr,
        .getxattr       = shmem_getxattr,
@@ -2534,8 +2091,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
        if (config.max_inodes < inodes)
                goto out;
        /*
-        * Those tests also disallow limited->unlimited while any are in
-        * use, so i_blocks will always be zero when max_blocks is zero;
+        * Those tests disallow limited->unlimited while any are in use;
         * but we must separately disallow unlimited->limited, because
         * in that case we have no record of how much is already in use.
         */
@@ -2627,7 +2183,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
                goto failed;
        sbinfo->free_inodes = sbinfo->max_inodes;
 
-       sb->s_maxbytes = SHMEM_MAX_BYTES;
+       sb->s_maxbytes = MAX_LFS_FILESIZE;
        sb->s_blocksize = PAGE_CACHE_SIZE;
        sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
        sb->s_magic = TMPFS_MAGIC;
@@ -2662,14 +2218,14 @@ static struct kmem_cache *shmem_inode_cachep;
 
 static struct inode *shmem_alloc_inode(struct super_block *sb)
 {
-       struct shmem_inode_info *p;
-       p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
-       if (!p)
+       struct shmem_inode_info *info;
+       info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
+       if (!info)
                return NULL;
-       return &p->vfs_inode;
+       return &info->vfs_inode;
 }
 
-static void shmem_i_callback(struct rcu_head *head)
+static void shmem_destroy_callback(struct rcu_head *head)
 {
        struct inode *inode = container_of(head, struct inode, i_rcu);
        INIT_LIST_HEAD(&inode->i_dentry);
@@ -2678,29 +2234,26 @@ static void shmem_i_callback(struct rcu_head *head)
 
 static void shmem_destroy_inode(struct inode *inode)
 {
-       if ((inode->i_mode & S_IFMT) == S_IFREG) {
-               /* only struct inode is valid if it's an inline symlink */
+       if ((inode->i_mode & S_IFMT) == S_IFREG)
                mpol_free_shared_policy(&SHMEM_I(inode)->policy);
-       }
-       call_rcu(&inode->i_rcu, shmem_i_callback);
+       call_rcu(&inode->i_rcu, shmem_destroy_callback);
 }
 
-static void init_once(void *foo)
+static void shmem_init_inode(void *foo)
 {
-       struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
-
-       inode_init_once(&p->vfs_inode);
+       struct shmem_inode_info *info = foo;
+       inode_init_once(&info->vfs_inode);
 }
 
-static int init_inodecache(void)
+static int shmem_init_inodecache(void)
 {
        shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
                                sizeof(struct shmem_inode_info),
-                               0, SLAB_PANIC, init_once);
+                               0, SLAB_PANIC, shmem_init_inode);
        return 0;
 }
 
-static void destroy_inodecache(void)
+static void shmem_destroy_inodecache(void)
 {
        kmem_cache_destroy(shmem_inode_cachep);
 }
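
A note on the hunk above: kmem_cache_create() is called with SLAB_PANIC, so it panics rather than returning NULL on failure, which is why shmem_init_inodecache() can unconditionally return 0; and the constructor (shmem_init_inode) runs once when a slab object is first set up, not on every allocation. A toy userspace model of that construct-once behaviour, purely illustrative and not the slab allocator itself:

	#include <stdio.h>

	/* Toy constructor-backed cache: the ctor runs when a slot is
	 * first created, not on every allocation, mirroring how
	 * shmem_init_inode() runs once per slab object. */
	struct slot { int constructed; };

	#define POOL_SIZE 4
	static struct slot pool[POOL_SIZE];
	static int used[POOL_SIZE];

	static struct slot *cache_alloc(void)
	{
		for (int i = 0; i < POOL_SIZE; i++) {
			if (used[i])
				continue;
			used[i] = 1;
			if (!pool[i].constructed) {	/* first use only */
				pool[i].constructed = 1;
				puts("ctor ran");
			}
			return &pool[i];
		}
		return NULL;
	}

	static void cache_free(struct slot *s)
	{
		used[s - pool] = 0;	/* stays constructed for reuse */
	}

	int main(void)
	{
		struct slot *s = cache_alloc();
		cache_free(s);
		cache_alloc();	/* reuses the slot; ctor does not run again */
		return 0;
	}
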
@@ -2797,21 +2350,20 @@ static const struct vm_operations_struct shmem_vm_ops = {
 #endif
 };
 
-
 static struct dentry *shmem_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data)
 {
        return mount_nodev(fs_type, flags, data, shmem_fill_super);
 }
 
-static struct file_system_type tmpfs_fs_type = {
+static struct file_system_type shmem_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "tmpfs",
        .mount          = shmem_mount,
        .kill_sb        = kill_litter_super,
 };
 
-int __init init_tmpfs(void)
+int __init shmem_init(void)
 {
        int error;
 
@@ -2819,18 +2371,18 @@ int __init init_tmpfs(void)
        if (error)
                goto out4;
 
-       error = init_inodecache();
+       error = shmem_init_inodecache();
        if (error)
                goto out3;
 
-       error = register_filesystem(&tmpfs_fs_type);
+       error = register_filesystem(&shmem_fs_type);
        if (error) {
                printk(KERN_ERR "Could not register tmpfs\n");
                goto out2;
        }
 
-       shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
-                               tmpfs_fs_type.name, NULL);
+       shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER,
+                                shmem_fs_type.name, NULL);
        if (IS_ERR(shm_mnt)) {
                error = PTR_ERR(shm_mnt);
                printk(KERN_ERR "Could not kern_mount tmpfs\n");
@@ -2839,9 +2391,9 @@ int __init init_tmpfs(void)
        return 0;
 
 out1:
-       unregister_filesystem(&tmpfs_fs_type);
+       unregister_filesystem(&shmem_fs_type);
 out2:
-       destroy_inodecache();
+       shmem_destroy_inodecache();
 out3:
        bdi_destroy(&shmem_backing_dev_info);
 out4:
@@ -2849,45 +2401,6 @@ out4:
        return error;
 }
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-/**
- * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
- * @inode: the inode to be searched
- * @pgoff: the offset to be searched
- * @pagep: the pointer for the found page to be stored
- * @ent: the pointer for the found swap entry to be stored
- *
- * If a page is found, refcount of it is incremented. Callers should handle
- * these refcount.
- */
-void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
-                                       struct page **pagep, swp_entry_t *ent)
-{
-       swp_entry_t entry = { .val = 0 }, *ptr;
-       struct page *page = NULL;
-       struct shmem_inode_info *info = SHMEM_I(inode);
-
-       if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
-               goto out;
-
-       spin_lock(&info->lock);
-       ptr = shmem_swp_entry(info, pgoff, NULL);
-#ifdef CONFIG_SWAP
-       if (ptr && ptr->val) {
-               entry.val = ptr->val;
-               page = find_get_page(&swapper_space, entry.val);
-       } else
-#endif
-               page = find_get_page(inode->i_mapping, pgoff);
-       if (ptr)
-               shmem_swp_unmap(ptr);
-       spin_unlock(&info->lock);
-out:
-       *pagep = page;
-       *ent = entry;
-}
-#endif
-
 #else /* !CONFIG_SHMEM */
 
 /*
@@ -2901,23 +2414,23 @@ out:
 
 #include <linux/ramfs.h>
 
-static struct file_system_type tmpfs_fs_type = {
+static struct file_system_type shmem_fs_type = {
        .name           = "tmpfs",
        .mount          = ramfs_mount,
        .kill_sb        = kill_litter_super,
 };
 
-int __init init_tmpfs(void)
+int __init shmem_init(void)
 {
-       BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
+       BUG_ON(register_filesystem(&shmem_fs_type) != 0);
 
-       shm_mnt = kern_mount(&tmpfs_fs_type);
+       shm_mnt = kern_mount(&shmem_fs_type);
        BUG_ON(IS_ERR(shm_mnt));
 
        return 0;
 }
 
-int shmem_unuse(swp_entry_t entry, struct page *page)
+int shmem_unuse(swp_entry_t swap, struct page *page)
 {
        return 0;
 }
@@ -2927,43 +2440,17 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
        return 0;
 }
 
-void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 {
-       truncate_inode_pages_range(inode->i_mapping, start, end);
+       truncate_inode_pages_range(inode->i_mapping, lstart, lend);
 }
 EXPORT_SYMBOL_GPL(shmem_truncate_range);
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-/**
- * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
- * @inode: the inode to be searched
- * @pgoff: the offset to be searched
- * @pagep: the pointer for the found page to be stored
- * @ent: the pointer for the found swap entry to be stored
- *
- * If a page is found, refcount of it is incremented. Callers should handle
- * these refcount.
- */
-void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
-                                       struct page **pagep, swp_entry_t *ent)
-{
-       struct page *page = NULL;
-
-       if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
-               goto out;
-       page = find_get_page(inode->i_mapping, pgoff);
-out:
-       *pagep = page;
-       *ent = (swp_entry_t){ .val = 0 };
-}
-#endif
-
 #define shmem_vm_ops                           generic_file_vm_ops
 #define shmem_file_operations                  ramfs_file_operations
 #define shmem_get_inode(sb, dir, mode, dev, flags)     ramfs_get_inode(sb, dir, mode, dev)
 #define shmem_acct_size(flags, size)           0
 #define shmem_unacct_size(flags, size)         do {} while (0)
-#define SHMEM_MAX_BYTES                                MAX_LFS_FILESIZE
 
 #endif /* CONFIG_SHMEM */
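
The !CONFIG_SHMEM block above stubs the shmem entry points by #define-aliasing them to generic/ramfs fallbacks, so call sites need no #ifdefs of their own. A minimal sketch of that compile-out idiom (HAVE_FANCY and the function names are made up for illustration):

	#include <stdio.h>

	#define HAVE_FANCY 0	/* pretend the feature is compiled out */

	static void generic_flush(void) { puts("generic flush"); }

	#if HAVE_FANCY
	static void fancy_flush(void)   { puts("fancy flush"); }
	#else
	/* alias the compiled-out hook to the generic fallback */
	#define fancy_flush generic_flush
	#endif

	int main(void)
	{
		fancy_flush();	/* resolves to generic_flush() here */
		return 0;
	}
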
 
@@ -2987,7 +2474,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
        if (IS_ERR(shm_mnt))
                return (void *)shm_mnt;
 
-       if (size < 0 || size > SHMEM_MAX_BYTES)
+       if (size < 0 || size > MAX_LFS_FILESIZE)
                return ERR_PTR(-EINVAL);
 
        if (shmem_acct_size(flags, size))
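
The shmem_init() hunks earlier in this file use the kernel's standard goto-based unwind: each setup step gets a cleanup label, and a failure falls through the labels in reverse order of construction. A standalone sketch of the idiom, with hypothetical setup/teardown functions standing in for bdi_setup_and_register(), shmem_init_inodecache() and register_filesystem():

	#include <stdio.h>

	static int setup_backing_dev(void) { return 0; }
	static int setup_inode_cache(void) { return 0; }
	static int setup_filesystem(void)  { return -1; } /* simulate failure */

	static void teardown_inode_cache(void) { puts("inode cache destroyed"); }
	static void teardown_backing_dev(void) { puts("backing dev destroyed"); }

	static int demo_init(void)
	{
		int error;

		error = setup_backing_dev();
		if (error)
			goto out3;
		error = setup_inode_cache();
		if (error)
			goto out2;
		error = setup_filesystem();
		if (error)
			goto out1;
		return 0;

	out1:	/* unwind in reverse order of setup */
		teardown_inode_cache();
	out2:
		teardown_backing_dev();
	out3:
		return error;
	}

	int main(void)
	{
		return demo_init() ? 1 : 0;
	}
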
index 1b8c339..17bc224 100644
@@ -1924,20 +1924,24 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 
        /*
         * Find out how many pages are allowed for a single swap
-        * device. There are two limiting factors: 1) the number of
-        * bits for the swap offset in the swp_entry_t type and
-        * 2) the number of bits in the a swap pte as defined by
-        * the different architectures. In order to find the
-        * largest possible bit mask a swap entry with swap type 0
+        * device. There are three limiting factors: 1) the number
+        * of bits for the swap offset in the swp_entry_t type, and
+        * 2) the number of bits in the swap pte as defined by
+        * the different architectures, and 3) the number of free bits
+        * in an exceptional radix_tree entry. In order to find the
+        * largest possible bit mask, a swap entry with swap type 0
         * and swap offset ~0UL is created, encoded to a swap pte,
-        * decoded to a swp_entry_t again and finally the swap
+        * decoded to a swp_entry_t again, and finally the swap
         * offset is extracted. This will mask all the bits from
         * the initial ~0UL mask that can't be encoded in either
         * the swp_entry_t or the architecture definition of a
-        * swap pte.
+        * swap pte.  Then the same is done for a radix_tree entry.
         */
        maxpages = swp_offset(pte_to_swp_entry(
-                       swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+                       swp_entry_to_pte(swp_entry(0, ~0UL))));
+       maxpages = swp_offset(radix_to_swp_entry(
+                       swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
+
        if (maxpages > swap_header->info.last_page) {
                maxpages = swap_header->info.last_page + 1;
                /* p->max is an unsigned int: don't overflow it */
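
The comment in this hunk describes a probe-by-round-trip: encode an all-ones offset, decode it again, and whatever bits survive are exactly the bits the encoding can carry, so the survivor is the largest representable offset. A self-contained sketch of the technique, with toy 24-bit and 29-bit encoders standing in for the architecture-specific swp_entry_to_pte()/swp_to_radix_entry() round trips:

	#include <stdio.h>

	/* Toy encodings that keep only the low 24 or 29 offset bits,
	 * mimicking the two lossy round trips in read_swap_header(). */
	static unsigned long pte_roundtrip(unsigned long off)
	{
		return off & ((1UL << 24) - 1);
	}

	static unsigned long radix_roundtrip(unsigned long off)
	{
		return off & ((1UL << 29) - 1);
	}

	int main(void)
	{
		/* Start from ~0UL; each round trip masks away the bits it
		 * cannot represent, so the tighter limit (24 bits here)
		 * wins automatically. */
		unsigned long maxoff = pte_roundtrip(~0UL);
		maxoff = radix_roundtrip(maxoff);
		printf("max offset %#lx -> maxpages %lu\n", maxoff, maxoff + 1);
		return 0;
	}
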
index 232eb27..b40ac6d 100644
@@ -336,6 +336,14 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
        unsigned long count = 0;
        int i;
 
+       /*
+        * Note: this function may get called on a shmem/tmpfs mapping:
+        * pagevec_lookup() might then return 0 prematurely (because it
+        * got a gangful of swap entries); but it's hardly worth worrying
+        * about - it can rarely have anything to free from such a mapping
+        * (most pages are dirty), and already skips over any difficulties.
+        */
+
        pagevec_init(&pvec, 0);
        while (index <= end && pagevec_lookup(&pvec, mapping, index,
                        min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
index 6d8ef4a..8b2d37b 100644
@@ -128,34 +128,34 @@ unsigned long long get_msr(int cpu, off_t offset)
 void print_header(void)
 {
        if (show_pkg)
-               fprintf(stderr, "pk");
+               fprintf(stderr, "pk");
        if (show_core)
-               fprintf(stderr, "core");
+               fprintf(stderr, " cr");
        if (show_cpu)
                fprintf(stderr, " CPU");
        if (do_nhm_cstates)
-               fprintf(stderr, "   %%c0 ");
+               fprintf(stderr, "    %%c0 ");
        if (has_aperf)
-               fprintf(stderr, "  GHz");
+               fprintf(stderr, " GHz");
        fprintf(stderr, "  TSC");
        if (do_nhm_cstates)
-               fprintf(stderr, "   %%c1 ");
+               fprintf(stderr, "    %%c1");
        if (do_nhm_cstates)
-               fprintf(stderr, "   %%c3 ");
+               fprintf(stderr, "    %%c3");
        if (do_nhm_cstates)
-               fprintf(stderr, "   %%c6 ");
+               fprintf(stderr, "    %%c6");
        if (do_snb_cstates)
-               fprintf(stderr, "   %%c7 ");
+               fprintf(stderr, "    %%c7");
        if (do_snb_cstates)
-               fprintf(stderr, "  %%pc2 ");
+               fprintf(stderr, "  %%pc2");
        if (do_nhm_cstates)
-               fprintf(stderr, "  %%pc3 ");
+               fprintf(stderr, "  %%pc3");
        if (do_nhm_cstates)
-               fprintf(stderr, "  %%pc6 ");
+               fprintf(stderr, "  %%pc6");
        if (do_snb_cstates)
-               fprintf(stderr, "  %%pc7 ");
+               fprintf(stderr, "  %%pc7");
        if (extra_msr_offset)
-               fprintf(stderr, "       MSR 0x%x ", extra_msr_offset);
+               fprintf(stderr, "        MSR 0x%x ", extra_msr_offset);
 
        putc('\n', stderr);
 }
@@ -194,14 +194,14 @@ void print_cnt(struct counters *p)
        /* topology columns, print blanks on 1st (average) line */
        if (p == cnt_average) {
                if (show_pkg)
-                       fprintf(stderr, "    ");
+                       fprintf(stderr, " ");
                if (show_core)
                        fprintf(stderr, "    ");
                if (show_cpu)
                        fprintf(stderr, "    ");
        } else {
                if (show_pkg)
-                       fprintf(stderr, "%4d", p->pkg);
+                       fprintf(stderr, "%d", p->pkg);
                if (show_core)
                        fprintf(stderr, "%4d", p->core);
                if (show_cpu)
@@ -241,22 +241,22 @@ void print_cnt(struct counters *p)
                if (!skip_c1)
                        fprintf(stderr, "%7.2f", 100.0 * p->c1/p->tsc);
                else
-                       fprintf(stderr, "   ****");
+                       fprintf(stderr, "  ****");
        }
        if (do_nhm_cstates)
-               fprintf(stderr, "%7.2f", 100.0 * p->c3/p->tsc);
+               fprintf(stderr, " %6.2f", 100.0 * p->c3/p->tsc);
        if (do_nhm_cstates)
-               fprintf(stderr, "%7.2f", 100.0 * p->c6/p->tsc);
+               fprintf(stderr, " %6.2f", 100.0 * p->c6/p->tsc);
        if (do_snb_cstates)
-               fprintf(stderr, "%7.2f", 100.0 * p->c7/p->tsc);
+               fprintf(stderr, " %6.2f", 100.0 * p->c7/p->tsc);
        if (do_snb_cstates)
-               fprintf(stderr, "%7.2f", 100.0 * p->pc2/p->tsc);
+               fprintf(stderr, " %5.2f", 100.0 * p->pc2/p->tsc);
        if (do_nhm_cstates)
-               fprintf(stderr, "%7.2f", 100.0 * p->pc3/p->tsc);
+               fprintf(stderr, " %5.2f", 100.0 * p->pc3/p->tsc);
        if (do_nhm_cstates)
-               fprintf(stderr, "%7.2f", 100.0 * p->pc6/p->tsc);
+               fprintf(stderr, " %5.2f", 100.0 * p->pc6/p->tsc);
        if (do_snb_cstates)
-               fprintf(stderr, "%7.2f", 100.0 * p->pc7/p->tsc);
+               fprintf(stderr, " %5.2f", 100.0 * p->pc7/p->tsc);
        if (extra_msr_offset)
                fprintf(stderr, "  0x%016llx", p->extra_msr);
        putc('\n', stderr);
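
The width tweaks in these two hunks ("%7.2f" becoming " %6.2f", "%4d" becoming "%d", and so on) only serve to keep data rows aligned under the narrower headers. A quick illustration of how the printf field width achieves that:

	#include <stdio.h>

	int main(void)
	{
		/* "    %c3" is a 7-character header cell; " %6.2f" prints
		 * a space plus a 6-character number, so they line up. */
		fprintf(stderr, "    %%c3    %%c6\n");
		fprintf(stderr, " %6.2f %6.2f\n", 3.14159, 99.5);
		return 0;
	}
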
index 2618ef2..33c5c7e 100644
@@ -137,7 +137,6 @@ void cmdline(int argc, char **argv)
 void validate_cpuid(void)
 {
        unsigned int eax, ebx, ecx, edx, max_level;
-       char brand[16];
        unsigned int fms, family, model, stepping;
 
        eax = ebx = ecx = edx = 0;
@@ -160,8 +159,8 @@ void validate_cpuid(void)
                model += ((fms >> 16) & 0xf) << 4;
 
        if (verbose > 1)
-               printf("CPUID %s %d levels family:model:stepping "
-                       "0x%x:%x:%x (%d:%d:%d)\n", brand, max_level,
+               printf("CPUID %d levels family:model:stepping "
+                       "0x%x:%x:%x (%d:%d:%d)\n", max_level,
                        family, model, stepping, family, model, stepping);
 
        if (!(edx & (1 << 5))) {
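
The family/model/stepping values printed in this hunk come from the CPUID leaf-1 EAX word (fms), with extra model bits folded in from EAX[19:16] for family 6 and 0xf parts, as the surrounding code shows. A self-contained decode under an assumed sample fms value:

	#include <stdio.h>

	int main(void)
	{
		unsigned int fms = 0x000206a7; /* sample leaf-1 EAX value */
		unsigned int family   = (fms >> 8) & 0xf;
		unsigned int model    = (fms >> 4) & 0xf;
		unsigned int stepping =  fms       & 0xf;

		/* Family 6 and 0xf parts carry extended model bits. */
		if (family == 6 || family == 0xf)
			model += ((fms >> 16) & 0xf) << 4;

		printf("family:model:stepping 0x%x:%x:%x (%d:%d:%d)\n",
		       family, model, stepping, family, model, stepping);
		return 0;
	}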