Merge master.kernel.org:/home/rmk/linux-2.6-arm
author Linus Torvalds <torvalds@woody.osdl.org>
Mon, 8 Jan 2007 23:06:39 +0000 (15:06 -0800)
committer Linus Torvalds <torvalds@woody.osdl.org>
Mon, 8 Jan 2007 23:06:39 +0000 (15:06 -0800)
* master.kernel.org:/home/rmk/linux-2.6-arm:
  [ARM] Provide basic printk_clock() implementation
  [ARM] Resolve fuse and direct-IO failures due to missing cache flushes
  [ARM] pass vma for flush_anon_page()
  [ARM] Fix potential MMCI bug
  [ARM] Fix kernel-mode undefined instruction aborts
  [ARM] 4082/1: iop3xx: fix iop33x gpio register offset
  [ARM] 4070/1: arch/arm/kernel: fix warnings from missing includes
  [ARM] 4079/1: iop: Update MAINTAINERS

69 files changed:
Documentation/feature-removal-schedule.txt
Documentation/usb/acm.txt
Documentation/x86_64/boot-options.txt
MAINTAINERS
Makefile
arch/i386/Kconfig
arch/i386/kernel/cpu/common.c
arch/i386/kernel/smpboot.c
arch/i386/kernel/trampoline.S
arch/x86_64/kernel/early-quirks.c
arch/x86_64/kernel/io_apic.c
drivers/acpi/toshiba_acpi.c
drivers/ata/Kconfig
drivers/ata/pata_hpt37x.c
drivers/block/pktcdvd.c
drivers/char/ip2/i2ellis.h
drivers/connector/cn_proc.c
drivers/i2c/busses/Kconfig
drivers/i2c/busses/i2c-mv64xxx.c
drivers/i2c/busses/i2c-pnx.c
drivers/i2c/chips/m41t00.c
drivers/i2c/i2c-core.c
drivers/ide/pci/atiixp.c
drivers/ide/pci/via82cxxx.c
drivers/kvm/kvm.h
drivers/kvm/kvm_main.c
drivers/kvm/mmu.c
drivers/kvm/paging_tmpl.h
drivers/kvm/svm.c
drivers/kvm/vmx.c
drivers/kvm/x86_emulate.c
drivers/leds/leds-s3c24xx.c
drivers/macintosh/via-pmu.c
drivers/pci/Kconfig
drivers/pci/search.c
drivers/rtc/rtc-at91rm9200.c
drivers/rtc/rtc-rs5c372.c
drivers/usb/class/usblp.c
drivers/usb/core/endpoint.c
drivers/usb/gadget/omap_udc.c
drivers/usb/gadget/omap_udc.h
drivers/usb/host/uhci-hcd.c
drivers/usb/misc/sisusbvga/sisusb_con.c
drivers/usb/net/asix.c
drivers/usb/serial/Kconfig
drivers/usb/serial/option.c
drivers/usb/storage/unusual_devs.h
fs/adfs/dir_f.c
fs/bad_inode.c
fs/binfmt_elf.c
fs/ufs/balloc.c
fs/ufs/inode.c
include/asm-i386/boot.h
include/linux/kvm.h
include/linux/magic.h
include/linux/swap.h
init/main.c
kernel/module.c
kernel/params.c
kernel/power/swap.c
kernel/power/user.c
kernel/profile.c
mm/oom_kill.c
mm/page_alloc.c
mm/slab.c
mm/swapfile.c
mm/vmscan.c
scripts/kconfig/qconf.cc
scripts/kconfig/qconf.h

index 30f3c8c..f2024df 100644 (file)
@@ -226,6 +226,23 @@ Who:       Jean Delvare <khali@linux-fr.org>
 
 ---------------------------
 
+What:  i2c_adapter.dev
+       i2c_adapter.list
+When:  July 2007
+Why:   Superfluous, given i2c_adapter.class_dev:
+         * The "dev" was a stand-in for the physical device node that legacy
+           drivers would not have; but now it's almost always present.  Any
+           remaining legacy drivers must upgrade (they now trigger warnings).
+         * The "list" duplicates class device children.
+       The delay in removing this is so upgraded lm_sensors and libsensors
+       can get deployed.  (Removal causes minor changes in the sysfs layout,
+       notably the location of the adapter type name and parenting the i2c
+       client hardware directly from their controller.)
+Who:   Jean Delvare <khali@linux-fr.org>,
+       David Brownell <dbrownell@users.sourceforge.net>
+
+---------------------------
+
 What:  IPv4 only connection tracking/NAT/helpers
 When:  2.6.22
 Why:   The new layer 3 independant connection tracking replaces the old
index 737d610..17f5c2e 100644 (file)
@@ -46,6 +46,10 @@ Abstract Control Model (USB CDC ACM) specification.
 
        3Com USR ISDN Pro TA
 
+  Some cell phones also connect via USB. I know the following phones work:
+
+       SonyEricsson K800i
+
   Unfortunately many modems and most ISDN TAs use proprietary interfaces and
 thus won't work with this drivers. Check for ACM compliance before buying.
 
index dbdcaf6..5c86ed6 100644 (file)
@@ -52,6 +52,10 @@ APICs
                 apicmaintimer. Useful when your PIT timer is totally
                 broken.
 
+   disable_8254_timer / enable_8254_timer
+                Disable or enable interrupt 0 timer routing over the 8254 in
+                addition to over the IO-APIC. The kernel tries to set a sensible default.
+
 Early Console
 
    syntax: earlyprintk=vga
index 6ecb296..4ccc5fa 100644 (file)
@@ -2593,6 +2593,12 @@ P:       Adam Belay
 M:     ambx1@neo.rr.com
 S:     Maintained
 
+PNXxxxx I2C DRIVER
+P:     Vitaly Wool
+M:     vitalywool@gmail.com
+L:     i2c@lm-sensors.org
+S:     Maintained
+
 PPP PROTOCOL DRIVERS AND COMPRESSORS
 P:     Paul Mackerras
 M:     paulus@samba.org
index 0e9eee7..fb5b3ef 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 20
-EXTRAVERSION =-rc3
+EXTRAVERSION =-rc4
 NAME = Homicidal Dwarf Hamster
 
 # *DOCUMENTATION*
index 0d67a0a..0dfee81 100644 (file)
@@ -777,6 +777,47 @@ config CRASH_DUMP
           PHYSICAL_START.
          For more details see Documentation/kdump/kdump.txt
 
+config PHYSICAL_START
+       hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
+       default "0x100000"
+       help
+         This gives the physical address where the kernel is loaded.
+
+         If the kernel is not relocatable (CONFIG_RELOCATABLE=n) then
+         bzImage will decompress itself to above physical address and
+         run from there. Otherwise, bzImage will run from the address where
+         it has been loaded by the boot loader and will ignore above physical
+         address.
+
+         In normal kdump cases one does not have to set/change this option
+         as now bzImage can be compiled as a completely relocatable image
+         (CONFIG_RELOCATABLE=y) and be used to load and run from a different
+         address. This option is mainly useful for the folks who don't want
+         to use a bzImage for capturing the crash dump and want to use a
+         vmlinux instead. vmlinux is not relocatable hence a kernel needs
+         to be specifically compiled to run from a specific memory area
+         (normally a reserved region) and this option comes in handy.
+
+         So if you are using bzImage for capturing the crash dump, leave
+         the value here unchanged at 0x100000 and set CONFIG_RELOCATABLE=y.
+         Otherwise if you plan to use vmlinux for capturing the crash dump
+         change this value to start of the reserved region (Typically 16MB
+         0x1000000). In other words, it can be set based on the "X" value as
+         specified in the "crashkernel=YM@XM" command line boot parameter
+         passed to the panic-ed kernel. Typically this parameter is set as
+         crashkernel=64M@16M. Please take a look at
+         Documentation/kdump/kdump.txt for more details about crash dumps.
+
+         Usage of bzImage for capturing the crash dump is recommended as
+         one does not have to build two kernels. Same kernel can be used
+         as production kernel and capture kernel. Above option should have
+         gone away after relocatable bzImage support is introduced. But it
+         is present because there are users out there who continue to use
+         vmlinux for dump capture. This option should go away down the
+         line.
+
+         Don't change this unless you know what you are doing.
+
 config RELOCATABLE
        bool "Build a relocatable kernel(EXPERIMENTAL)"
        depends on EXPERIMENTAL
index 1b34c56..8689d62 100644 (file)
@@ -54,7 +54,7 @@ static struct cpu_dev __cpuinitdata default_cpu = {
        .c_init = default_init,
        .c_vendor = "Unknown",
 };
-static struct cpu_dev * this_cpu = &default_cpu;
+static struct cpu_dev * this_cpu __cpuinitdata = &default_cpu;
 
 static int __init cachesize_setup(char *str)
 {
index aef39be..300d9b3 100644 (file)
@@ -227,7 +227,7 @@ static struct {
        atomic_t count_start;
        atomic_t count_stop;
        unsigned long long values[NR_CPUS];
-} tsc __initdata = {
+} tsc __cpuinitdata = {
        .start_flag = ATOMIC_INIT(0),
        .count_start = ATOMIC_INIT(0),
        .count_stop = ATOMIC_INIT(0),
@@ -332,7 +332,7 @@ static void __init synchronize_tsc_bp(void)
                printk("passed.\n");
 }
 
-static void __init synchronize_tsc_ap(void)
+static void __cpuinit synchronize_tsc_ap(void)
 {
        int i;
 
index fcce0e6..2f1814c 100644 (file)
 
 .data
 
+/* We can free up trampoline after bootup if cpu hotplug is not supported. */
+#ifndef CONFIG_HOTPLUG_CPU
+.section ".init.data","aw",@progbits
+#endif
+
 .code16
 
 ENTRY(trampoline_data)
index 829698f..49802f1 100644 (file)
@@ -69,6 +69,11 @@ static void nvidia_bugs(void)
 
 static void ati_bugs(void)
 {
+       if (timer_over_8254 == 1) {
+               timer_over_8254 = 0;
+               printk(KERN_INFO
+               "ATI board detected. Disabling timer routing over 8254.\n");
+       }
 }
 
 static void intel_bugs(void)
index 2a1dcd5..d7bad90 100644 (file)
@@ -55,6 +55,10 @@ int sis_apic_bug; /* not actually supported, dummy for compile */
 
 static int no_timer_check;
 
+static int disable_timer_pin_1 __initdata;
+
+int timer_over_8254 __initdata = 1;
+
 /* Where if anywhere is the i8259 connect in external int mode */
 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 
@@ -350,6 +354,29 @@ static int __init disable_ioapic_setup(char *str)
 }
 early_param("noapic", disable_ioapic_setup);
 
+/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
+static int __init disable_timer_pin_setup(char *arg)
+{
+       disable_timer_pin_1 = 1;
+       return 1;
+}
+__setup("disable_timer_pin_1", disable_timer_pin_setup);
+
+static int __init setup_disable_8254_timer(char *s)
+{
+       timer_over_8254 = -1;
+       return 1;
+}
+static int __init setup_enable_8254_timer(char *s)
+{
+       timer_over_8254 = 2;
+       return 1;
+}
+
+__setup("disable_8254_timer", setup_disable_8254_timer);
+__setup("enable_8254_timer", setup_enable_8254_timer);
+
+
 /*
  * Find the IRQ entry number of a certain pin.
  */
@@ -1568,33 +1595,10 @@ static inline void unlock_ExtINT_logic(void)
  * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
  * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
  * fanatically on his truly buggy board.
+ *
+ * FIXME: really need to revamp this for modern platforms only.
  */
-
-static int try_apic_pin(int apic, int pin, char *msg)
-{
-       apic_printk(APIC_VERBOSE, KERN_INFO
-                   "..TIMER: trying IO-APIC=%d PIN=%d %s",
-                   apic, pin, msg);
-
-       /*
-        * Ok, does IRQ0 through the IOAPIC work?
-        */
-       if (!no_timer_check && timer_irq_works()) {
-               nmi_watchdog_default();
-               if (nmi_watchdog == NMI_IO_APIC) {
-                       disable_8259A_irq(0);
-                       setup_nmi();
-                       enable_8259A_irq(0);
-               }
-               return 1;
-       }
-       clear_IO_APIC_pin(apic, pin);
-       apic_printk(APIC_QUIET, KERN_ERR " .. failed\n");
-       return 0;
-}
-
-/* The function from hell */
-static void check_timer(void)
+static inline void check_timer(void)
 {
        int apic1, pin1, apic2, pin2;
        int vector;
@@ -1615,43 +1619,61 @@ static void check_timer(void)
         */
        apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
        init_8259A(1);
+       if (timer_over_8254 > 0)
+               enable_8259A_irq(0);
 
        pin1  = find_isa_irq_pin(0, mp_INT);
        apic1 = find_isa_irq_apic(0, mp_INT);
        pin2  = ioapic_i8259.pin;
        apic2 = ioapic_i8259.apic;
 
-       /* Do this first, otherwise we get double interrupts on ATI boards */
-       if ((pin1 != -1) && try_apic_pin(apic1, pin1,"with 8259 IRQ0 disabled"))
-               return;
+       apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
+               vector, apic1, pin1, apic2, pin2);
 
-       /* Now try again with IRQ0 8259A enabled.
-          Assumes timer is on IO-APIC 0 ?!? */
-       enable_8259A_irq(0);
-       unmask_IO_APIC_irq(0);
-       if (try_apic_pin(apic1, pin1, "with 8259 IRQ0 enabled"))
-               return;
-       disable_8259A_irq(0);
-
-       /* Always try pin0 and pin2 on APIC 0 to handle buggy timer overrides
-          on Nvidia boards */
-       if (!(apic1 == 0 && pin1 == 0) &&
-           try_apic_pin(0, 0, "fallback with 8259 IRQ0 disabled"))
-               return;
-       if (!(apic1 == 0 && pin1 == 2) &&
-           try_apic_pin(0, 2, "fallback with 8259 IRQ0 disabled"))
-               return;
+       if (pin1 != -1) {
+               /*
+                * Ok, does IRQ0 through the IOAPIC work?
+                */
+               unmask_IO_APIC_irq(0);
+               if (!no_timer_check && timer_irq_works()) {
+                       nmi_watchdog_default();
+                       if (nmi_watchdog == NMI_IO_APIC) {
+                               disable_8259A_irq(0);
+                               setup_nmi();
+                               enable_8259A_irq(0);
+                       }
+                       if (disable_timer_pin_1 > 0)
+                               clear_IO_APIC_pin(0, pin1);
+                       return;
+               }
+               clear_IO_APIC_pin(apic1, pin1);
+               apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
+                               "connected to IO-APIC\n");
+       }
 
-       /* Then try pure 8259A routing on the 8259 as reported by BIOS*/
-       enable_8259A_irq(0);
+       apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
+                               "through the 8259A ... ");
        if (pin2 != -1) {
+               apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
+                       apic2, pin2);
+               /*
+                * legacy devices should be connected to IO APIC #0
+                */
                setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
-               if (try_apic_pin(apic2,pin2,"8259A broadcast ExtINT from BIOS"))
+               if (timer_irq_works()) {
+                       apic_printk(APIC_VERBOSE," works.\n");
+                       nmi_watchdog_default();
+                       if (nmi_watchdog == NMI_IO_APIC) {
+                               setup_nmi();
+                       }
                        return;
+               }
+               /*
+                * Cleanup, just in case ...
+                */
+               clear_IO_APIC_pin(apic2, pin2);
        }
-
-       /* Tried all possibilities to go through the IO-APIC. Now come the
-          really cheesy fallbacks. */
+       apic_printk(APIC_VERBOSE," failed.\n");
 
        if (nmi_watchdog == NMI_IO_APIC) {
                printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
index 88aeccb..d9b651f 100644 (file)
@@ -321,13 +321,16 @@ static int set_lcd_status(struct backlight_device *bd)
 static unsigned long write_lcd(const char *buffer, unsigned long count)
 {
        int value;
-       int ret = count;
+       int ret;
 
        if (sscanf(buffer, " brightness : %i", &value) == 1 &&
-           value >= 0 && value < HCI_LCD_BRIGHTNESS_LEVELS)
+           value >= 0 && value < HCI_LCD_BRIGHTNESS_LEVELS) {
                ret = set_lcd(value);
-       else
+               if (ret == 0)
+                       ret = count;
+       } else {
                ret = -EINVAL;
+       }
        return ret;
 }
 
index b34e0a9..da21552 100644 (file)
@@ -381,7 +381,7 @@ config PATA_OPTI
          If unsure, say N.
 
 config PATA_OPTIDMA
-       tristate "OPTI FireStar PATA support (Veyr Experimental)"
+       tristate "OPTI FireStar PATA support (Very Experimental)"
        depends on PCI && EXPERIMENTAL
        help
          This option enables DMA/PIO support for the later OPTi
index 47082df..dfb3060 100644 (file)
@@ -25,7 +25,7 @@
 #include <linux/libata.h>
 
 #define DRV_NAME       "pata_hpt37x"
-#define DRV_VERSION    "0.5.1"
+#define DRV_VERSION    "0.5.2"
 
 struct hpt_clock {
        u8      xfer_speed;
@@ -416,7 +416,7 @@ static const char *bad_ata100_5[] = {
 
 static unsigned long hpt370_filter(const struct ata_port *ap, struct ata_device *adev, unsigned long mask)
 {
-       if (adev->class != ATA_DEV_ATA) {
+       if (adev->class == ATA_DEV_ATA) {
                if (hpt_dma_blacklisted(adev, "UDMA", bad_ata33))
                        mask &= ~ATA_MASK_UDMA;
                if (hpt_dma_blacklisted(adev, "UDMA100", bad_ata100_5))
@@ -749,7 +749,7 @@ static void hpt37x_bmdma_stop(struct ata_queued_cmd *qc)
 {
        struct ata_port *ap = qc->ap;
        struct pci_dev *pdev = to_pci_dev(ap->host->dev);
-       int mscreg = 0x50 + 2 * ap->port_no;
+       int mscreg = 0x50 + 4 * ap->port_no;
        u8 bwsr_stat, msc_stat;
 
        pci_read_config_byte(pdev, 0x6A, &bwsr_stat);
index 7c95c76..6246219 100644 (file)
@@ -765,47 +765,34 @@ static inline struct bio *pkt_get_list_first(struct bio **list_head, struct bio
  */
 static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc)
 {
-       char sense[SCSI_SENSE_BUFFERSIZE];
-       request_queue_t *q;
+       request_queue_t *q = bdev_get_queue(pd->bdev);
        struct request *rq;
-       DECLARE_COMPLETION_ONSTACK(wait);
-       int err = 0;
+       int ret = 0;
 
-       q = bdev_get_queue(pd->bdev);
+       rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
+                            WRITE : READ, __GFP_WAIT);
+
+       if (cgc->buflen) {
+               if (blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, __GFP_WAIT))
+                       goto out;
+       }
+
+       rq->cmd_len = COMMAND_SIZE(rq->cmd[0]);
+       memcpy(rq->cmd, cgc->cmd, CDROM_PACKET_SIZE);
+       if (sizeof(rq->cmd) > CDROM_PACKET_SIZE)
+               memset(rq->cmd + CDROM_PACKET_SIZE, 0, sizeof(rq->cmd) - CDROM_PACKET_SIZE);
 
-       rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? WRITE : READ,
-                            __GFP_WAIT);
-       rq->errors = 0;
-       rq->rq_disk = pd->bdev->bd_disk;
-       rq->bio = NULL;
-       rq->buffer = NULL;
        rq->timeout = 60*HZ;
-       rq->data = cgc->buffer;
-       rq->data_len = cgc->buflen;
-       rq->sense = sense;
-       memset(sense, 0, sizeof(sense));
-       rq->sense_len = 0;
        rq->cmd_type = REQ_TYPE_BLOCK_PC;
        rq->cmd_flags |= REQ_HARDBARRIER;
        if (cgc->quiet)
                rq->cmd_flags |= REQ_QUIET;
-       memcpy(rq->cmd, cgc->cmd, CDROM_PACKET_SIZE);
-       if (sizeof(rq->cmd) > CDROM_PACKET_SIZE)
-               memset(rq->cmd + CDROM_PACKET_SIZE, 0, sizeof(rq->cmd) - CDROM_PACKET_SIZE);
-       rq->cmd_len = COMMAND_SIZE(rq->cmd[0]);
-
-       rq->ref_count++;
-       rq->end_io_data = &wait;
-       rq->end_io = blk_end_sync_rq;
-       elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1);
-       generic_unplug_device(q);
-       wait_for_completion(&wait);
-
-       if (rq->errors)
-               err = -EIO;
 
+       blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0);
+       ret = rq->errors;
+out:
        blk_put_request(rq);
-       return err;
+       return ret;
 }
 
 /*
index 5eabe47..4333050 100644 (file)
@@ -606,9 +606,9 @@ static int iiDownloadAll(i2eBordStrPtr, loadHdrStrPtr, int, int);
 // code and returning.
 //
 #define COMPLETE(pB,code) \
-       if(1){ \
+       do { \
                 pB->i2eError = code; \
                 return (code == I2EE_GOOD);\
-       }
+       } while (0)
 
 #endif   // I2ELLIS_H
index 3ece692..5c9f67f 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/init.h>
 #include <linux/connector.h>
 #include <asm/atomic.h>
+#include <asm/unaligned.h>
 
 #include <linux/cn_proc.h>
 
@@ -60,7 +61,7 @@ void proc_fork_connector(struct task_struct *task)
        ev = (struct proc_event*)msg->data;
        get_seq(&msg->seq, &ev->cpu);
        ktime_get_ts(&ts); /* get high res monotonic timestamp */
-       ev->timestamp_ns = timespec_to_ns(&ts);
+       put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
        ev->what = PROC_EVENT_FORK;
        ev->event_data.fork.parent_pid = task->real_parent->pid;
        ev->event_data.fork.parent_tgid = task->real_parent->tgid;
@@ -88,7 +89,7 @@ void proc_exec_connector(struct task_struct *task)
        ev = (struct proc_event*)msg->data;
        get_seq(&msg->seq, &ev->cpu);
        ktime_get_ts(&ts); /* get high res monotonic timestamp */
-       ev->timestamp_ns = timespec_to_ns(&ts);
+       put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
        ev->what = PROC_EVENT_EXEC;
        ev->event_data.exec.process_pid = task->pid;
        ev->event_data.exec.process_tgid = task->tgid;
@@ -124,7 +125,7 @@ void proc_id_connector(struct task_struct *task, int which_id)
                return;
        get_seq(&msg->seq, &ev->cpu);
        ktime_get_ts(&ts); /* get high res monotonic timestamp */
-       ev->timestamp_ns = timespec_to_ns(&ts);
+       put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
 
        memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
        msg->ack = 0; /* not used */
@@ -146,7 +147,7 @@ void proc_exit_connector(struct task_struct *task)
        ev = (struct proc_event*)msg->data;
        get_seq(&msg->seq, &ev->cpu);
        ktime_get_ts(&ts); /* get high res monotonic timestamp */
-       ev->timestamp_ns = timespec_to_ns(&ts);
+       put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
        ev->what = PROC_EVENT_EXIT;
        ev->event_data.exit.process_pid = task->pid;
        ev->event_data.exit.process_tgid = task->tgid;
@@ -181,7 +182,7 @@ static void cn_proc_ack(int err, int rcvd_seq, int rcvd_ack)
        ev = (struct proc_event*)msg->data;
        msg->seq = rcvd_seq;
        ktime_get_ts(&ts); /* get high res monotonic timestamp */
-       ev->timestamp_ns = timespec_to_ns(&ts);
+       put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
        ev->cpu = -1;
        ev->what = PROC_EVENT_NONE;
        ev->event_data.ack.err = err;
index e1989f3..9367c4c 100644 (file)
@@ -564,13 +564,4 @@ config I2C_PNX
          This driver can also be built as a module.  If so, the module
          will be called i2c-pnx.
 
-config I2C_PNX_EARLY
-       bool "Early initialization for I2C on PNXxxxx"
-       depends on I2C_PNX=y
-       help
-         Under certain circumstances one may need to make sure I2C on PNXxxxx
-         is initialized earlier than some other driver that depends on it
-         (for instance, that might be USB in case of PNX4008). With this
-         option turned on you can guarantee that.
-
 endmenu
index bbc8e3a..4901736 100644 (file)
@@ -529,6 +529,8 @@ mv64xxx_i2c_probe(struct platform_device *pd)
        platform_set_drvdata(pd, drv_data);
        i2c_set_adapdata(&drv_data->adapter, drv_data);
 
+       mv64xxx_i2c_hw_init(drv_data);
+
        if (request_irq(drv_data->irq, mv64xxx_i2c_intr, 0,
                        MV64XXX_I2C_CTLR_NAME, drv_data)) {
                dev_err(&drv_data->adapter.dev,
@@ -542,8 +544,6 @@ mv64xxx_i2c_probe(struct platform_device *pd)
                goto exit_free_irq;
        }
 
-       mv64xxx_i2c_hw_init(drv_data);
-
        return 0;
 
        exit_free_irq:
index de0bca7..17376fe 100644 (file)
@@ -305,8 +305,7 @@ static int i2c_pnx_master_rcv(struct i2c_adapter *adap)
        return 0;
 }
 
-static irqreturn_t
-i2c_pnx_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+static irqreturn_t i2c_pnx_interrupt(int irq, void *dev_id)
 {
        u32 stat, ctl;
        struct i2c_adapter *adap = dev_id;
@@ -699,10 +698,6 @@ MODULE_AUTHOR("Vitaly Wool, Dennis Kovalev <source@mvista.com>");
 MODULE_DESCRIPTION("I2C driver for Philips IP3204-based I2C busses");
 MODULE_LICENSE("GPL");
 
-#ifdef CONFIG_I2C_PNX_EARLY
 /* We need to make sure I2C is initialized before USB */
 subsys_initcall(i2c_adap_pnx_init);
-#else
-mudule_init(i2c_adap_pnx_init);
-#endif
 module_exit(i2c_adap_pnx_exit);
index 420377c..3fcb646 100644 (file)
@@ -209,6 +209,7 @@ m41t00_set(void *arg)
        buf[m41t00_chip->hour] = (buf[m41t00_chip->hour] & ~0x3f) | (hour& 0x3f);
        buf[m41t00_chip->day] = (buf[m41t00_chip->day] & ~0x3f) | (day & 0x3f);
        buf[m41t00_chip->mon] = (buf[m41t00_chip->mon] & ~0x1f) | (mon & 0x1f);
+       buf[m41t00_chip->year] = year;
 
        if (i2c_master_send(save_client, wbuf, 9) < 0)
                dev_err(&save_client->dev, "m41t00_set: Write error\n");
index 3e31f1d..b05378a 100644 (file)
@@ -95,16 +95,32 @@ struct device_driver i2c_adapter_driver = {
        .bus = &i2c_bus_type,
 };
 
+/* ------------------------------------------------------------------------- */
+
+/* I2C bus adapters -- one roots each I2C or SMBUS segment */
+
 static void i2c_adapter_class_dev_release(struct class_device *dev)
 {
        struct i2c_adapter *adap = class_dev_to_i2c_adapter(dev);
        complete(&adap->class_dev_released);
 }
 
+static ssize_t i2c_adapter_show_name(struct class_device *cdev, char *buf)
+{
+       struct i2c_adapter *adap = class_dev_to_i2c_adapter(cdev);
+       return sprintf(buf, "%s\n", adap->name);
+}
+
+static struct class_device_attribute i2c_adapter_attrs[] = {
+       __ATTR(name, S_IRUGO, i2c_adapter_show_name, NULL),
+       { },
+};
+
 struct class i2c_adapter_class = {
-       .owner =        THIS_MODULE,
-       .name =         "i2c-adapter",
-       .release =      &i2c_adapter_class_dev_release,
+       .owner                  = THIS_MODULE,
+       .name                   = "i2c-adapter",
+       .class_dev_attrs        = i2c_adapter_attrs,
+       .release                = &i2c_adapter_class_dev_release,
 };
 
 static ssize_t show_adapter_name(struct device *dev, struct device_attribute *attr, char *buf)
@@ -175,8 +191,12 @@ int i2c_add_adapter(struct i2c_adapter *adap)
         * If the parent pointer is not set up,
         * we add this adapter to the host bus.
         */
-       if (adap->dev.parent == NULL)
+       if (adap->dev.parent == NULL) {
                adap->dev.parent = &platform_bus;
+               printk(KERN_WARNING "**WARNING** I2C adapter driver [%s] "
+                      "forgot to specify physical device; fix it!\n",
+                      adap->name);
+       }
        sprintf(adap->dev.bus_id, "i2c-%d", adap->nr);
        adap->dev.driver = &i2c_adapter_driver;
        adap->dev.release = &i2c_adapter_dev_release;
index ffdffb6..524e65d 100644 (file)
@@ -46,6 +46,8 @@ static atiixp_ide_timing mdma_timing[] = {
 
 static int save_mdma_mode[4];
 
+static DEFINE_SPINLOCK(atiixp_lock);
+
 /**
  *     atiixp_ratemask         -       compute rate mask for ATIIXP IDE
  *     @drive: IDE drive to compute for
@@ -105,7 +107,7 @@ static int atiixp_ide_dma_host_on(ide_drive_t *drive)
        unsigned long flags;
        u16 tmp16;
 
-       spin_lock_irqsave(&ide_lock, flags);
+       spin_lock_irqsave(&atiixp_lock, flags);
 
        pci_read_config_word(dev, ATIIXP_IDE_UDMA_CONTROL, &tmp16);
        if (save_mdma_mode[drive->dn])
@@ -114,7 +116,7 @@ static int atiixp_ide_dma_host_on(ide_drive_t *drive)
                tmp16 |= (1 << drive->dn);
        pci_write_config_word(dev, ATIIXP_IDE_UDMA_CONTROL, tmp16);
 
-       spin_unlock_irqrestore(&ide_lock, flags);
+       spin_unlock_irqrestore(&atiixp_lock, flags);
 
        return __ide_dma_host_on(drive);
 }
@@ -125,13 +127,13 @@ static int atiixp_ide_dma_host_off(ide_drive_t *drive)
        unsigned long flags;
        u16 tmp16;
 
-       spin_lock_irqsave(&ide_lock, flags);
+       spin_lock_irqsave(&atiixp_lock, flags);
 
        pci_read_config_word(dev, ATIIXP_IDE_UDMA_CONTROL, &tmp16);
        tmp16 &= ~(1 << drive->dn);
        pci_write_config_word(dev, ATIIXP_IDE_UDMA_CONTROL, tmp16);
 
-       spin_unlock_irqrestore(&ide_lock, flags);
+       spin_unlock_irqrestore(&atiixp_lock, flags);
 
        return __ide_dma_host_off(drive);
 }
@@ -152,7 +154,7 @@ static void atiixp_tuneproc(ide_drive_t *drive, u8 pio)
        u32 pio_timing_data;
        u16 pio_mode_data;
 
-       spin_lock_irqsave(&ide_lock, flags);
+       spin_lock_irqsave(&atiixp_lock, flags);
 
        pci_read_config_word(dev, ATIIXP_IDE_PIO_MODE, &pio_mode_data);
        pio_mode_data &= ~(0x07 << (drive->dn * 4));
@@ -165,7 +167,7 @@ static void atiixp_tuneproc(ide_drive_t *drive, u8 pio)
                 (pio_timing[pio].command_width << (timing_shift + 4));
        pci_write_config_dword(dev, ATIIXP_IDE_PIO_TIMING, pio_timing_data);
 
-       spin_unlock_irqrestore(&ide_lock, flags);
+       spin_unlock_irqrestore(&atiixp_lock, flags);
 }
 
 /**
@@ -189,7 +191,7 @@ static int atiixp_speedproc(ide_drive_t *drive, u8 xferspeed)
 
        speed = ide_rate_filter(atiixp_ratemask(drive), xferspeed);
 
-       spin_lock_irqsave(&ide_lock, flags);
+       spin_lock_irqsave(&atiixp_lock, flags);
 
        save_mdma_mode[drive->dn] = 0;
        if (speed >= XFER_UDMA_0) {
@@ -208,7 +210,7 @@ static int atiixp_speedproc(ide_drive_t *drive, u8 xferspeed)
                }
        }
 
-       spin_unlock_irqrestore(&ide_lock, flags);
+       spin_unlock_irqrestore(&atiixp_lock, flags);
 
        if (speed >= XFER_SW_DMA_0)
                pio = atiixp_dma_2_pio(speed);
index 61f1a96..381cc6f 100644 (file)
@@ -123,7 +123,7 @@ struct via82cxxx_dev
 static void via_set_speed(ide_hwif_t *hwif, u8 dn, struct ide_timing *timing)
 {
        struct pci_dev *dev = hwif->pci_dev;
-       struct via82cxxx_dev *vdev = ide_get_hwifdata(hwif);
+       struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev);
        u8 t;
 
        if (~vdev->via_config->flags & VIA_BAD_AST) {
@@ -162,7 +162,7 @@ static void via_set_speed(ide_hwif_t *hwif, u8 dn, struct ide_timing *timing)
 static int via_set_drive(ide_drive_t *drive, u8 speed)
 {
        ide_drive_t *peer = HWIF(drive)->drives + (~drive->dn & 1);
-       struct via82cxxx_dev *vdev = ide_get_hwifdata(drive->hwif);
+       struct via82cxxx_dev *vdev = pci_get_drvdata(drive->hwif->pci_dev);
        struct ide_timing t, p;
        unsigned int T, UT;
 
@@ -225,7 +225,7 @@ static void via82cxxx_tune_drive(ide_drive_t *drive, u8 pio)
 static int via82cxxx_ide_dma_check (ide_drive_t *drive)
 {
        ide_hwif_t *hwif = HWIF(drive);
-       struct via82cxxx_dev *vdev = ide_get_hwifdata(hwif);
+       struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev);
        u16 w80 = hwif->udma_four;
 
        u16 speed = ide_find_best_mode(drive,
@@ -262,6 +262,53 @@ static struct via_isa_bridge *via_config_find(struct pci_dev **isa)
        return via_config;
 }
 
+/*
+ * Check and handle 80-wire cable presence
+ */
+static void __devinit via_cable_detect(struct via82cxxx_dev *vdev, u32 u)
+{
+       int i;
+
+       switch (vdev->via_config->flags & VIA_UDMA) {
+               case VIA_UDMA_66:
+                       for (i = 24; i >= 0; i -= 8)
+                               if (((u >> (i & 16)) & 8) &&
+                                   ((u >> i) & 0x20) &&
+                                    (((u >> i) & 7) < 2)) {
+                                       /*
+                                        * 2x PCI clock and
+                                        * UDMA w/ < 3T/cycle
+                                        */
+                                       vdev->via_80w |= (1 << (1 - (i >> 4)));
+                               }
+                       break;
+
+               case VIA_UDMA_100:
+                       for (i = 24; i >= 0; i -= 8)
+                               if (((u >> i) & 0x10) ||
+                                   (((u >> i) & 0x20) &&
+                                    (((u >> i) & 7) < 4))) {
+                                       /* BIOS 80-wire bit or
+                                        * UDMA w/ < 60ns/cycle
+                                        */
+                                       vdev->via_80w |= (1 << (1 - (i >> 4)));
+                               }
+                       break;
+
+               case VIA_UDMA_133:
+                       for (i = 24; i >= 0; i -= 8)
+                               if (((u >> i) & 0x10) ||
+                                   (((u >> i) & 0x20) &&
+                                    (((u >> i) & 7) < 6))) {
+                                       /* BIOS 80-wire bit or
+                                        * UDMA w/ < 60ns/cycle
+                                        */
+                                       vdev->via_80w |= (1 << (1 - (i >> 4)));
+                               }
+                       break;
+       }
+}
+
 /**
  *     init_chipset_via82cxxx  -       initialization handler
  *     @dev: PCI device
@@ -274,14 +321,22 @@ static struct via_isa_bridge *via_config_find(struct pci_dev **isa)
 static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev, const char *name)
 {
        struct pci_dev *isa = NULL;
+       struct via82cxxx_dev *vdev;
        struct via_isa_bridge *via_config;
        u8 t, v;
-       unsigned int u;
+       u32 u;
+
+       vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+       if (!vdev) {
+               printk(KERN_ERR "VP_IDE: out of memory :(\n");
+               return -ENOMEM;
+       }
+       pci_set_drvdata(dev, vdev);
 
        /*
         * Find the ISA bridge to see how good the IDE is.
         */
-       via_config = via_config_find(&isa);
+       vdev->via_config = via_config = via_config_find(&isa);
 
        /* We checked this earlier so if it fails here deeep badness
           is involved */
@@ -289,16 +344,17 @@ static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev, const
        BUG_ON(!via_config->id);
 
        /*
-        * Setup or disable Clk66 if appropriate
+        * Detect cable and configure Clk66
         */
+       pci_read_config_dword(dev, VIA_UDMA_TIMING, &u);
+
+       via_cable_detect(vdev, u);
 
        if ((via_config->flags & VIA_UDMA) == VIA_UDMA_66) {
                /* Enable Clk66 */
-               pci_read_config_dword(dev, VIA_UDMA_TIMING, &u);
                pci_write_config_dword(dev, VIA_UDMA_TIMING, u|0x80008);
        } else if (via_config->flags & VIA_BAD_CLK66) {
                /* Would cause trouble on 596a and 686 */
-               pci_read_config_dword(dev, VIA_UDMA_TIMING, &u);
                pci_write_config_dword(dev, VIA_UDMA_TIMING, u & ~0x80008);
        }
 
@@ -367,75 +423,11 @@ static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev, const
        return 0;
 }
 
-/*
- * Check and handle 80-wire cable presence
- */
-static void __devinit via_cable_detect(struct pci_dev *dev, struct via82cxxx_dev *vdev)
-{
-       unsigned int u;
-       int i;
-       pci_read_config_dword(dev, VIA_UDMA_TIMING, &u);
-
-       switch (vdev->via_config->flags & VIA_UDMA) {
-
-               case VIA_UDMA_66:
-                       for (i = 24; i >= 0; i -= 8)
-                               if (((u >> (i & 16)) & 8) &&
-                                   ((u >> i) & 0x20) &&
-                                    (((u >> i) & 7) < 2)) {
-                                       /*
-                                        * 2x PCI clock and
-                                        * UDMA w/ < 3T/cycle
-                                        */
-                                       vdev->via_80w |= (1 << (1 - (i >> 4)));
-                               }
-                       break;
-
-               case VIA_UDMA_100:
-                       for (i = 24; i >= 0; i -= 8)
-                               if (((u >> i) & 0x10) ||
-                                   (((u >> i) & 0x20) &&
-                                    (((u >> i) & 7) < 4))) {
-                                       /* BIOS 80-wire bit or
-                                        * UDMA w/ < 60ns/cycle
-                                        */
-                                       vdev->via_80w |= (1 << (1 - (i >> 4)));
-                               }
-                       break;
-
-               case VIA_UDMA_133:
-                       for (i = 24; i >= 0; i -= 8)
-                               if (((u >> i) & 0x10) ||
-                                   (((u >> i) & 0x20) &&
-                                    (((u >> i) & 7) < 6))) {
-                                       /* BIOS 80-wire bit or
-                                        * UDMA w/ < 60ns/cycle
-                                        */
-                                       vdev->via_80w |= (1 << (1 - (i >> 4)));
-                               }
-                       break;
-
-       }
-}
-
 static void __devinit init_hwif_via82cxxx(ide_hwif_t *hwif)
 {
-       struct via82cxxx_dev *vdev = kmalloc(sizeof(struct via82cxxx_dev),
-               GFP_KERNEL);
-       struct pci_dev *isa = NULL;
+       struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev);
        int i;
 
-       if (vdev == NULL) {
-               printk(KERN_ERR "VP_IDE: out of memory :(\n");
-               return;
-       }
-
-       memset(vdev, 0, sizeof(struct via82cxxx_dev));
-       ide_set_hwifdata(hwif, vdev);
-
-       vdev->via_config = via_config_find(&isa);
-       via_cable_detect(hwif->pci_dev, vdev);
-
        hwif->autodma = 0;
 
        hwif->tuneproc = &via82cxxx_tune_drive;
index 100df6f..91e0c75 100644 (file)
@@ -52,6 +52,8 @@
 #define KVM_MAX_VCPUS 1
 #define KVM_MEMORY_SLOTS 4
 #define KVM_NUM_MMU_PAGES 256
+#define KVM_MIN_FREE_MMU_PAGES 5
+#define KVM_REFILL_PAGES 25
 
 #define FX_IMAGE_SIZE 512
 #define FX_IMAGE_ALIGN 16
@@ -89,14 +91,54 @@ typedef unsigned long  hva_t;
 typedef u64            hpa_t;
 typedef unsigned long  hfn_t;
 
+#define NR_PTE_CHAIN_ENTRIES 5
+
+struct kvm_pte_chain { /* holds up to NR_PTE_CHAIN_ENTRIES parent pte pointers */
+       u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
+       struct hlist_node link; /* presumably chained on kvm_mmu_page.parent_ptes - confirm */
+};
+
+/*
+ * kvm_mmu_page_role, below, is defined as:
+ *
+ *   bits 0:3 - total guest paging levels (2-4, or zero for real mode)
+ *   bits 4:7 - page table level for this shadow (1-4)
+ *   bits 8:9 - page table quadrant for 2-level guests
+ *   bit   16 - "metaphysical" - gfn is not a real page (huge page/real mode)
+ */
+union kvm_mmu_page_role {
+       unsigned word; /* overlays every field below as a single integer */
+       struct {
+               unsigned glevels : 4;
+               unsigned level : 4;
+               unsigned quadrant : 2;
+               unsigned pad_for_nice_hex_output : 6; /* unused filler before metaphysical */
+               unsigned metaphysical : 1;
+       };
+};
+
 struct kvm_mmu_page {
        struct list_head link;
+       struct hlist_node hash_link;
+
+       /*
+        * The following two entries are used to key the shadow page in the
+        * hash table.
+        */
+       gfn_t gfn;
+       union kvm_mmu_page_role role;
+
        hpa_t page_hpa;
        unsigned long slot_bitmap; /* One bit set per slot which has memory
                                    * in this shadow page.
                                    */
        int global;              /* Set if all ptes in this page are global */
-       u64 *parent_pte;
+       int multimapped;         /* More than one parent_pte? */
+       int root_count;          /* Currently serving as active root */
+       union {
+               u64 *parent_pte;               /* !multimapped */
+               struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
+       };
 };
 
 struct vmcs {
@@ -117,14 +159,26 @@ struct kvm_vcpu;
 struct kvm_mmu {
        void (*new_cr3)(struct kvm_vcpu *vcpu);
        int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
-       void (*inval_page)(struct kvm_vcpu *vcpu, gva_t gva);
        void (*free)(struct kvm_vcpu *vcpu);
        gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
        hpa_t root_hpa;
        int root_level;
        int shadow_root_level;
+
+       u64 *pae_root;
+};
+
+#define KVM_NR_MEM_OBJS 20
+
+struct kvm_mmu_memory_cache { /* fixed-size stack of preallocated objects */
+       int nobjs; /* number of valid pointers in objects[], used as the stack top */
+       void *objects[KVM_NR_MEM_OBJS];
 };
 
+/*
+ * We don't want allocation failures within the mmu code, so we preallocate
+ * enough memory for a single page fault in a cache.
+ */
 struct kvm_guest_debug {
        int enabled;
        unsigned long bp[4];
@@ -173,6 +227,7 @@ struct kvm_vcpu {
        struct mutex mutex;
        int   cpu;
        int   launched;
+       int interrupt_window_open;
        unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
 #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
        unsigned long irq_pending[NR_IRQ_WORDS];
@@ -184,6 +239,7 @@ struct kvm_vcpu {
        unsigned long cr3;
        unsigned long cr4;
        unsigned long cr8;
+       u64 pdptrs[4]; /* pae */
        u64 shadow_efer;
        u64 apic_base;
        int nmsrs;
@@ -194,6 +250,12 @@ struct kvm_vcpu {
        struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES];
        struct kvm_mmu mmu;
 
+       struct kvm_mmu_memory_cache mmu_pte_chain_cache;
+       struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
+
+       gfn_t last_pt_write_gfn;
+       int   last_pt_write_count;
+
        struct kvm_guest_debug guest_debug;
 
        char fx_buf[FX_BUF_SIZE];
@@ -231,10 +293,16 @@ struct kvm {
        spinlock_t lock; /* protects everything except vcpus */
        int nmemslots;
        struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS];
+       /*
+        * Hash table of struct kvm_mmu_page.
+        */
        struct list_head active_mmu_pages;
+       int n_free_mmu_pages;
+       struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
        struct kvm_vcpu vcpus[KVM_MAX_VCPUS];
        int memory_config_version;
        int busy;
+       unsigned long rmap_overflow;
 };
 
 struct kvm_stat {
@@ -247,6 +315,9 @@ struct kvm_stat {
        u32 io_exits;
        u32 mmio_exits;
        u32 signal_exits;
+       u32 irq_window_exits;
+       u32 halt_exits;
+       u32 request_irq_exits;
        u32 irq_exits;
 };
 
@@ -279,6 +350,7 @@ struct kvm_arch_ops {
        void (*set_segment)(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
        void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
+       void (*decache_cr0_cr4_guest_bits)(struct kvm_vcpu *vcpu);
        void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
        void (*set_cr0_no_modeswitch)(struct kvm_vcpu *vcpu,
                                      unsigned long cr0);
@@ -323,7 +395,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot);
 
 hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa);
 #define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
@@ -396,6 +468,19 @@ int kvm_write_guest(struct kvm_vcpu *vcpu,
 
 unsigned long segment_base(u16 selector);
 
+void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes);
+void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes);
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
+void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
+
+static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
+                                    u32 error_code)
+{      /* Hand a guest page fault to vcpu->mmu.page_fault and return its result. */
+       if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
+               kvm_mmu_free_some_pages(vcpu); /* free count below the minimum: ask the mmu to free some */
+       return vcpu->mmu.page_fault(vcpu, gva, error_code);
+}
+
 static inline struct page *_gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
        struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
@@ -541,19 +626,4 @@ static inline u32 get_rdx_init_val(void)
 #define TSS_REDIRECTION_SIZE (256 / 8)
 #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
 
-#ifdef CONFIG_X86_64
-
-/*
- * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.  Therefore
- * we need to allocate shadow page tables in the first 4GB of memory, which
- * happens to fit the DMA32 zone.
- */
-#define GFP_KVM_MMU (GFP_KERNEL | __GFP_DMA32)
-
-#else
-
-#define GFP_KVM_MMU GFP_KERNEL
-
-#endif
-
 #endif
index ce7fe64..67c1154 100644 (file)
@@ -58,6 +58,9 @@ static struct kvm_stats_debugfs_item {
        { "io_exits", &kvm_stat.io_exits },
        { "mmio_exits", &kvm_stat.mmio_exits },
        { "signal_exits", &kvm_stat.signal_exits },
+       { "irq_window", &kvm_stat.irq_window_exits },
+       { "halt_exits", &kvm_stat.halt_exits },
+       { "request_irq", &kvm_stat.request_irq_exits },
        { "irq_exits", &kvm_stat.irq_exits },
        { 0, 0 }
 };
@@ -227,6 +230,7 @@ static int kvm_dev_open(struct inode *inode, struct file *filp)
                struct kvm_vcpu *vcpu = &kvm->vcpus[i];
 
                mutex_init(&vcpu->mutex);
+               vcpu->kvm = kvm;
                vcpu->mmu.root_hpa = INVALID_PAGE;
                INIT_LIST_HEAD(&vcpu->free_pages);
        }
@@ -268,8 +272,8 @@ static void kvm_free_physmem(struct kvm *kvm)
 
 static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
 {
-       kvm_arch_ops->vcpu_free(vcpu);
        kvm_mmu_destroy(vcpu);
+       kvm_arch_ops->vcpu_free(vcpu);
 }
 
 static void kvm_free_vcpus(struct kvm *kvm)
@@ -295,14 +299,17 @@ static void inject_gp(struct kvm_vcpu *vcpu)
        kvm_arch_ops->inject_gp(vcpu, 0);
 }
 
-static int pdptrs_have_reserved_bits_set(struct kvm_vcpu *vcpu,
-                                        unsigned long cr3)
+/*
+ * Load the pae pdptrs.  Return true if they are all valid.
+ */
+static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
        gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
-       unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5;
+       unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
        int i;
        u64 pdpte;
        u64 *pdpt;
+       int ret;
        struct kvm_memory_slot *memslot;
 
        spin_lock(&vcpu->kvm->lock);
@@ -310,16 +317,23 @@ static int pdptrs_have_reserved_bits_set(struct kvm_vcpu *vcpu,
        /* FIXME: !memslot - emulate? 0xff? */
        pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0);
 
+       ret = 1;
        for (i = 0; i < 4; ++i) {
                pdpte = pdpt[offset + i];
-               if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
-                       break;
+               if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) {
+                       ret = 0;
+                       goto out;
+               }
        }
 
+       for (i = 0; i < 4; ++i)
+               vcpu->pdptrs[i] = pdpt[offset + i];
+
+out:
        kunmap_atomic(pdpt, KM_USER0);
        spin_unlock(&vcpu->kvm->lock);
 
-       return i != 4;
+       return ret;
 }
 
 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
@@ -365,8 +379,7 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
                        }
                } else
 #endif
-               if (is_pae(vcpu) &&
-                           pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
+               if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
                        printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
                               "reserved bits\n");
                        inject_gp(vcpu);
@@ -387,6 +400,7 @@ EXPORT_SYMBOL_GPL(set_cr0);
 
 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
+       kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
        set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
 }
 EXPORT_SYMBOL_GPL(lmsw);
@@ -407,7 +421,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                        return;
                }
        } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
-                  && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
+                  && !load_pdptrs(vcpu, vcpu->cr3)) {
                printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
                inject_gp(vcpu);
        }
@@ -439,7 +453,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                        return;
                }
                if (is_paging(vcpu) && is_pae(vcpu) &&
-                   pdptrs_have_reserved_bits_set(vcpu, cr3)) {
+                   !load_pdptrs(vcpu, cr3)) {
                        printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
                               "reserved bits\n");
                        inject_gp(vcpu);
@@ -449,7 +463,19 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
        vcpu->cr3 = cr3;
        spin_lock(&vcpu->kvm->lock);
-       vcpu->mmu.new_cr3(vcpu);
+       /*
+        * Does the new cr3 value map to physical memory? (Note, we
+        * catch an invalid cr3 even in real-mode, because it would
+        * cause trouble later on when we turn on paging anyway.)
+        *
+        * A real CPU would silently accept an invalid cr3 and would
+        * attempt to use it - with largely undefined (and often hard
+        * to debug) behavior on the guest side.
+        */
+       if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
+               inject_gp(vcpu);
+       else
+               vcpu->mmu.new_cr3(vcpu);
        spin_unlock(&vcpu->kvm->lock);
 }
 EXPORT_SYMBOL_GPL(set_cr3);
@@ -517,7 +543,6 @@ static int kvm_dev_ioctl_create_vcpu(struct kvm *kvm, int n)
        vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
 
        vcpu->cpu = -1;  /* First load will set up TR */
-       vcpu->kvm = kvm;
        r = kvm_arch_ops->vcpu_create(vcpu);
        if (r < 0)
                goto out_free_vcpus;
@@ -634,6 +659,7 @@ raced:
                                                     | __GFP_ZERO);
                        if (!new.phys_mem[i])
                                goto out_free;
+                       new.phys_mem[i]->private = 0;
                }
        }
 
@@ -688,6 +714,13 @@ out:
        return r;
 }
 
+static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot)
+{      /* Call kvm_mmu_slot_remove_write_access() for slot with kvm->lock held. */
+       spin_lock(&vcpu->kvm->lock);
+       kvm_mmu_slot_remove_write_access(vcpu, slot);
+       spin_unlock(&vcpu->kvm->lock);
+}
+
 /*
  * Get (and clear) the dirty memory log for a memory slot.
  */
@@ -697,6 +730,7 @@ static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm,
        struct kvm_memory_slot *memslot;
        int r, i;
        int n;
+       int cleared;
        unsigned long any = 0;
 
        spin_lock(&kvm->lock);
@@ -727,15 +761,17 @@ static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm,
 
 
        if (any) {
-               spin_lock(&kvm->lock);
-               kvm_mmu_slot_remove_write_access(kvm, log->slot);
-               spin_unlock(&kvm->lock);
-               memset(memslot->dirty_bitmap, 0, n);
+               cleared = 0;
                for (i = 0; i < KVM_MAX_VCPUS; ++i) {
                        struct kvm_vcpu *vcpu = vcpu_load(kvm, i);
 
                        if (!vcpu)
                                continue;
+                       if (!cleared) {
+                               do_remove_write_access(vcpu, log->slot);
+                               memset(memslot->dirty_bitmap, 0, n);
+                               cleared = 1;
+                       }
                        kvm_arch_ops->tlb_flush(vcpu);
                        vcpu_put(vcpu);
                }
@@ -863,6 +899,27 @@ static int emulator_read_emulated(unsigned long addr,
        }
 }
 
+static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+                              unsigned long val, int bytes)
+{      /* Store val at guest-physical gpa; returns 1 when written, 0 when not. */
+       struct kvm_memory_slot *m;
+       struct page *page;
+       void *virt;
+
+       if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
+               return 0; /* first and last byte fall in different pages */
+       m = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT);
+       if (!m)
+               return 0; /* no memory slot found for this gpa */
+       page = gfn_to_page(m, gpa >> PAGE_SHIFT);
+       kvm_mmu_pre_write(vcpu, gpa, bytes); /* mmu hook before the store */
+       virt = kmap_atomic(page, KM_USER0);
+       memcpy(virt + offset_in_page(gpa), &val, bytes); /* NOTE(review): assumes bytes <= sizeof(val) - confirm */
+       kunmap_atomic(virt, KM_USER0);
+       kvm_mmu_post_write(vcpu, gpa, bytes); /* mmu hook after the store */
+       return 1;
+}
+
 static int emulator_write_emulated(unsigned long addr,
                                   unsigned long val,
                                   unsigned int bytes,
@@ -874,6 +931,9 @@ static int emulator_write_emulated(unsigned long addr,
        if (gpa == UNMAPPED_GVA)
                return X86EMUL_PROPAGATE_FAULT;
 
+       if (emulator_write_phys(vcpu, gpa, val, bytes))
+               return X86EMUL_CONTINUE;
+
        vcpu->mmio_needed = 1;
        vcpu->mmio_phys_addr = gpa;
        vcpu->mmio_size = bytes;
@@ -898,6 +958,30 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
        return emulator_write_emulated(addr, new, bytes, ctxt);
 }
 
+#ifdef CONFIG_X86_32
+
+static int emulator_cmpxchg8b_emulated(unsigned long addr,
+                                      unsigned long old_lo,
+                                      unsigned long old_hi,
+                                      unsigned long new_lo,
+                                      unsigned long new_hi,
+                                      struct x86_emulate_ctxt *ctxt)
+{      /* Done as two 4-byte emulated writes; old_lo/old_hi are never compared. */
+       static int reported; /* lets the warning below print only once */
+       int r;
+
+       if (!reported) {
+               reported = 1;
+               printk(KERN_WARNING "kvm: emulating exchange8b as write\n");
+       }
+       r = emulator_write_emulated(addr, new_lo, 4, ctxt); /* low half first */
+       if (r != X86EMUL_CONTINUE)
+               return r; /* high half is not written if the low half did not complete */
+       return emulator_write_emulated(addr+4, new_hi, 4, ctxt);
+}
+
+#endif
+
 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
 {
        return kvm_arch_ops->get_segment_base(vcpu, seg);
@@ -905,18 +989,15 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
 
 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
 {
-       spin_lock(&vcpu->kvm->lock);
-       vcpu->mmu.inval_page(vcpu, address);
-       spin_unlock(&vcpu->kvm->lock);
-       kvm_arch_ops->invlpg(vcpu, address);
        return X86EMUL_CONTINUE;
 }
 
 int emulate_clts(struct kvm_vcpu *vcpu)
 {
-       unsigned long cr0 = vcpu->cr0;
+       unsigned long cr0;
 
-       cr0 &= ~CR0_TS_MASK;
+       kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
+       cr0 = vcpu->cr0 & ~CR0_TS_MASK;
        kvm_arch_ops->set_cr0(vcpu, cr0);
        return X86EMUL_CONTINUE;
 }
@@ -975,6 +1056,9 @@ struct x86_emulate_ops emulate_ops = {
        .read_emulated       = emulator_read_emulated,
        .write_emulated      = emulator_write_emulated,
        .cmpxchg_emulated    = emulator_cmpxchg_emulated,
+#ifdef CONFIG_X86_32
+       .cmpxchg8b_emulated  = emulator_cmpxchg8b_emulated,
+#endif
 };
 
 int emulate_instruction(struct kvm_vcpu *vcpu,
@@ -1024,6 +1108,8 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
        }
 
        if (r) {
+               if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+                       return EMULATE_DONE;
                if (!vcpu->mmio_needed) {
                        report_emulation_failure(&emulate_ctxt);
                        return EMULATE_FAIL;
@@ -1069,6 +1155,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
 
 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
 {
+       kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
        switch (cr) {
        case 0:
                return vcpu->cr0;
@@ -1403,6 +1490,7 @@ static int kvm_dev_ioctl_get_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
        sregs->gdt.limit = dt.limit;
        sregs->gdt.base = dt.base;
 
+       kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
        sregs->cr0 = vcpu->cr0;
        sregs->cr2 = vcpu->cr2;
        sregs->cr3 = vcpu->cr3;
@@ -1467,11 +1555,15 @@ static int kvm_dev_ioctl_set_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
 #endif
        vcpu->apic_base = sregs->apic_base;
 
+       kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
+
        mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
        kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0);
 
        mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
        kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
+       if (!is_long_mode(vcpu) && is_pae(vcpu))
+               load_pdptrs(vcpu, vcpu->cr3);
 
        if (mmu_reset_needed)
                kvm_mmu_reset_context(vcpu);
@@ -1693,12 +1785,12 @@ static long kvm_dev_ioctl(struct file *filp,
                if (copy_from_user(&kvm_run, (void *)arg, sizeof kvm_run))
                        goto out;
                r = kvm_dev_ioctl_run(kvm, &kvm_run);
-               if (r < 0)
+               if (r < 0 &&  r != -EINTR)
                        goto out;
-               r = -EFAULT;
-               if (copy_to_user((void *)arg, &kvm_run, sizeof kvm_run))
+               if (copy_to_user((void *)arg, &kvm_run, sizeof kvm_run)) {
+                       r = -EFAULT;
                        goto out;
-               r = 0;
+               }
                break;
        }
        case KVM_GET_REGS: {
@@ -1842,6 +1934,7 @@ static long kvm_dev_ioctl(struct file *filp,
                                 num_msrs_to_save * sizeof(u32)))
                        goto out;
                r = 0;
+               break;
        }
        default:
                ;
@@ -1944,17 +2037,17 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
                return -EEXIST;
        }
 
-       kvm_arch_ops = ops;
-
-       if (!kvm_arch_ops->cpu_has_kvm_support()) {
+       if (!ops->cpu_has_kvm_support()) {
                printk(KERN_ERR "kvm: no hardware support\n");
                return -EOPNOTSUPP;
        }
-       if (kvm_arch_ops->disabled_by_bios()) {
+       if (ops->disabled_by_bios()) {
                printk(KERN_ERR "kvm: disabled by bios\n");
                return -EOPNOTSUPP;
        }
 
+       kvm_arch_ops = ops;
+
        r = kvm_arch_ops->hardware_setup();
        if (r < 0)
            return r;
index 790423c..c6f9729 100644 (file)
 #include "vmx.h"
 #include "kvm.h"
 
+#undef MMU_DEBUG
+
+#undef AUDIT
+
+#ifdef AUDIT
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
+#endif
+
+#ifdef MMU_DEBUG
+
+#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
+#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
+
+#else
+
 #define pgprintk(x...) do { } while (0)
+#define rmap_printk(x...) do { } while (0)
+
+#endif
+
+#if defined(MMU_DEBUG) || defined(AUDIT)
+static int dbg = 1;
+#endif
 
 #define ASSERT(x)                                                      \
        if (!(x)) {                                                     \
                       __FILE__, __LINE__, #x);                         \
        }
 
-#define PT64_ENT_PER_PAGE 512
-#define PT32_ENT_PER_PAGE 1024
+#define PT64_PT_BITS 9
+#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
+#define PT32_PT_BITS 10
+#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
 
 #define PT_WRITABLE_SHIFT 1
 
 #define PT_DIRECTORY_LEVEL 2
 #define PT_PAGE_TABLE_LEVEL 1
 
+#define RMAP_EXT 4
+
+struct kvm_rmap_desc { /* node used once a page has more than one tracked spte */
+       u64 *shadow_ptes[RMAP_EXT]; /* filled from index 0; a zero entry marks the end */
+       struct kvm_rmap_desc *more; /* next descriptor in the chain, zero if none */
+};
+
 static int is_write_protection(struct kvm_vcpu *vcpu)
 {
        return vcpu->cr0 & CR0_WP_MASK;
@@ -150,32 +183,272 @@ static int is_io_pte(unsigned long pte)
        return pte & PT_SHADOW_IO_MARK;
 }
 
+static int is_rmap_pte(u64 pte)
+{      /* Nonzero only if pte has both PT_WRITABLE_MASK and PT_PRESENT_MASK set. */
+       return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
+               == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
+}
+
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+                                 size_t objsize, int min)
+{      /* Refill cache if it holds fewer than min objects; returns 0 or -ENOMEM. */
+       void *obj;
+
+       if (cache->nobjs >= min)
+               return 0; /* enough objects already, nothing is allocated */
+       while (cache->nobjs < ARRAY_SIZE(cache->objects)) { /* fills to capacity, not just to min */
+               obj = kzalloc(objsize, GFP_NOWAIT);
+               if (!obj)
+                       return -ENOMEM; /* earlier objects stay cached; returned even if nobjs passed min */
+               cache->objects[cache->nobjs++] = obj;
+       }
+       return 0;
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{      /* kfree() every cached object, leaving mc->nobjs at 0. */
+       while (mc->nobjs)
+               kfree(mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+{      /* Top up both per-vcpu caches; returns 0 or the first topup error. */
+       int r;
+
+       r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
+                                  sizeof(struct kvm_pte_chain), 4); /* want >= 4 pte chains */
+       if (r)
+               goto out; /* rmap desc cache is not attempted after a failure */
+       r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
+                                  sizeof(struct kvm_rmap_desc), 1); /* want >= 1 rmap desc */
+out:
+       return r;
+}
+
+static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{      /* Empty the vcpu's pte-chain and rmap-desc caches. */
+       mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
+       mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
+                                   size_t size)
+{      /* Pop one object from mc and return it with its first size bytes zeroed. */
+       void *p;
+
+       BUG_ON(!mc->nobjs); /* an empty cache is treated as a bug, not an error return */
+       p = mc->objects[--mc->nobjs];
+       memset(p, 0, size); /* objects put back by mmu_memory_cache_free() may hold old data */
+       return p;
+}
+
+static void mmu_memory_cache_free(struct kvm_mmu_memory_cache *mc, void *obj)
+{      /* Give obj back to mc, or kfree() it when the cache is already full. */
+       if (mc->nobjs < KVM_NR_MEM_OBJS)
+               mc->objects[mc->nobjs++] = obj;
+       else
+               kfree(obj);
+}
+
+static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
+{      /* Take a zeroed kvm_pte_chain from the vcpu's pte-chain cache. */
+       return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
+                                     sizeof(struct kvm_pte_chain));
+}
+
+static void mmu_free_pte_chain(struct kvm_vcpu *vcpu,
+                              struct kvm_pte_chain *pc)
+{      /* Return pc to the vcpu's pte-chain cache. */
+       mmu_memory_cache_free(&vcpu->mmu_pte_chain_cache, pc);
+}
+
+static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
+{      /* Take a zeroed kvm_rmap_desc from the vcpu's rmap-desc cache. */
+       return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
+                                     sizeof(struct kvm_rmap_desc));
+}
+
+static void mmu_free_rmap_desc(struct kvm_vcpu *vcpu,
+                              struct kvm_rmap_desc *rd)
+{      /* Return rd to the vcpu's rmap-desc cache. */
+       mmu_memory_cache_free(&vcpu->mmu_rmap_desc_cache, rd);
+}
+
+/*
+ * Reverse mapping data structures:
+ *
+ * If page->private bit zero is zero, then page->private points to the
+ * shadow page table entry that points to page_address(page).
+ *
+ * If page->private bit zero is one, then (page->private & ~1) points
+ * to a struct kvm_rmap_desc containing more mappings.
+ */
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
+{      /* Record spte in page->private of the page that *spte points to. */
+       struct page *page;
+       struct kvm_rmap_desc *desc;
+       int i;
+
+       if (!is_rmap_pte(*spte))
+               return; /* only present and writable sptes are tracked */
+       page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+       if (!page->private) { /* nothing recorded yet: store the spte pointer itself */
+               rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
+               page->private = (unsigned long)spte;
+       } else if (!(page->private & 1)) { /* one direct spte: move both into a new desc */
+               rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
+               desc = mmu_alloc_rmap_desc(vcpu);
+               desc->shadow_ptes[0] = (u64 *)page->private;
+               desc->shadow_ptes[1] = spte;
+               page->private = (unsigned long)desc | 1; /* bit zero tags the value as a desc pointer */
+       } else { /* already a desc chain: append to it */
+               rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
+               desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
+               while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) /* walk past full descs */
+                       desc = desc->more;
+               if (desc->shadow_ptes[RMAP_EXT-1]) { /* last desc is full as well: chain a fresh one */
+                       desc->more = mmu_alloc_rmap_desc(vcpu);
+                       desc = desc->more;
+               }
+               for (i = 0; desc->shadow_ptes[i]; ++i) /* find the first empty slot */
+                       ;
+               desc->shadow_ptes[i] = spte;
+       }
+}
+
+static void rmap_desc_remove_entry(struct kvm_vcpu *vcpu,
+                                  struct page *page,
+                                  struct kvm_rmap_desc *desc,
+                                  int i,
+                                  struct kvm_rmap_desc *prev_desc)
+{      /* Drop slot i of desc keeping slots packed; unlink and free desc once empty. */
+       int j;
+
+       for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) /* j = last used slot (>= i) */
+               ;
+       desc->shadow_ptes[i] = desc->shadow_ptes[j]; /* move the last entry into the hole */
+       desc->shadow_ptes[j] = 0;
+       if (j != 0)
+               return; /* desc still holds at least one entry */
+       if (!prev_desc && !desc->more)
+               page->private = (unsigned long)desc->shadow_ptes[0]; /* slot 0 was just zeroed, so this clears private */
+       else
+               if (prev_desc)
+                       prev_desc->more = desc->more; /* bypass desc in the chain */
+               else
+                       page->private = (unsigned long)desc->more | 1; /* next desc becomes the head */
+       mmu_free_rmap_desc(vcpu, desc);
+}
+
+static void rmap_remove(struct kvm_vcpu *vcpu, u64 *spte)
+{      /* Remove spte from the reverse map of its page; BUG() if it is not recorded. */
+       struct page *page;
+       struct kvm_rmap_desc *desc;
+       struct kvm_rmap_desc *prev_desc;
+       int i;
+
+       if (!is_rmap_pte(*spte))
+               return; /* same filter as rmap_add(): such sptes were never recorded */
+       page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+       if (!page->private) { /* nothing recorded for this page at all */
+               printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
+               BUG();
+       } else if (!(page->private & 1)) { /* single direct spte: it must be this one */
+               rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
+               if ((u64 *)page->private != spte) {
+                       printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
+                              spte, *spte);
+                       BUG();
+               }
+               page->private = 0;
+       } else { /* desc chain: search every desc for spte */
+               rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
+               desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
+               prev_desc = NULL;
+               while (desc) {
+                       for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) /* stop at first empty slot */
+                               if (desc->shadow_ptes[i] == spte) {
+                                       rmap_desc_remove_entry(vcpu, page,
+                                                              desc, i,
+                                                              prev_desc);
+                                       return;
+                               }
+                       prev_desc = desc;
+                       desc = desc->more;
+               }
+               BUG(); /* spte was not found in any desc */
+       }
+}
+
+/*
+ * Write-protect every shadow pte recorded in the reverse map of guest
+ * frame gfn.
+ *
+ * Each pass takes the first spte found through page->private, removes it
+ * from the map with rmap_remove(), flushes the TLB and clears the spte's
+ * writable bit.  The loop ends once page->private reads as 0.
+ */
+static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
+       struct page *page;
+       unsigned long head;
+       u64 *spte;
+
+       BUG_ON(!memslot);
+       page = gfn_to_page(memslot, gfn);
+
+       for (head = page->private; head; head = page->private) {
+               if (head & 1) {
+                       struct kvm_rmap_desc *desc;
+
+                       desc = (struct kvm_rmap_desc *)(head & ~1ul);
+                       spte = desc->shadow_ptes[0];
+               } else {
+                       spte = (u64 *)head;
+               }
+
+               /*
+                * Sanity checks: the entry must exist and be a present,
+                * writable mapping of this very page.
+                */
+               BUG_ON(!spte);
+               BUG_ON((*spte & PT64_BASE_ADDR_MASK) !=
+                      page_to_pfn(page) << PAGE_SHIFT);
+               BUG_ON(!(*spte & PT_PRESENT_MASK));
+               BUG_ON(!(*spte & PT_WRITABLE_MASK));
+               rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+
+               /* The writable bit is cleared only after rmap_remove(). */
+               rmap_remove(vcpu, spte);
+               kvm_arch_ops->tlb_flush(vcpu);
+               *spte &= ~(u64)PT_WRITABLE_MASK;
+       }
+}
+
+/*
+ * Check that the page at host physical address page_hpa contains only
+ * zero 64-bit words.
+ *
+ * Returns 1 if the whole page is zero.  Otherwise logs the address and
+ * value of the first non-zero word and returns 0.
+ */
+static int is_empty_shadow_page(hpa_t page_hpa)
+{
+       u64 *entries = __va(page_hpa);
+       unsigned i;
+
+       for (i = 0; i < PAGE_SIZE / sizeof(u64); ++i) {
+               if (entries[i] == 0)
+                       continue;
+               printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
+                      &entries[i], entries[i]);
+               return 0;
+       }
+       return 1;
+}
+
 static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
 {
        struct kvm_mmu_page *page_head = page_header(page_hpa);
 
+       ASSERT(is_empty_shadow_page(page_hpa)); /* debug check: page holds only zero entries */
        list_del(&page_head->link);
        page_head->page_hpa = page_hpa;
        list_add(&page_head->link, &vcpu->free_pages);
+       ++vcpu->kvm->n_free_mmu_pages;  /* one more page on the free list */
 }
 
-static int is_empty_shadow_page(hpa_t page_hpa)
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
 {
-       u32 *pos;
-       u32 *end;
-       for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32);
-                     pos != end; pos++)
-               if (*pos != 0)
-                       return 0;
-       return 1;
+       return gfn;
 }
 
-static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte)
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
+                                              u64 *parent_pte)
 {
        struct kvm_mmu_page *page;
 
        if (list_empty(&vcpu->free_pages))
-               return INVALID_PAGE;
+               return NULL;
 
        page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link);
        list_del(&page->link);
@@ -183,8 +456,239 @@ static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte)
        ASSERT(is_empty_shadow_page(page->page_hpa));
        page->slot_bitmap = 0;
        page->global = 1;
+       page->multimapped = 0;
        page->parent_pte = parent_pte;
-       return page->page_hpa;
+       --vcpu->kvm->n_free_mmu_pages;
+       return page;
+}
+
+/*
+ * Record parent_pte as one more pte that points at shadow page 'page'.
+ *
+ * A page with a single parent keeps it directly in page->parent_pte.  When
+ * a second parent shows up the page becomes "multimapped" and the parents
+ * are kept in kvm_pte_chain blocks linked from page->parent_ptes.
+ * A NULL parent_pte is ignored.
+ */
+static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
+                                   struct kvm_mmu_page *page, u64 *parent_pte)
+{
+       struct kvm_pte_chain *chain;
+       struct hlist_node *pos;
+       int slot;
+
+       if (!parent_pte)
+               return;
+
+       if (!page->multimapped) {
+               u64 *first = page->parent_pte;
+
+               if (!first) {
+                       page->parent_pte = parent_pte;
+                       return;
+               }
+               /*
+                * Switch to chain form.  The old parent is read before the
+                * list head is initialised and becomes entry 0.
+                * NOTE(review): this allocation is not checked, unlike the
+                * one at the end of the function - confirm it cannot fail.
+                */
+               page->multimapped = 1;
+               chain = mmu_alloc_pte_chain(vcpu);
+               INIT_HLIST_HEAD(&page->parent_ptes);
+               hlist_add_head(&chain->link, &page->parent_ptes);
+               chain->parent_ptes[0] = first;
+       }
+
+       /* Use the first free slot of the first chain block that has one. */
+       hlist_for_each_entry(chain, pos, &page->parent_ptes, link) {
+               /* A block whose last slot is in use is full. */
+               if (chain->parent_ptes[NR_PTE_CHAIN_ENTRIES - 1])
+                       continue;
+               slot = 0;
+               while (slot < NR_PTE_CHAIN_ENTRIES && chain->parent_ptes[slot])
+                       ++slot;
+               if (slot < NR_PTE_CHAIN_ENTRIES) {
+                       chain->parent_ptes[slot] = parent_pte;
+                       return;
+               }
+       }
+
+       /* Every block is full: start a new one at the head of the list. */
+       chain = mmu_alloc_pte_chain(vcpu);
+       BUG_ON(!chain);
+       hlist_add_head(&chain->link, &page->parent_ptes);
+       chain->parent_ptes[0] = parent_pte;
+}
+
+/*
+ * Forget that parent_pte points at shadow page 'page'.
+ *
+ * For a page that is not multimapped, parent_pte must be the recorded
+ * parent.  For a multimapped page the entry is removed from its chain
+ * block, the entries after it are shifted down, and a block that becomes
+ * empty is freed; when no block is left the page returns to single-parent
+ * form with no parent.  Not finding parent_pte is treated as a bug.
+ */
+static void mmu_page_remove_parent_pte(struct kvm_vcpu *vcpu,
+                                      struct kvm_mmu_page *page,
+                                      u64 *parent_pte)
+{
+       struct kvm_pte_chain *chain;
+       struct hlist_node *pos;
+       int i;
+
+       if (!page->multimapped) {
+               BUG_ON(page->parent_pte != parent_pte);
+               page->parent_pte = NULL;
+               return;
+       }
+
+       hlist_for_each_entry(chain, pos, &page->parent_ptes, link) {
+               /* Scan the used slots of this block for parent_pte. */
+               i = 0;
+               while (i < NR_PTE_CHAIN_ENTRIES && chain->parent_ptes[i]
+                      && chain->parent_ptes[i] != parent_pte)
+                       ++i;
+               if (i == NR_PTE_CHAIN_ENTRIES || !chain->parent_ptes[i])
+                       continue;
+
+               /* Close the gap by shifting the following entries down. */
+               for (; i + 1 < NR_PTE_CHAIN_ENTRIES
+                      && chain->parent_ptes[i + 1]; ++i)
+                       chain->parent_ptes[i] = chain->parent_ptes[i + 1];
+               chain->parent_ptes[i] = NULL;
+
+               /* i == 0 here means the block has no entries left. */
+               if (i == 0) {
+                       hlist_del(&chain->link);
+                       mmu_free_pte_chain(vcpu, chain);
+                       if (hlist_empty(&page->parent_ptes)) {
+                               page->multimapped = 0;
+                               page->parent_pte = NULL;
+                       }
+               }
+               return;
+       }
+       BUG();
+}
+
+/*
+ * Look up a non-metaphysical shadow page for guest frame gfn in the
+ * mmu_page_hash table.  Returns the first match in the bucket, whatever
+ * its other role bits are, or NULL if there is none.
+ */
+static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
+                                               gfn_t gfn)
+{
+       struct hlist_head *bucket;
+       struct hlist_node *pos;
+       struct kvm_mmu_page *sp;
+
+       pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+       bucket = &vcpu->kvm->mmu_page_hash[kvm_page_table_hashfn(gfn)
+                                          % KVM_NUM_MMU_PAGES];
+       hlist_for_each_entry(sp, pos, bucket, hash_link) {
+               if (sp->gfn != gfn || sp->role.metaphysical)
+                       continue;
+               pgprintk("%s: found role %x\n",
+                        __FUNCTION__, sp->role.word);
+               return sp;
+       }
+       return NULL;
+}
+
+/*
+ * Find the shadow page for (gfn, role), or set up a new one.
+ *
+ * The role is built from the guest root level, the requested level and
+ * the metaphysical flag; when the guest root level is at most
+ * PT32_ROOT_LEVEL it also carries a quadrant derived from gaddr.
+ *
+ * If a page with the same gfn and role is already hashed, parent_pte is
+ * added to its parents and it is returned.  Otherwise a page is taken from
+ * kvm_mmu_alloc_page(), hashed, and - unless metaphysical - the guest
+ * frame is write-protected through its reverse map.
+ *
+ * Returns NULL if a new page was needed and none could be allocated.
+ */
+static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
+                                            gfn_t gfn,
+                                            gva_t gaddr,
+                                            unsigned level,
+                                            int metaphysical,
+                                            u64 *parent_pte)
+{
+       union kvm_mmu_page_role role;
+       struct hlist_head *bucket;
+       struct hlist_node *pos;
+       struct kvm_mmu_page *sp;
+
+       role.word = 0;
+       role.glevels = vcpu->mmu.root_level;
+       role.level = level;
+       role.metaphysical = metaphysical;
+       if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
+               unsigned quadrant;
+
+               quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
+               role.quadrant = quadrant
+                       & ((1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1);
+       }
+       pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
+                gfn, role.word);
+
+       bucket = &vcpu->kvm->mmu_page_hash[kvm_page_table_hashfn(gfn)
+                                          % KVM_NUM_MMU_PAGES];
+       hlist_for_each_entry(sp, pos, bucket, hash_link) {
+               if (sp->gfn != gfn || sp->role.word != role.word)
+                       continue;
+               mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+               pgprintk("%s: found\n", __FUNCTION__);
+               return sp;
+       }
+
+       sp = kvm_mmu_alloc_page(vcpu, parent_pte);
+       if (!sp)
+               return NULL;
+
+       pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
+       sp->gfn = gfn;
+       sp->role = role;
+       hlist_add_head(&sp->hash_link, bucket);
+       if (!metaphysical)
+               rmap_write_protect(vcpu, gfn);
+       return sp;
+}
+
+/*
+ * Clear every entry of a shadow page and drop the references they held.
+ *
+ * On a last-level page (PT_PAGE_TABLE_LEVEL) each present entry is removed
+ * from the reverse map and the TLB is flushed afterwards.  On a
+ * higher-level page each present entry is removed from the parent list of
+ * the child shadow page it pointed to.
+ */
+static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
+                                        struct kvm_mmu_page *page)
+{
+       u64 *table = __va(page->page_hpa);
+       u64 *end = table + PT64_ENT_PER_PAGE;
+       u64 *entry;
+
+       if (page->role.level != PT_PAGE_TABLE_LEVEL) {
+               for (entry = table; entry != end; ++entry) {
+                       u64 old = *entry;
+
+                       /* The entry is zeroed before the child is told. */
+                       *entry = 0;
+                       if (!(old & PT_PRESENT_MASK))
+                               continue;
+                       mmu_page_remove_parent_pte(vcpu,
+                               page_header(old & PT64_BASE_ADDR_MASK),
+                               entry);
+               }
+               return;
+       }
+
+       for (entry = table; entry != end; ++entry) {
+               /* rmap_remove() is given the entry before it is zeroed. */
+               if (*entry & PT_PRESENT_MASK)
+                       rmap_remove(vcpu, entry);
+               *entry = 0;
+       }
+       kvm_arch_ops->tlb_flush(vcpu);
+}
+
+static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
+                            struct kvm_mmu_page *page,
+                            u64 *parent_pte)
+{      /* thin wrapper: only forwards to mmu_page_remove_parent_pte() */
+       mmu_page_remove_parent_pte(vcpu, page, parent_pte);     /* drop parent_pte from page's parent list */
+}
+
+/*
+ * Tear down a shadow page: clear every parent pte that points at it,
+ * unlink all of its children, and then either return it to the free list
+ * or, if it is still counted as a root (root_count != 0), keep it and
+ * re-queue it at the head of the active list.
+ */
+static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu,
+                            struct kvm_mmu_page *page)
+{
+       struct kvm_pte_chain *head;
+       u64 *pte;
+
+       /* Take parents one at a time until none is recorded any more. */
+       for (;;) {
+               if (page->multimapped) {
+                       head = container_of(page->parent_ptes.first,
+                                           struct kvm_pte_chain, link);
+                       pte = head->parent_ptes[0];
+               } else if (page->parent_pte) {
+                       pte = page->parent_pte;
+               } else {
+                       break;
+               }
+               BUG_ON(!pte);
+               kvm_mmu_put_page(vcpu, page, pte);
+               *pte = 0;
+       }
+
+       kvm_mmu_page_unlink_children(vcpu, page);
+
+       if (page->root_count) {
+               list_del(&page->link);
+               list_add(&page->link, &vcpu->kvm->active_mmu_pages);
+               return;
+       }
+       hlist_del(&page->hash_link);
+       kvm_mmu_free_page(vcpu, page->page_hpa);
+}
+
+/*
+ * Zap every non-metaphysical shadow page that shadows guest frame gfn.
+ * Returns 1 if at least one page was zapped, 0 otherwise.
+ */
+static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+       struct hlist_head *bucket;
+       struct hlist_node *pos, *next;
+       struct kvm_mmu_page *sp;
+       int zapped = 0;
+
+       pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+       bucket = &vcpu->kvm->mmu_page_hash[kvm_page_table_hashfn(gfn)
+                                          % KVM_NUM_MMU_PAGES];
+       /* The _safe iterator is used because zapping may unhash the page. */
+       hlist_for_each_entry_safe(sp, pos, next, bucket, hash_link) {
+               if (sp->gfn != gfn || sp->role.metaphysical)
+                       continue;
+               pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
+                        sp->role.word);
+               kvm_mmu_zap_page(vcpu, sp);
+               zapped = 1;
+       }
+       return zapped;
 }
 
 static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
@@ -225,35 +729,6 @@ hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
        return gpa_to_hpa(vcpu, gpa);
 }
 
-
-static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa,
-                              int level)
-{
-       ASSERT(vcpu);
-       ASSERT(VALID_PAGE(page_hpa));
-       ASSERT(level <= PT64_ROOT_LEVEL && level > 0);
-
-       if (level == 1)
-               memset(__va(page_hpa), 0, PAGE_SIZE);
-       else {
-               u64 *pos;
-               u64 *end;
-
-               for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE;
-                    pos != end; pos++) {
-                       u64 current_ent = *pos;
-
-                       *pos = 0;
-                       if (is_present_pte(current_ent))
-                               release_pt_page_64(vcpu,
-                                                 current_ent &
-                                                 PT64_BASE_ADDR_MASK,
-                                                 level - 1);
-               }
-       }
-       kvm_mmu_free_page(vcpu, page_hpa);
-}
-
 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
 }
@@ -266,52 +741,109 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
        for (; ; level--) {
                u32 index = PT64_INDEX(v, level);
                u64 *table;
+               u64 pte;
 
                ASSERT(VALID_PAGE(table_addr));
                table = __va(table_addr);
 
                if (level == 1) {
+                       pte = table[index];
+                       if (is_present_pte(pte) && is_writeble_pte(pte))
+                               return 0;
                        mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
                        page_header_update_slot(vcpu->kvm, table, v);
                        table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
                                                                PT_USER_MASK;
+                       rmap_add(vcpu, &table[index]);
                        return 0;
                }
 
                if (table[index] == 0) {
-                       hpa_t new_table = kvm_mmu_alloc_page(vcpu,
-                                                            &table[index]);
-
-                       if (!VALID_PAGE(new_table)) {
+                       struct kvm_mmu_page *new_table;
+                       gfn_t pseudo_gfn;
+
+                       pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
+                               >> PAGE_SHIFT;
+                       new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
+                                                    v, level - 1,
+                                                    1, &table[index]);
+                       if (!new_table) {
                                pgprintk("nonpaging_map: ENOMEM\n");
                                return -ENOMEM;
                        }
 
-                       if (level == PT32E_ROOT_LEVEL)
-                               table[index] = new_table | PT_PRESENT_MASK;
-                       else
-                               table[index] = new_table | PT_PRESENT_MASK |
-                                               PT_WRITABLE_MASK | PT_USER_MASK;
+                       table[index] = new_table->page_hpa | PT_PRESENT_MASK
+                               | PT_WRITABLE_MASK | PT_USER_MASK;
                }
                table_addr = table[index] & PT64_BASE_ADDR_MASK;
        }
 }
 
-static void nonpaging_flush(struct kvm_vcpu *vcpu)
+static void mmu_free_roots(struct kvm_vcpu *vcpu)
 {
-       hpa_t root = vcpu->mmu.root_hpa;
+       int i;
+       struct kvm_mmu_page *page;
 
-       ++kvm_stat.tlb_flush;
-       pgprintk("nonpaging_flush\n");
-       ASSERT(VALID_PAGE(root));
-       release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level);
-       root = kvm_mmu_alloc_page(vcpu, NULL);
-       ASSERT(VALID_PAGE(root));
-       vcpu->mmu.root_hpa = root;
-       if (is_paging(vcpu))
-               root |= (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK));
-       kvm_arch_ops->set_cr3(vcpu, root);
-       kvm_arch_ops->tlb_flush(vcpu);
+#ifdef CONFIG_X86_64
+       if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+               hpa_t root = vcpu->mmu.root_hpa;
+
+               ASSERT(VALID_PAGE(root));
+               page = page_header(root);
+               --page->root_count;
+               vcpu->mmu.root_hpa = INVALID_PAGE;
+               return;
+       }
+#endif
+       for (i = 0; i < 4; ++i) {
+               hpa_t root = vcpu->mmu.pae_root[i];
+
+               ASSERT(VALID_PAGE(root));
+               root &= PT64_BASE_ADDR_MASK;
+               page = page_header(root);
+               --page->root_count;
+               vcpu->mmu.pae_root[i] = INVALID_PAGE;
+       }
+       vcpu->mmu.root_hpa = INVALID_PAGE;
+}
+
+/*
+ * Set up the shadow root(s) for the current guest paging mode and take a
+ * root_count reference on each.
+ *
+ * With a 4-level shadow (x86_64 only) a single root page keyed by the
+ * guest cr3 frame is used and stored in mmu.root_hpa.  Otherwise four
+ * roots are placed in mmu.pae_root[], one per 1GB of address space, and
+ * mmu.root_hpa is set to the physical address of that table.
+ *
+ * NOTE(review): kvm_mmu_get_page() returns NULL when no shadow page is
+ * available; the result is dereferenced here without a check, so callers
+ * must make sure free pages exist beforehand.
+ */
+static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
+{
+       int i;
+       gfn_t root_gfn;
+       struct kvm_mmu_page *page;
+
+       root_gfn = vcpu->cr3 >> PAGE_SHIFT;
+
+#ifdef CONFIG_X86_64
+       if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+               hpa_t root = vcpu->mmu.root_hpa;
+
+               ASSERT(!VALID_PAGE(root));
+               page = kvm_mmu_get_page(vcpu, root_gfn, 0,
+                                       PT64_ROOT_LEVEL, 0, NULL);
+               root = page->page_hpa;
+               ++page->root_count;
+               vcpu->mmu.root_hpa = root;
+               return;
+       }
+#endif
+       for (i = 0; i < 4; ++i) {
+               hpa_t root = vcpu->mmu.pae_root[i];
+
+               ASSERT(!VALID_PAGE(root));
+               /* PAE guests have one root per pdptr; non-paging uses gfn 0. */
+               if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL)
+                       root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
+               else if (vcpu->mmu.root_level == 0)
+                       root_gfn = 0;
+               /*
+                * Widen i before shifting: "i << 30" on a plain int
+                * overflows for i >= 2, which is undefined behaviour and
+                * would sign-extend when converted to gva_t on 64-bit.
+                */
+               page = kvm_mmu_get_page(vcpu, root_gfn, (gva_t)i << 30,
+                                       PT32_ROOT_LEVEL, !is_paging(vcpu),
+                                       NULL);
+               root = page->page_hpa;
+               ++page->root_count;
+               vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
+       }
+       vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
 }
 
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -322,43 +854,29 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
                               u32 error_code)
 {
-       int ret;
        gpa_t addr = gva;
+       hpa_t paddr;
+       int r;
+
+       r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               return r;
 
        ASSERT(vcpu);
        ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
 
-       for (;;) {
-            hpa_t paddr;
-
-            paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
 
-            if (is_error_hpa(paddr))
-                    return 1;
+       paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
 
-            ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
-            if (ret) {
-                    nonpaging_flush(vcpu);
-                    continue;
-            }
-            break;
-       }
-       return ret;
-}
+       if (is_error_hpa(paddr))
+               return 1;
 
-static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
-{
+       return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
 }
 
 static void nonpaging_free(struct kvm_vcpu *vcpu)
 {
-       hpa_t root;
-
-       ASSERT(vcpu);
-       root = vcpu->mmu.root_hpa;
-       if (VALID_PAGE(root))
-               release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level);
-       vcpu->mmu.root_hpa = INVALID_PAGE;
+       mmu_free_roots(vcpu);
 }
 
 static int nonpaging_init_context(struct kvm_vcpu *vcpu)
@@ -367,40 +885,31 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 
        context->new_cr3 = nonpaging_new_cr3;
        context->page_fault = nonpaging_page_fault;
-       context->inval_page = nonpaging_inval_page;
        context->gva_to_gpa = nonpaging_gva_to_gpa;
        context->free = nonpaging_free;
-       context->root_level = PT32E_ROOT_LEVEL;
+       context->root_level = 0;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
-       context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
+       mmu_alloc_roots(vcpu);
        ASSERT(VALID_PAGE(context->root_hpa));
        kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
        return 0;
 }
 
-
 static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 {
-       struct kvm_mmu_page *page, *npage;
-
-       list_for_each_entry_safe(page, npage, &vcpu->kvm->active_mmu_pages,
-                                link) {
-               if (page->global)
-                       continue;
-
-               if (!page->parent_pte)
-                       continue;
-
-               *page->parent_pte = 0;
-               release_pt_page_64(vcpu, page->page_hpa, 1);
-       }
        ++kvm_stat.tlb_flush;
        kvm_arch_ops->tlb_flush(vcpu);
 }
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
 {
+       pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
+       mmu_free_roots(vcpu);   /* drop the root_count references on the current roots */
+       if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
+               kvm_mmu_free_some_pages(vcpu);  /* running low: reclaim before allocating roots */
+       mmu_alloc_roots(vcpu);  /* roots are looked up from the current vcpu->cr3 */
        kvm_mmu_flush_tlb(vcpu);
+       kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);        /* hand the new shadow root to the arch code */
 }
 
 static void mark_pagetable_nonglobal(void *shadow_pte)
@@ -412,7 +921,8 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu,
                             u64 *shadow_pte,
                             gpa_t gaddr,
                             int dirty,
-                            u64 access_bits)
+                            u64 access_bits,
+                            gfn_t gfn)
 {
        hpa_t paddr;
 
@@ -420,13 +930,10 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu,
        if (!dirty)
                access_bits &= ~PT_WRITABLE_MASK;
 
-       if (access_bits & PT_WRITABLE_MASK)
-               mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
+       paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
 
        *shadow_pte |= access_bits;
 
-       paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
-
        if (!(*shadow_pte & PT_GLOBAL_MASK))
                mark_pagetable_nonglobal(shadow_pte);
 
@@ -434,10 +941,31 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu,
                *shadow_pte |= gaddr;
                *shadow_pte |= PT_SHADOW_IO_MARK;
                *shadow_pte &= ~PT_PRESENT_MASK;
-       } else {
-               *shadow_pte |= paddr;
-               page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
+               return;
+       }
+
+       *shadow_pte |= paddr;
+
+       if (access_bits & PT_WRITABLE_MASK) {
+               struct kvm_mmu_page *shadow;
+
+               shadow = kvm_mmu_lookup_page(vcpu, gfn);
+               if (shadow) {
+                       pgprintk("%s: found shadow page for %lx, marking ro\n",
+                                __FUNCTION__, gfn);
+                       access_bits &= ~PT_WRITABLE_MASK;
+                       if (is_writeble_pte(*shadow_pte)) {
+                                   *shadow_pte &= ~PT_WRITABLE_MASK;
+                                   kvm_arch_ops->tlb_flush(vcpu);
+                       }
+               }
        }
+
+       if (access_bits & PT_WRITABLE_MASK)
+               mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
+
+       page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
+       rmap_add(vcpu, shadow_pte);
 }
 
 static void inject_page_fault(struct kvm_vcpu *vcpu,
@@ -474,41 +1002,6 @@ static int may_access(u64 pte, int write, int user)
        return 1;
 }
 
-/*
- * Remove a shadow pte.
- */
-static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
-{
-       hpa_t page_addr = vcpu->mmu.root_hpa;
-       int level = vcpu->mmu.shadow_root_level;
-
-       ++kvm_stat.invlpg;
-
-       for (; ; level--) {
-               u32 index = PT64_INDEX(addr, level);
-               u64 *table = __va(page_addr);
-
-               if (level == PT_PAGE_TABLE_LEVEL ) {
-                       table[index] = 0;
-                       return;
-               }
-
-               if (!is_present_pte(table[index]))
-                       return;
-
-               page_addr = table[index] & PT64_BASE_ADDR_MASK;
-
-               if (level == PT_DIRECTORY_LEVEL &&
-                         (table[index] & PT_SHADOW_PS_MARK)) {
-                       table[index] = 0;
-                       release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL);
-
-                       kvm_arch_ops->tlb_flush(vcpu);
-                       return;
-               }
-       }
-}
-
 static void paging_free(struct kvm_vcpu *vcpu)
 {
        nonpaging_free(vcpu);
@@ -522,37 +1015,40 @@ static void paging_free(struct kvm_vcpu *vcpu)
 #include "paging_tmpl.h"
 #undef PTTYPE
 
-static int paging64_init_context(struct kvm_vcpu *vcpu)
+static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 {
        struct kvm_mmu *context = &vcpu->mmu;
 
        ASSERT(is_pae(vcpu));
        context->new_cr3 = paging_new_cr3;
        context->page_fault = paging64_page_fault;
-       context->inval_page = paging_inval_page;
        context->gva_to_gpa = paging64_gva_to_gpa;
        context->free = paging_free;
-       context->root_level = PT64_ROOT_LEVEL;
-       context->shadow_root_level = PT64_ROOT_LEVEL;
-       context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
+       context->root_level = level;
+       context->shadow_root_level = level;
+       mmu_alloc_roots(vcpu);
        ASSERT(VALID_PAGE(context->root_hpa));
        kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
                    (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
        return 0;
 }
 
+static int paging64_init_context(struct kvm_vcpu *vcpu)
+{      /* 4-level case of the common helper; returns the helper's result */
+       return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
+}
+
 static int paging32_init_context(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu *context = &vcpu->mmu;
 
        context->new_cr3 = paging_new_cr3;
        context->page_fault = paging32_page_fault;
-       context->inval_page = paging_inval_page;
        context->gva_to_gpa = paging32_gva_to_gpa;
        context->free = paging_free;
        context->root_level = PT32_ROOT_LEVEL;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
-       context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
+       mmu_alloc_roots(vcpu);
        ASSERT(VALID_PAGE(context->root_hpa));
        kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
                    (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
@@ -561,14 +1057,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 
 static int paging32E_init_context(struct kvm_vcpu *vcpu)
 {
-       int ret;
-
-       if ((ret = paging64_init_context(vcpu)))
-               return ret;
-
-       vcpu->mmu.root_level = PT32E_ROOT_LEVEL;
-       vcpu->mmu.shadow_root_level = PT32E_ROOT_LEVEL;
-       return 0;
+       return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
 }
 
 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
@@ -597,41 +1086,161 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
 {
+       int r;
+
        destroy_kvm_mmu(vcpu);
-       return init_kvm_mmu(vcpu);
+       r = init_kvm_mmu(vcpu);
+       if (r < 0)
+               goto out;
+       r = mmu_topup_memory_caches(vcpu);
+out:
+       return r;
 }
 
-static void free_mmu_pages(struct kvm_vcpu *vcpu)
+void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
 {
-       while (!list_empty(&vcpu->free_pages)) {
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+       struct kvm_mmu_page *page;
+       struct kvm_mmu_page *child;
+       struct hlist_node *node, *n;
+       struct hlist_head *bucket;
+       unsigned index;
+       u64 *spte;
+       u64 pte;
+       unsigned offset = offset_in_page(gpa);
+       unsigned pte_size;
+       unsigned page_offset;
+       unsigned misaligned;
+       int level;
+       int flooded = 0;
+
+       pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+       if (gfn == vcpu->last_pt_write_gfn) {
+               ++vcpu->last_pt_write_count;
+               if (vcpu->last_pt_write_count >= 3)
+                       flooded = 1;
+       } else {
+               vcpu->last_pt_write_gfn = gfn;
+               vcpu->last_pt_write_count = 1;
+       }
+       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+       bucket = &vcpu->kvm->mmu_page_hash[index];
+       hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
+               if (page->gfn != gfn || page->role.metaphysical)
+                       continue;
+               pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
+               misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+               if (misaligned || flooded) {
+                       /*
+                        * Misaligned accesses are too much trouble to fix
+                        * up; also, they usually indicate a page is not used
+                        * as a page table.
+                        *
+                        * If we're seeing too many writes to a page,
+                        * it may no longer be a page table, or we may be
+                        * forking, in which case it is better to unmap the
+                        * page.
+                        */
+                       pgprintk("misaligned: gpa %llx bytes %d role %x\n",
+                                gpa, bytes, page->role.word);
+                       kvm_mmu_zap_page(vcpu, page);
+                       continue;
+               }
+               page_offset = offset;
+               level = page->role.level;
+               if (page->role.glevels == PT32_ROOT_LEVEL) {
+                       page_offset <<= 1;          /* 32->64 */
+                       page_offset &= ~PAGE_MASK;
+               }
+               spte = __va(page->page_hpa);
+               spte += page_offset / sizeof(*spte);
+               pte = *spte;
+               if (is_present_pte(pte)) {
+                       if (level == PT_PAGE_TABLE_LEVEL)
+                               rmap_remove(vcpu, spte);
+                       else {
+                               child = page_header(pte & PT64_BASE_ADDR_MASK);
+                               mmu_page_remove_parent_pte(vcpu, child, spte);
+                       }
+               }
+               *spte = 0;
+       }
+}
+
+void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
+{      /* empty body: nothing is done here; all arguments are unused */
+}
+
+/*
+ * Translate guest virtual address gva with the current gva_to_gpa handler
+ * and unprotect the guest frame it falls in.  Returns the result of
+ * kvm_mmu_unprotect_page(): 1 if a shadow page was zapped, else 0.
+ */
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
+{
+       gfn_t gfn;
+
+       gfn = vcpu->mmu.gva_to_gpa(vcpu, gva) >> PAGE_SHIFT;
+       return kvm_mmu_unprotect_page(vcpu, gfn);
+}
+
+void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+{      /* zap active shadow pages until n_free_mmu_pages reaches KVM_REFILL_PAGES */
+       while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
                struct kvm_mmu_page *page;
 
+               page = container_of(vcpu->kvm->active_mmu_pages.prev,   /* take the tail of the active list */
+                                   struct kvm_mmu_page, link);
+               kvm_mmu_zap_page(vcpu, page);
+       }
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_some_pages);
+
+static void free_mmu_pages(struct kvm_vcpu *vcpu)
+{      /* release all shadow pages and the pae_root page */
+       struct kvm_mmu_page *page;
+
+       while (!list_empty(&vcpu->kvm->active_mmu_pages)) {     /* first zap every active shadow page */
+               page = container_of(vcpu->kvm->active_mmu_pages.next,
+                                   struct kvm_mmu_page, link);
+               kvm_mmu_zap_page(vcpu, page);
+       }
+       while (!list_empty(&vcpu->free_pages)) {        /* then give the free-list pages back */
                page = list_entry(vcpu->free_pages.next,
                                  struct kvm_mmu_page, link);
                list_del(&page->link);
                __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT));
                page->page_hpa = INVALID_PAGE;
        }
+       free_page((unsigned long)vcpu->mmu.pae_root);   /* the page allocated for pae_root in alloc_mmu_pages() */
 }
 
 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 {
+       struct page *page;
        int i;
 
        ASSERT(vcpu);
 
        for (i = 0; i < KVM_NUM_MMU_PAGES; i++) {
-               struct page *page;
                struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i];
 
                INIT_LIST_HEAD(&page_header->link);
-               if ((page = alloc_page(GFP_KVM_MMU)) == NULL)
+               if ((page = alloc_page(GFP_KERNEL)) == NULL)
                        goto error_1;
                page->private = (unsigned long)page_header;
                page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
                memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
                list_add(&page_header->link, &vcpu->free_pages);
+               ++vcpu->kvm->n_free_mmu_pages;
        }
+
+       /*
+        * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
+        * Therefore we need to allocate shadow page tables in the first
+        * 4GB of memory, which happens to fit the DMA32 zone.
+        */
+       page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+       if (!page)
+               goto error_1;
+       vcpu->mmu.pae_root = page_address(page);
+       for (i = 0; i < 4; ++i)
+               vcpu->mmu.pae_root[i] = INVALID_PAGE;
+
        return 0;
 
 error_1:
@@ -663,10 +1272,12 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 
        destroy_kvm_mmu(vcpu);
        free_mmu_pages(vcpu);
+       mmu_free_memory_caches(vcpu);
 }
 
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
+void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot)
 {
+       struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_page *page;
 
        list_for_each_entry(page, &kvm->active_mmu_pages, link) {
@@ -679,8 +1290,169 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
                pt = __va(page->page_hpa);
                for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
                        /* avoid RMW */
-                       if (pt[i] & PT_WRITABLE_MASK)
+                       if (pt[i] & PT_WRITABLE_MASK) {
+                               rmap_remove(vcpu, &pt[i]);
                                pt[i] &= ~PT_WRITABLE_MASK;
+                       }
+       }
+}
+
+#ifdef AUDIT
+
+static const char *audit_msg;
+
+/*
+ * On CONFIG_X86_64 builds, shift the address left by 16 and back right
+ * as a signed value, which sign-extends it from bit 47 (assuming a
+ * 64-bit gva_t).  On other builds the address is returned unchanged.
+ */
+static gva_t canonicalize(gva_t gva)
+{
+#ifdef CONFIG_X86_64
+       long long shifted = (long long)(gva << 16);
+
+       return shifted >> 16;
+#else
+       return gva;
+#endif
+}
 
+/*
+ * Recursively walk one shadow page table and verify its leaf entries.
+ *
+ * @vcpu:     vcpu whose mmu is being audited
+ * @page_pte: entry whose base address locates the table to walk
+ * @va:       guest virtual address covered by the table's first entry
+ * @level:    level of this table; 1 is the leaf level
+ *
+ * For every present leaf entry the guest virtual address is translated
+ * with vcpu->mmu.gva_to_gpa() and gpa_to_hpa(); if the result differs
+ * from the address stored in the shadow entry an error is logged.
+ */
+static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
+                               gva_t va, int level)
+{
+       u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
+       int i;
+       gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
+               u64 ent = pt[i];
+
+               /*
+                * The parentheses matter: "!ent & MASK" parses as
+                * "(!ent) & MASK", which can only be true for an
+                * all-zero entry, so non-present but non-zero entries
+                * would be followed as if they were valid.
+                */
+               if (!(ent & PT_PRESENT_MASK))
+                       continue;
+
+               va = canonicalize(va);
+               if (level > 1)
+                       audit_mappings_page(vcpu, ent, va, level - 1);
+               else {
+                       gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
+                       hpa_t hpa = gpa_to_hpa(vcpu, gpa);
+
+                       /* ent is known to be present at this point */
+                       if ((ent & PT64_BASE_ADDR_MASK) != hpa)
+                               printk(KERN_ERR "audit error: (%s) levels %d"
+                                      " gva %lx gpa %llx hpa %llx ent %llx\n",
+                                      audit_msg, vcpu->mmu.root_level,
+                                      va, gpa, hpa, ent);
+               }
        }
 }
+
+/*
+ * Audit every mapping reachable from the current shadow root.
+ *
+ * With a 4-level root the walk starts at vcpu->mmu.root_hpa.  Otherwise
+ * each present entry of vcpu->mmu.pae_root[] is walked as a level-2
+ * table covering the 1GB region starting at index << 30.
+ */
+static void audit_mappings(struct kvm_vcpu *vcpu)
+{
+       int i;
+
+       if (vcpu->mmu.root_level == 4)
+               audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
+       else
+               for (i = 0; i < 4; ++i)
+                       if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
+                               /*
+                                * Widen before shifting: "i << 30" in
+                                * int overflows for i >= 2.
+                                */
+                               audit_mappings_page(vcpu,
+                                                   vcpu->mmu.pae_root[i],
+                                                   (gva_t)i << 30,
+                                                   2);
+}
+
+/*
+ * Count the reverse-map entries recorded for every page of every
+ * memory slot.
+ *
+ * page->private is decoded as follows: zero means no entry, a value
+ * with bit 0 clear counts as one entry, and a value with bit 0 set is
+ * (after masking bit 0) a pointer to a chain of kvm_rmap_desc, each
+ * holding up to RMAP_EXT entries.
+ */
+static int count_rmaps(struct kvm_vcpu *vcpu)
+{
+       int total = 0;
+       int slot_idx, pg_idx, ent;
+
+       for (slot_idx = 0; slot_idx < KVM_MEMORY_SLOTS; ++slot_idx) {
+               struct kvm_memory_slot *slot;
+
+               slot = &vcpu->kvm->memslots[slot_idx];
+               for (pg_idx = 0; pg_idx < slot->npages; ++pg_idx) {
+                       struct page *pg = slot->phys_mem[pg_idx];
+                       struct kvm_rmap_desc *desc;
+
+                       if (!pg->private)
+                               continue;
+                       if (!(pg->private & 1)) {
+                               ++total;
+                               continue;
+                       }
+                       desc = (struct kvm_rmap_desc *)(pg->private & ~1ul);
+                       for (; desc; desc = desc->more) {
+                               /* stop at the first empty slot */
+                               for (ent = 0; ent < RMAP_EXT; ++ent) {
+                                       if (!desc->shadow_ptes[ent])
+                                               break;
+                                       ++total;
+                               }
+                       }
+               }
+       }
+       return total;
+}
+
+/*
+ * Count shadow ptes that are both present and writable, considering
+ * only active shadow pages whose role.level is PT_PAGE_TABLE_LEVEL.
+ */
+static int count_writable_mappings(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu_page *sp;
+       int count = 0;
+       int idx;
+
+       list_for_each_entry(sp, &vcpu->kvm->active_mmu_pages, link) {
+               u64 *table;
+
+               if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+                       continue;
+
+               table = __va(sp->page_hpa);
+               for (idx = 0; idx < PT64_ENT_PER_PAGE; ++idx) {
+                       u64 pte = table[idx];
+
+                       if ((pte & PT_PRESENT_MASK) &&
+                           (pte & PT_WRITABLE_MASK))
+                               ++count;
+               }
+       }
+       return count;
+}
+
+/*
+ * Compare the number of rmap entries with the number of writable
+ * shadow ptes and log an error when the two counts differ.
+ */
+static void audit_rmap(struct kvm_vcpu *vcpu)
+{
+       int rmap_count = count_rmaps(vcpu);
+       int writable_count = count_writable_mappings(vcpu);
+
+       if (rmap_count == writable_count)
+               return;
+
+       printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
+              __FUNCTION__, audit_msg, rmap_count, writable_count);
+}
+
+/*
+ * For every active shadow page that is not metaphysical, look up the
+ * host page behind the shadow page's gfn and log an error when that
+ * page's ->private field is non-zero.
+ */
+static void audit_write_protection(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu_page *sp;
+
+       list_for_each_entry(sp, &vcpu->kvm->active_mmu_pages, link) {
+               struct page *pg;
+               gpa_t gpa;
+               hfn_t hfn;
+
+               if (sp->role.metaphysical)
+                       continue;
+
+               gpa = (gpa_t)sp->gfn << PAGE_SHIFT;
+               hfn = gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT;
+               pg = pfn_to_page(hfn);
+               if (!pg->private)
+                       continue;
+
+               printk(KERN_ERR "%s: (%s) shadow page has writable"
+                      " mappings: gfn %lx role %x\n",
+                      __FUNCTION__, audit_msg, sp->gfn,
+                      sp->role.word);
+       }
+}
+
+/*
+ * Run all shadow MMU consistency checks for @vcpu.
+ *
+ * @msg is stored in audit_msg so the individual checks can tag their
+ * error output with the call site.  dbg is cleared for the duration of
+ * the audit and restored afterwards.
+ * NOTE(review): presumably this silences debug output produced by the
+ * helpers the checks call -- confirm against the users of dbg.
+ */
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
+{
+       int olddbg = dbg;
+
+       dbg = 0;
+       audit_msg = msg;
+       audit_rmap(vcpu);
+       audit_write_protection(vcpu);
+       audit_mappings(vcpu);
+       dbg = olddbg;
+}
+
+#endif
index 09bb9b4..2dbf430 100644 (file)
        #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
        #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
        #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK
+       #ifdef CONFIG_X86_64
+       #define PT_MAX_FULL_LEVELS 4
+       #else
+       #define PT_MAX_FULL_LEVELS 2
+       #endif
 #elif PTTYPE == 32
        #define pt_element_t u32
        #define guest_walker guest_walker32
@@ -42,6 +47,7 @@
        #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
        #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
        #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK
+       #define PT_MAX_FULL_LEVELS 2
 #else
        #error Invalid PTTYPE value
 #endif
  */
 struct guest_walker {
+       /* guest paging level the walk currently stands at */
        int level;
+       /* gfn of the guest table visited at each level, indexed by level - 1 */
+       gfn_t table_gfn[PT_MAX_FULL_LEVELS];
+       /* kmap_atomic() mapping of the current guest table, or NULL */
        pt_element_t *table;
+       /* guest entry at which the walk stopped */
+       pt_element_t *ptep;
+       /* access bits ANDed together from the upper-level guest entries */
        pt_element_t inherited_ar;
+       /* guest frame the walk resolved to; set only when a mapping was found */
+       gfn_t gfn;
 };
 
-static void FNAME(init_walker)(struct guest_walker *walker,
-                              struct kvm_vcpu *vcpu)
+/*
+ * Walk the guest page tables for guest virtual address @addr and record
+ * the result in @walker.
+ *
+ * The walk starts from vcpu->cr3.  For PTTYPE 64 with the vcpu not in
+ * long mode it starts instead from the vcpu->pdptrs[] entry selected by
+ * bits 31:30 of @addr; if that entry is not present the function
+ * returns early with walker->table == NULL and walker->ptep pointing at
+ * that entry.
+ *
+ * Otherwise the loop descends one level at a time, setting the accessed
+ * bit on each present entry, until it hits a non-present entry, the
+ * last-level entry, or a directory-level large-page entry.  On return
+ * walker->ptep points at the entry where the walk stopped and
+ * walker->table holds a kmap_atomic() mapping which the caller drops
+ * through release_walker.  walker->gfn is written only on the two
+ * "mapping found" exits.
+ */
+static void FNAME(walk_addr)(struct guest_walker *walker,
+                            struct kvm_vcpu *vcpu, gva_t addr)
 {
        hpa_t hpa;
        struct kvm_memory_slot *slot;
+       pt_element_t *ptep;
+       pt_element_t root;
+       gfn_t table_gfn;
 
+       pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
        walker->level = vcpu->mmu.root_level;
-       slot = gfn_to_memslot(vcpu->kvm,
-                             (vcpu->cr3 & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
-       hpa = safe_gpa_to_hpa(vcpu, vcpu->cr3 & PT64_BASE_ADDR_MASK);
+       /* NULL until a table is mapped, so release_walker can tell */
+       walker->table = NULL;
+       root = vcpu->cr3;
+#if PTTYPE == 64
+       if (!is_long_mode(vcpu)) {
+               /* top level comes from the pdptrs[] array, not from memory */
+               walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
+               root = *walker->ptep;
+               if (!(root & PT_PRESENT_MASK))
+                       return;
+               --walker->level;
+       }
+#endif
+       table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+       walker->table_gfn[walker->level - 1] = table_gfn;
+       pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
+                walker->level - 1, table_gfn);
+       slot = gfn_to_memslot(vcpu->kvm, table_gfn);
+       hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
        walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0);
 
        ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
               (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0);
 
-       walker->table = (pt_element_t *)( (unsigned long)walker->table |
-               (unsigned long)(vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) );
        walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
+
+       for (;;) {
+               int index = PT_INDEX(addr, walker->level);
+               hpa_t paddr;
+
+               ptep = &walker->table[index];
+               ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
+                      ((unsigned long)ptep & PAGE_MASK));
+
+               /* write the guest entry only if the bit is not yet set */
+               if (is_present_pte(*ptep) && !(*ptep &  PT_ACCESSED_MASK))
+                       *ptep |= PT_ACCESSED_MASK;
+
+               if (!is_present_pte(*ptep))
+                       break;
+
+               /* last level: the entry itself names the target frame */
+               if (walker->level == PT_PAGE_TABLE_LEVEL) {
+                       walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
+                               >> PAGE_SHIFT;
+                       break;
+               }
+
+               /*
+                * Directory entry with the page-size bit: the target
+                * frame is the directory base plus the last-level index
+                * of addr.
+                */
+               if (walker->level == PT_DIRECTORY_LEVEL
+                   && (*ptep & PT_PAGE_SIZE_MASK)
+                   && (PTTYPE == 64 || is_pse(vcpu))) {
+                       walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK)
+                               >> PAGE_SHIFT;
+                       walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
+                       break;
+               }
+
+               /* level-3 entries outside long mode do not contribute */
+               if (walker->level != 3 || is_long_mode(vcpu))
+                       walker->inherited_ar &= walker->table[index];
+               table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
+               paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK);
+               /* swap the mapping over to the next-level table */
+               kunmap_atomic(walker->table, KM_USER0);
+               walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT),
+                                           KM_USER0);
+               --walker->level;
+               walker->table_gfn[walker->level - 1 ] = table_gfn;
+               pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
+                        walker->level - 1, table_gfn);
+       }
+       walker->ptep = ptep;
+       pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
 }
 
+/*
+ * Drop the kmap_atomic() mapping held in walker->table, if there is one.
+ * walk_addr leaves walker->table NULL when it returns before mapping a
+ * table, hence the check.
+ */
 static void FNAME(release_walker)(struct guest_walker *walker)
 {
-       kunmap_atomic(walker->table, KM_USER0);
+       if (walker->table)
+               kunmap_atomic(walker->table, KM_USER0);
 }
 
+/*
+ * Fill in an empty shadow pte from a last-level guest pte.
+ *
+ * @guest_pte:   guest entry being shadowed
+ * @shadow_pte:  shadow entry to write; asserted to be zero on entry
+ * @access_bits: access bits to apply; ANDed with @guest_pte first
+ * @gfn:         guest frame number, passed through to set_pte_common()
+ *
+ * The bits selected by PT_PTE_COPY_MASK are copied from the guest pte,
+ * then set_pte_common() is called with the guest pte's base address and
+ * dirty bit.
+ */
 static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte,
-                          u64 *shadow_pte, u64 access_bits)
+                          u64 *shadow_pte, u64 access_bits, gfn_t gfn)
 {
        ASSERT(*shadow_pte == 0);
        access_bits &= guest_pte;
        *shadow_pte = (guest_pte & PT_PTE_COPY_MASK);
        set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK,
-                      guest_pte & PT_DIRTY_MASK, access_bits);
+                      guest_pte & PT_DIRTY_MASK, access_bits, gfn);
 }
 
+/*
+ * Fill in an empty shadow pte from a guest directory entry.
+ *
+ * @guest_pde:   guest directory entry being shadowed
+ * @shadow_pte:  shadow entry to write; asserted to be zero on entry
+ * @access_bits: access bits to apply; ANDed with @guest_pde first
+ * @gfn:         guest frame number of the page being mapped
+ *
+ * The guest physical address is derived from @gfn.  For PTTYPE 32 with
+ * PSE36 reported by is_cpuid_PSE36(), the PT32_DIR_PSE36_MASK bits of
+ * the pde are shifted up and ORed into that address.
+ */
 static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde,
-                          u64 *shadow_pte, u64 access_bits,
-                          int index)
+                          u64 *shadow_pte, u64 access_bits, gfn_t gfn)
 {
        gpa_t gaddr;
 
        ASSERT(*shadow_pte == 0);
        access_bits &= guest_pde;
-       gaddr = (guest_pde & PT_DIR_BASE_ADDR_MASK) + PAGE_SIZE * index;
+       gaddr = (gpa_t)gfn << PAGE_SHIFT;
        if (PTTYPE == 32 && is_cpuid_PSE36())
                gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) <<
                        (32 - PT32_DIR_PSE36_SHIFT);
        *shadow_pte = guest_pde & PT_PTE_COPY_MASK;
        set_pte_common(vcpu, shadow_pte, gaddr,
-                      guest_pde & PT_DIRTY_MASK, access_bits);
-}
-
-/*
- * Fetch a guest pte from a specific level in the paging hierarchy.
- */
-static pt_element_t *FNAME(fetch_guest)(struct kvm_vcpu *vcpu,
-                                       struct guest_walker *walker,
-                                       int level,
-                                       gva_t addr)
-{
-
-       ASSERT(level > 0  && level <= walker->level);
-
-       for (;;) {
-               int index = PT_INDEX(addr, walker->level);
-               hpa_t paddr;
-
-               ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
-                      ((unsigned long)&walker->table[index] & PAGE_MASK));
-               if (level == walker->level ||
-                   !is_present_pte(walker->table[index]) ||
-                   (walker->level == PT_DIRECTORY_LEVEL &&
-                    (walker->table[index] & PT_PAGE_SIZE_MASK) &&
-                    (PTTYPE == 64 || is_pse(vcpu))))
-                       return &walker->table[index];
-               if (walker->level != 3 || is_long_mode(vcpu))
-                       walker->inherited_ar &= walker->table[index];
-               paddr = safe_gpa_to_hpa(vcpu, walker->table[index] & PT_BASE_ADDR_MASK);
-               kunmap_atomic(walker->table, KM_USER0);
-               walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT),
-                                           KM_USER0);
-               --walker->level;
-       }
+                      guest_pde & PT_DIRTY_MASK, access_bits, gfn);
 }
 
 /*
@@ -150,15 +189,26 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
        hpa_t shadow_addr;
        int level;
        u64 *prev_shadow_ent = NULL;
+       pt_element_t *guest_ent = walker->ptep;
+
+       if (!is_present_pte(*guest_ent))
+               return NULL;
 
        shadow_addr = vcpu->mmu.root_hpa;
        level = vcpu->mmu.shadow_root_level;
+       if (level == PT32E_ROOT_LEVEL) {
+               shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3];
+               shadow_addr &= PT64_BASE_ADDR_MASK;
+               --level;
+       }
 
        for (; ; level--) {
                u32 index = SHADOW_PT_INDEX(addr, level);
                u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index;
-               pt_element_t *guest_ent;
+               struct kvm_mmu_page *shadow_page;
                u64 shadow_pte;
+               int metaphysical;
+               gfn_t table_gfn;
 
                if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
                        if (level == PT_PAGE_TABLE_LEVEL)
@@ -168,21 +218,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                        continue;
                }
 
-               if (PTTYPE == 32 && level > PT32_ROOT_LEVEL) {
-                       ASSERT(level == PT32E_ROOT_LEVEL);
-                       guest_ent = FNAME(fetch_guest)(vcpu, walker,
-                                                      PT32_ROOT_LEVEL, addr);
-               } else
-                       guest_ent = FNAME(fetch_guest)(vcpu, walker,
-                                                      level, addr);
-
-               if (!is_present_pte(*guest_ent))
-                       return NULL;
-
-               /* Don't set accessed bit on PAE PDPTRs */
-               if (vcpu->mmu.root_level != 3 || walker->level != 3)
-                       *guest_ent |= PT_ACCESSED_MASK;
-
                if (level == PT_PAGE_TABLE_LEVEL) {
 
                        if (walker->level == PT_DIRECTORY_LEVEL) {
@@ -190,21 +225,30 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                                        *prev_shadow_ent |= PT_SHADOW_PS_MARK;
                                FNAME(set_pde)(vcpu, *guest_ent, shadow_ent,
                                               walker->inherited_ar,
-                                         PT_INDEX(addr, PT_PAGE_TABLE_LEVEL));
+                                              walker->gfn);
                        } else {
                                ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
-                               FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, walker->inherited_ar);
+                               FNAME(set_pte)(vcpu, *guest_ent, shadow_ent,
+                                              walker->inherited_ar,
+                                              walker->gfn);
                        }
                        return shadow_ent;
                }
 
-               shadow_addr = kvm_mmu_alloc_page(vcpu, shadow_ent);
-               if (!VALID_PAGE(shadow_addr))
-                       return ERR_PTR(-ENOMEM);
-               shadow_pte = shadow_addr | PT_PRESENT_MASK;
-               if (vcpu->mmu.root_level > 3 || level != 3)
-                       shadow_pte |= PT_ACCESSED_MASK
-                               | PT_WRITABLE_MASK | PT_USER_MASK;
+               if (level - 1 == PT_PAGE_TABLE_LEVEL
+                   && walker->level == PT_DIRECTORY_LEVEL) {
+                       metaphysical = 1;
+                       table_gfn = (*guest_ent & PT_BASE_ADDR_MASK)
+                               >> PAGE_SHIFT;
+               } else {
+                       metaphysical = 0;
+                       table_gfn = walker->table_gfn[level - 2];
+               }
+               shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
+                                              metaphysical, shadow_ent);
+               shadow_addr = shadow_page->page_hpa;
+               shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
+                       | PT_WRITABLE_MASK | PT_USER_MASK;
                *shadow_ent = shadow_pte;
                prev_shadow_ent = shadow_ent;
        }
@@ -221,11 +265,13 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu,
                               u64 *shadow_ent,
                               struct guest_walker *walker,
                               gva_t addr,
-                              int user)
+                              int user,
+                              int *write_pt)
 {
        pt_element_t *guest_ent;
        int writable_shadow;
        gfn_t gfn;
+       struct kvm_mmu_page *page;
 
        if (is_writeble_pte(*shadow_ent))
                return 0;
@@ -250,17 +296,35 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu,
                        *shadow_ent &= ~PT_USER_MASK;
                }
 
-       guest_ent = FNAME(fetch_guest)(vcpu, walker, PT_PAGE_TABLE_LEVEL, addr);
+       guest_ent = walker->ptep;
 
        if (!is_present_pte(*guest_ent)) {
                *shadow_ent = 0;
                return 0;
        }
 
-       gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+       gfn = walker->gfn;
+
+       if (user) {
+               /*
+                * Usermode page faults won't be for page table updates.
+                */
+               while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
+                       pgprintk("%s: zap %lx %x\n",
+                                __FUNCTION__, gfn, page->role.word);
+                       kvm_mmu_zap_page(vcpu, page);
+               }
+       } else if (kvm_mmu_lookup_page(vcpu, gfn)) {
+               pgprintk("%s: found shadow page for %lx, marking ro\n",
+                        __FUNCTION__, gfn);
+               *guest_ent |= PT_DIRTY_MASK;
+               *write_pt = 1;
+               return 0;
+       }
        mark_page_dirty(vcpu->kvm, gfn);
        *shadow_ent |= PT_WRITABLE_MASK;
        *guest_ent |= PT_DIRTY_MASK;
+       rmap_add(vcpu, shadow_ent);
 
        return 1;
 }
@@ -276,7 +340,8 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu,
  *   - normal guest page fault due to the guest pte marked not present, not
  *     writable, or not executable
  *
- *  Returns: 1 if we need to emulate the instruction, 0 otherwise
+ *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
+ *           a negative value on error.
  */
 static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
                               u32 error_code)
@@ -287,39 +352,47 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        struct guest_walker walker;
        u64 *shadow_pte;
        int fixed;
+       int write_pt = 0;
+       int r;
+
+       pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
+       kvm_mmu_audit(vcpu, "pre page fault");
+
+       r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               return r;
 
        /*
         * Look up the shadow pte for the faulting address.
         */
-       for (;;) {
-               FNAME(init_walker)(&walker, vcpu);
-               shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
-               if (IS_ERR(shadow_pte)) {  /* must be -ENOMEM */
-                       nonpaging_flush(vcpu);
-                       FNAME(release_walker)(&walker);
-                       continue;
-               }
-               break;
-       }
+       FNAME(walk_addr)(&walker, vcpu, addr);
+       shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
 
        /*
         * The page is not mapped by the guest.  Let the guest handle it.
         */
        if (!shadow_pte) {
+               pgprintk("%s: not mapped\n", __FUNCTION__);
                inject_page_fault(vcpu, addr, error_code);
                FNAME(release_walker)(&walker);
                return 0;
        }
 
+       pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__,
+                shadow_pte, *shadow_pte);
+
        /*
         * Update the shadow pte.
         */
        if (write_fault)
                fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr,
-                                           user_fault);
+                                           user_fault, &write_pt);
        else
                fixed = fix_read_pf(shadow_pte);
 
+       pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__,
+                shadow_pte, *shadow_pte);
+
        FNAME(release_walker)(&walker);
 
        /*
@@ -331,20 +404,23 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
                pgprintk("%s: io work, no access\n", __FUNCTION__);
                inject_page_fault(vcpu, addr,
                                  error_code | PFERR_PRESENT_MASK);
+               kvm_mmu_audit(vcpu, "post page fault (io)");
                return 0;
        }
 
        /*
         * pte not present, guest page fault.
         */
-       if (pte_present && !fixed) {
+       if (pte_present && !fixed && !write_pt) {
                inject_page_fault(vcpu, addr, error_code);
+               kvm_mmu_audit(vcpu, "post page fault (guest)");
                return 0;
        }
 
        ++kvm_stat.pf_fixed;
+       kvm_mmu_audit(vcpu, "post page fault (fixed)");
 
-       return 0;
+       return write_pt;
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -353,9 +429,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
        pt_element_t guest_pte;
        gpa_t gpa;
 
-       FNAME(init_walker)(&walker, vcpu);
-       guest_pte = *FNAME(fetch_guest)(vcpu, &walker, PT_PAGE_TABLE_LEVEL,
-                                       vaddr);
+       FNAME(walk_addr)(&walker, vcpu, vaddr);
+       guest_pte = *walker.ptep;
        FNAME(release_walker)(&walker);
 
        if (!is_present_pte(guest_pte))
@@ -389,3 +464,4 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
 #undef PT_PTE_COPY_MASK
 #undef PT_NON_PTE_COPY_MASK
 #undef PT_DIR_BASE_ADDR_MASK
+#undef PT_MAX_FULL_LEVELS
index fa04287..ccc06b1 100644 (file)
@@ -235,6 +235,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 
        vcpu->rip = vcpu->svm->vmcb->save.rip = vcpu->svm->next_rip;
        vcpu->svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+
+       vcpu->interrupt_window_open = 1;
 }
 
 static int has_svm(void)
@@ -495,7 +497,6 @@ static void init_vmcb(struct vmcb *vmcb)
                /*              (1ULL << INTERCEPT_SELECTIVE_CR0) | */
                                (1ULL << INTERCEPT_CPUID) |
                                (1ULL << INTERCEPT_HLT) |
-                               (1ULL << INTERCEPT_INVLPG) |
                                (1ULL << INTERCEPT_INVLPGA) |
                                (1ULL << INTERCEPT_IOIO_PROT) |
                                (1ULL << INTERCEPT_MSR_PROT) |
@@ -700,6 +701,10 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
        vcpu->svm->vmcb->save.gdtr.base = dt->base ;
 }
 
+/*
+ * Empty implementation of the decache_cr0_cr4_guest_bits hook; it is
+ * installed in svm_arch_ops.
+ * NOTE(review): presumably SVM keeps no guest-owned cr0/cr4 bits that
+ * would need reading back -- confirm against the callers of the hook.
+ */
+static void svm_decache_cr0_cr4_guest_bits(struct kvm_vcpu *vcpu)
+{
+}
+
 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
 #ifdef CONFIG_X86_64
@@ -847,6 +852,7 @@ static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        u64 fault_address;
        u32 error_code;
        enum emulation_result er;
+       int r;
 
        if (is_external_interrupt(exit_int_info))
                push_irq(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
@@ -855,7 +861,12 @@ static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
        fault_address  = vcpu->svm->vmcb->control.exit_info_2;
        error_code = vcpu->svm->vmcb->control.exit_info_1;
-       if (!vcpu->mmu.page_fault(vcpu, fault_address, error_code)) {
+       r = kvm_mmu_page_fault(vcpu, fault_address, error_code);
+       if (r < 0) {
+               spin_unlock(&vcpu->kvm->lock);
+               return r;
+       }
+       if (!r) {
                spin_unlock(&vcpu->kvm->lock);
                return 1;
        }
@@ -1031,10 +1042,11 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1;
        skip_emulated_instruction(vcpu);
-       if (vcpu->irq_summary && (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF))
+       if (vcpu->irq_summary)
                return 1;
 
        kvm_run->exit_reason = KVM_EXIT_HLT;
+       ++kvm_stat.halt_exits;
        return 0;
 }
 
@@ -1186,6 +1198,23 @@ static int msr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                return rdmsr_interception(vcpu, kvm_run);
 }
 
+/*
+ * Exit handler registered for SVM_EXIT_VINTR.
+ *
+ * Returns 0, with exit_reason set to KVM_EXIT_IRQ_WINDOW_OPEN, when
+ * userspace asked for an interrupt window and vcpu->irq_summary shows
+ * nothing queued; returns 1 in every other case.
+ */
+static int interrupt_window_interception(struct kvm_vcpu *vcpu,
+                                  struct kvm_run *kvm_run)
+{
+       /* no window requested, or an interrupt is already queued */
+       if (!kvm_run->request_interrupt_window || vcpu->irq_summary)
+               return 1;
+
+       /* userspace is waiting to inject: exit as soon as possible */
+       ++kvm_stat.irq_window_exits;
+       kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+       return 0;
+}
+
 static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu,
                                      struct kvm_run *kvm_run) = {
        [SVM_EXIT_READ_CR0]                     = emulate_on_interception,
@@ -1210,6 +1239,7 @@ static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu,
        [SVM_EXIT_NMI]                          = nop_on_interception,
        [SVM_EXIT_SMI]                          = nop_on_interception,
        [SVM_EXIT_INIT]                         = nop_on_interception,
+       [SVM_EXIT_VINTR]                        = interrupt_window_interception,
        /* [SVM_EXIT_CR0_SEL_WRITE]             = emulate_on_interception, */
        [SVM_EXIT_CPUID]                        = cpuid_interception,
        [SVM_EXIT_HLT]                          = halt_interception,
@@ -1278,15 +1308,11 @@ static void pre_svm_run(struct kvm_vcpu *vcpu)
 }
 
 
-static inline void kvm_try_inject_irq(struct kvm_vcpu *vcpu)
+static inline void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
 {
        struct vmcb_control_area *control;
 
-       if (!vcpu->irq_summary)
-               return;
-
        control = &vcpu->svm->vmcb->control;
-
        control->int_vector = pop_irq(vcpu);
        control->int_ctl &= ~V_INTR_PRIO_MASK;
        control->int_ctl |= V_IRQ_MASK |
@@ -1301,6 +1327,59 @@ static void kvm_reput_irq(struct kvm_vcpu *vcpu)
                control->int_ctl &= ~V_IRQ_MASK;
                push_irq(vcpu, control->int_vector);
        }
+
+       vcpu->interrupt_window_open =
+               !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
+}
+
+/*
+ * Called before entering the guest: recompute whether the interrupt
+ * window is open, inject a queued interrupt if it is, and otherwise
+ * arm the VINTR intercept when there is a reason to wait for it.
+ */
+static void do_interrupt_requests(struct kvm_vcpu *vcpu,
+                                      struct kvm_run *kvm_run)
+{
+       struct vmcb_control_area *control = &vcpu->svm->vmcb->control;
+       int shadowed = (control->int_state & SVM_INTERRUPT_SHADOW_MASK) != 0;
+
+       /* open means: no interrupt shadow and IF set in guest rflags */
+       vcpu->interrupt_window_open =
+               (!shadowed &&
+                (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF));
+
+       /* interrupts enabled and not blocked by sti or mov ss: inject */
+       if (vcpu->interrupt_window_open && vcpu->irq_summary)
+               kvm_do_inject_irq(vcpu);
+
+       if (vcpu->interrupt_window_open ||
+           (!vcpu->irq_summary && !kvm_run->request_interrupt_window)) {
+               /* nothing to wait for */
+               control->intercept &= ~(1ULL << INTERCEPT_VINTR);
+       } else {
+               /* interrupts blocked: ask for an exit once they unblock */
+               control->intercept |= 1ULL << INTERCEPT_VINTR;
+       }
+}
+
+/*
+ * Copy the interrupt-related vcpu state into @kvm_run before returning
+ * to userspace: whether an interrupt may be injected now, the guest IF
+ * flag, cr8 and the apic base.
+ */
+static void post_kvm_run_save(struct kvm_vcpu *vcpu,
+                             struct kvm_run *kvm_run)
+{
+       unsigned long guest_if;
+
+       guest_if = vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF;
+
+       kvm_run->apic_base = vcpu->apic_base;
+       kvm_run->cr8 = vcpu->cr8;
+       kvm_run->if_flag = guest_if != 0;
+       /* ready only if the window is open and nothing is queued */
+       kvm_run->ready_for_interrupt_injection =
+               (vcpu->interrupt_window_open && vcpu->irq_summary == 0);
+}
+
+/*
+ * Decide whether to return to userspace so it can inject an interrupt.
+ *
+ * Returns 1 only when no interrupt is queued, userspace requested an
+ * interrupt window, the window is open and IF is set in the guest
+ * rflags; returns 0 otherwise.
+ */
+static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
+                                         struct kvm_run *kvm_run)
+{
+       /* an interrupt is already queued: no need to exit */
+       if (vcpu->irq_summary)
+               return 0;
+       if (!kvm_run->request_interrupt_window)
+               return 0;
+       if (!vcpu->interrupt_window_open)
+               return 0;
+
+       return (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF) != 0;
 }
 
 static void save_db_regs(unsigned long *db_regs)
@@ -1324,9 +1403,10 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        u16 fs_selector;
        u16 gs_selector;
        u16 ldt_selector;
+       int r;
 
 again:
-       kvm_try_inject_irq(vcpu);
+       do_interrupt_requests(vcpu, kvm_run);
 
        clgi();
 
@@ -1487,18 +1567,28 @@ again:
        if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
                kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY;
                kvm_run->exit_reason = vcpu->svm->vmcb->control.exit_code;
+               post_kvm_run_save(vcpu, kvm_run);
                return 0;
        }
 
-       if (handle_exit(vcpu, kvm_run)) {
+       r = handle_exit(vcpu, kvm_run);
+       if (r > 0) {
                if (signal_pending(current)) {
                        ++kvm_stat.signal_exits;
+                       post_kvm_run_save(vcpu, kvm_run);
+                       return -EINTR;
+               }
+
+               if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+                       ++kvm_stat.request_irq_exits;
+                       post_kvm_run_save(vcpu, kvm_run);
                        return -EINTR;
                }
                kvm_resched(vcpu);
                goto again;
        }
-       return 0;
+       post_kvm_run_save(vcpu, kvm_run);
+       return r;
 }
 
 static void svm_flush_tlb(struct kvm_vcpu *vcpu)
@@ -1565,6 +1655,7 @@ static struct kvm_arch_ops svm_arch_ops = {
        .get_segment = svm_get_segment,
        .set_segment = svm_set_segment,
        .get_cs_db_l_bits = svm_get_cs_db_l_bits,
+       .decache_cr0_cr4_guest_bits = svm_decache_cr0_cr4_guest_bits,
        .set_cr0 = svm_set_cr0,
        .set_cr0_no_modeswitch = svm_set_cr0,
        .set_cr3 = svm_set_cr3,
index d0a2c2d..d4701cb 100644 (file)
@@ -116,7 +116,7 @@ static void vmcs_clear(struct vmcs *vmcs)
 static void __vcpu_clear(void *arg)
 {
        struct kvm_vcpu *vcpu = arg;
-       int cpu = smp_processor_id();
+       int cpu = raw_smp_processor_id();
 
        if (vcpu->cpu == cpu)
                vmcs_clear(vcpu->vmcs);
@@ -152,15 +152,21 @@ static u64 vmcs_read64(unsigned long field)
 #endif
 }
 
+/*
+ * Report a failed VMWRITE: log the field, the value that was being written
+ * and the VM_INSTRUCTION_ERROR field read back from the VMCS, then dump the
+ * stack.  Declared noinline so the reporting code stays out of its caller.
+ */
+static noinline void vmwrite_error(unsigned long field, unsigned long value)
+{
+       printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
+              field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
+       dump_stack();
+}
+
 static void vmcs_writel(unsigned long field, unsigned long value)
 {
        u8 error;
 
        asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
                       : "=q"(error) : "a"(value), "d"(field) : "cc" );
-       if (error)
-               printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
-                      field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
+       if (unlikely(error))
+               vmwrite_error(field, value);
 }
 
 static void vmcs_write16(unsigned long field, u16 value)
@@ -263,6 +269,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
        if (interruptibility & 3)
                vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
                             interruptibility & ~3);
+       vcpu->interrupt_window_open = 1;
 }
 
 static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
@@ -541,7 +548,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
 
 static struct vmcs *alloc_vmcs(void)
 {
-       return alloc_vmcs_cpu(smp_processor_id());
+       return alloc_vmcs_cpu(raw_smp_processor_id());
 }
 
 static void free_vmcs(struct vmcs *vmcs)
@@ -736,6 +743,15 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 #endif
 
+/*
+ * Refresh the cached vcpu->cr0 and vcpu->cr4 from the VMCS.
+ *
+ * Bits covered by KVM_GUEST_CR0_MASK / KVM_GUEST_CR4_MASK keep their cached
+ * value; every other bit is taken from GUEST_CR0 / GUEST_CR4.
+ */
+static void vmx_decache_cr0_cr4_guest_bits(struct kvm_vcpu *vcpu)
+{
+       vcpu->cr0 = (vcpu->cr0 & KVM_GUEST_CR0_MASK) |
+                   (vmcs_readl(GUEST_CR0) & ~KVM_GUEST_CR0_MASK);
+
+       vcpu->cr4 = (vcpu->cr4 & KVM_GUEST_CR4_MASK) |
+                   (vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK);
+}
+
 static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
@@ -1011,8 +1027,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
        vmcs_writel(GUEST_RIP, 0xfff0);
        vmcs_writel(GUEST_RSP, 0);
 
-       vmcs_writel(GUEST_CR3, 0);
-
        //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
        vmcs_writel(GUEST_DR7, 0x400);
 
@@ -1049,7 +1063,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
                               | CPU_BASED_CR8_LOAD_EXITING    /* 20.6.2 */
                               | CPU_BASED_CR8_STORE_EXITING   /* 20.6.2 */
                               | CPU_BASED_UNCOND_IO_EXITING   /* 20.6.2 */
-                              | CPU_BASED_INVDPG_EXITING
                               | CPU_BASED_MOV_DR_EXITING
                               | CPU_BASED_USE_TSC_OFFSETING   /* 21.3 */
                        );
@@ -1094,14 +1107,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
        rdmsrl(MSR_IA32_SYSENTER_EIP, a);
        vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
 
-       ret = -ENOMEM;
-       vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
-       if (!vcpu->guest_msrs)
-               goto out;
-       vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
-       if (!vcpu->host_msrs)
-               goto out_free_guest_msrs;
-
        for (i = 0; i < NR_VMX_MSR; ++i) {
                u32 index = vmx_msr_index[i];
                u32 data_low, data_high;
@@ -1155,8 +1160,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
 
        return 0;
 
-out_free_guest_msrs:
-       kfree(vcpu->guest_msrs);
 out:
        return ret;
 }
@@ -1224,21 +1227,34 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
                        irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
 }
 
-static void kvm_try_inject_irq(struct kvm_vcpu *vcpu)
+
+static void do_interrupt_requests(struct kvm_vcpu *vcpu,
+                                      struct kvm_run *kvm_run)
 {
-       if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
-           && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
+       u32 cpu_based_vm_exec_control;
+
+       vcpu->interrupt_window_open =
+               ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+                (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
+
+       if (vcpu->interrupt_window_open &&
+           vcpu->irq_summary &&
+           !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
                /*
-                * Interrupts enabled, and not blocked by sti or mov ss. Good.
+                * If interrupts enabled, and not blocked by sti or mov ss. Good.
                 */
                kvm_do_inject_irq(vcpu);
-       else
+
+       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+       if (!vcpu->interrupt_window_open &&
+           (vcpu->irq_summary || kvm_run->request_interrupt_window))
                /*
                 * Interrupts blocked.  Wait for unblock.
                 */
-               vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
-                            vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
-                            | CPU_BASED_VIRTUAL_INTR_PENDING);
+               cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
+       else
+               cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 }
 
 static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
@@ -1277,6 +1293,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        unsigned long cr2, rip;
        u32 vect_info;
        enum emulation_result er;
+       int r;
 
        vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
        intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
@@ -1305,7 +1322,12 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                cr2 = vmcs_readl(EXIT_QUALIFICATION);
 
                spin_lock(&vcpu->kvm->lock);
-               if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
+               r = kvm_mmu_page_fault(vcpu, cr2, error_code);
+               if (r < 0) {
+                       spin_unlock(&vcpu->kvm->lock);
+                       return r;
+               }
+               if (!r) {
                        spin_unlock(&vcpu->kvm->lock);
                        return 1;
                }
@@ -1425,17 +1447,6 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        return 0;
 }
 
-static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-       u64 address = vmcs_read64(EXIT_QUALIFICATION);
-       int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
-       spin_lock(&vcpu->kvm->lock);
-       vcpu->mmu.inval_page(vcpu, address);
-       spin_unlock(&vcpu->kvm->lock);
-       vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
-       return 1;
-}
-
 static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        u64 exit_qualification;
@@ -1575,23 +1586,40 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        return 1;
 }
 
+/*
+ * Fill in the vcpu state reported through kvm_run:
+ *  - if_flag: whether IF is set in GUEST_RFLAGS;
+ *  - cr8 and apic_base: copied from the vcpu;
+ *  - ready_for_interrupt_injection: the interrupt window is open and
+ *    nothing is pending in vcpu->irq_summary.
+ */
+static void post_kvm_run_save(struct kvm_vcpu *vcpu,
+                             struct kvm_run *kvm_run)
+{
+       kvm_run->if_flag = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) != 0;
+       kvm_run->cr8 = vcpu->cr8;
+       kvm_run->apic_base = vcpu->apic_base;
+       kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open &&
+                                                 vcpu->irq_summary == 0);
+}
+
 static int handle_interrupt_window(struct kvm_vcpu *vcpu,
                                   struct kvm_run *kvm_run)
 {
-       /* Turn off interrupt window reporting. */
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
-                    vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
-                    & ~CPU_BASED_VIRTUAL_INTR_PENDING);
+       /*
+        * If the user space waits to inject interrupts, exit as soon as
+        * possible
+        */
+       if (kvm_run->request_interrupt_window &&
+           !vcpu->irq_summary) {
+               kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+               ++kvm_stat.irq_window_exits;
+               return 0;
+       }
        return 1;
 }
 
 static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        skip_emulated_instruction(vcpu);
-       if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF))
+       if (vcpu->irq_summary)
                return 1;
 
        kvm_run->exit_reason = KVM_EXIT_HLT;
+       ++kvm_stat.halt_exits;
        return 0;
 }
 
@@ -1605,7 +1633,6 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
        [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
        [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
        [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
-       [EXIT_REASON_INVLPG]                  = handle_invlpg,
        [EXIT_REASON_CR_ACCESS]               = handle_cr,
        [EXIT_REASON_DR_ACCESS]               = handle_dr,
        [EXIT_REASON_CPUID]                   = handle_cpuid,
@@ -1642,11 +1669,27 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        return 0;
 }
 
+/*
+ * Decide whether the run loop should return to userspace so that an
+ * interrupt can be injected from there.
+ *
+ * Returns 1 only when nothing is queued in vcpu->irq_summary, userspace
+ * set kvm_run->request_interrupt_window, the interrupt window is open and
+ * IF is set in GUEST_RFLAGS; returns 0 otherwise.
+ */
+static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
+                                         struct kvm_run *kvm_run)
+{
+       /* an interrupt is already queued, no need to exit */
+       if (vcpu->irq_summary)
+               return 0;
+
+       /* userspace did not ask for the window */
+       if (!kvm_run->request_interrupt_window)
+               return 0;
+
+       if (!vcpu->interrupt_window_open)
+               return 0;
+
+       return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) != 0;
+}
+
 static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        u8 fail;
        u16 fs_sel, gs_sel, ldt_sel;
        int fs_gs_ldt_reload_needed;
+       int r;
 
 again:
        /*
@@ -1673,9 +1716,7 @@ again:
        vmcs_writel(HOST_GS_BASE, segment_base(gs_sel));
 #endif
 
-       if (vcpu->irq_summary &&
-           !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
-               kvm_try_inject_irq(vcpu);
+       do_interrupt_requests(vcpu, kvm_run);
 
        if (vcpu->guest_debug.enabled)
                kvm_guest_debug_pre(vcpu);
@@ -1812,6 +1853,7 @@ again:
 
        fx_save(vcpu->guest_fx_image);
        fx_restore(vcpu->host_fx_image);
+       vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
 
 #ifndef CONFIG_X86_64
        asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
@@ -1821,6 +1863,7 @@ again:
        if (fail) {
                kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY;
                kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
+               r = 0;
        } else {
                if (fs_gs_ldt_reload_needed) {
                        load_ldt(ldt_sel);
@@ -1840,17 +1883,28 @@ again:
                }
                vcpu->launched = 1;
                kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT;
-               if (kvm_handle_exit(kvm_run, vcpu)) {
+               r = kvm_handle_exit(kvm_run, vcpu);
+               if (r > 0) {
                        /* Give scheduler a change to reschedule. */
                        if (signal_pending(current)) {
                                ++kvm_stat.signal_exits;
+                               post_kvm_run_save(vcpu, kvm_run);
+                               return -EINTR;
+                       }
+
+                       if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+                               ++kvm_stat.request_irq_exits;
+                               post_kvm_run_save(vcpu, kvm_run);
                                return -EINTR;
                        }
+
                        kvm_resched(vcpu);
                        goto again;
                }
        }
-       return 0;
+
+       post_kvm_run_save(vcpu, kvm_run);
+       return r;
 }
 
 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
@@ -1906,13 +1960,33 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vmcs *vmcs;
 
+       vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!vcpu->guest_msrs)
+               return -ENOMEM;
+
+       vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!vcpu->host_msrs)
+               goto out_free_guest_msrs;
+
        vmcs = alloc_vmcs();
        if (!vmcs)
-               return -ENOMEM;
+               goto out_free_msrs;
+
        vmcs_clear(vmcs);
        vcpu->vmcs = vmcs;
        vcpu->launched = 0;
+
        return 0;
+
+out_free_msrs:
+       kfree(vcpu->host_msrs);
+       vcpu->host_msrs = NULL;
+
+out_free_guest_msrs:
+       kfree(vcpu->guest_msrs);
+       vcpu->guest_msrs = NULL;
+
+       return -ENOMEM;
 }
 
 static struct kvm_arch_ops vmx_arch_ops = {
@@ -1936,6 +2010,7 @@ static struct kvm_arch_ops vmx_arch_ops = {
        .get_segment = vmx_get_segment,
        .set_segment = vmx_set_segment,
        .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+       .decache_cr0_cr4_guest_bits = vmx_decache_cr0_cr4_guest_bits,
        .set_cr0 = vmx_set_cr0,
        .set_cr0_no_modeswitch = vmx_set_cr0_no_modeswitch,
        .set_cr3 = vmx_set_cr3,
index 1bff3e9..be70795 100644 (file)
@@ -1323,7 +1323,7 @@ twobyte_special_insn:
                                                         ctxt)) != 0))
                                goto done;
                        if ((old_lo != _regs[VCPU_REGS_RAX])
-                           || (old_hi != _regs[VCPU_REGS_RDI])) {
+                           || (old_hi != _regs[VCPU_REGS_RDX])) {
                                _regs[VCPU_REGS_RAX] = old_lo;
                                _regs[VCPU_REGS_RDX] = old_hi;
                                _eflags &= ~EFLG_ZF;
index fb1edc1..5091443 100644 (file)
@@ -16,7 +16,7 @@
 #include <linux/platform_device.h>
 #include <linux/leds.h>
 
-#include <asm/arch/hardware.h>
+#include <asm/hardware.h>
 #include <asm/arch/regs-gpio.h>
 #include <asm/arch/leds-gpio.h>
 
index c8558d4..8ca75e5 100644 (file)
@@ -44,6 +44,7 @@
 #include <linux/sysdev.h>
 #include <linux/freezer.h>
 #include <linux/syscalls.h>
+#include <linux/suspend.h>
 #include <linux/cpu.h>
 #include <asm/prom.h>
 #include <asm/machdep.h>
index f1dd81a..3cfb0a3 100644 (file)
@@ -19,7 +19,7 @@ config PCI_MSI
 
 config PCI_MULTITHREAD_PROBE
        bool "PCI Multi-threaded probe (EXPERIMENTAL)"
-       depends on PCI && EXPERIMENTAL
+       depends on PCI && EXPERIMENTAL && BROKEN
        help
          Say Y here if you want the PCI core to spawn a new thread for
          every PCI device that is probed.  This can cause a huge
index 45f2b20..fab381e 100644 (file)
@@ -193,6 +193,18 @@ static struct pci_dev * pci_find_subsys(unsigned int vendor,
        struct pci_dev *dev;
 
        WARN_ON(in_interrupt());
+
+       /*
+        * pci_find_subsys() can be called on the ide_setup() path, super-early
+        * in boot.  But the down_read() will enable local interrupts, which
+        * can cause some machines to crash.  So here we detect and flag that
+        * situation and bail out early.
+        */
+       if (unlikely(list_empty(&pci_devices))) {
+               printk(KERN_INFO "pci_find_subsys() called while pci_devices "
+                               "is still empty\n");
+               return NULL;
+       }
        down_read(&pci_bus_sem);
        n = from ? from->global_list.next : pci_devices.next;
 
@@ -259,6 +271,18 @@ pci_get_subsys(unsigned int vendor, unsigned int device,
        struct pci_dev *dev;
 
        WARN_ON(in_interrupt());
+
+       /*
+        * pci_get_subsys() can potentially be called by drivers super-early
+        * in boot.  But the down_read() will enable local interrupts, which
+        * can cause some machines to crash.  So here we detect and flag that
+        * situation and bail out early.
+        */
+       if (unlikely(list_empty(&pci_devices))) {
+               printk(KERN_NOTICE "pci_get_subsys() called while pci_devices "
+                               "is still empty\n");
+               return NULL;
+       }
        down_read(&pci_bus_sem);
        n = from ? from->global_list.next : pci_devices.next;
 
index 4f654c9..a724ab4 100644 (file)
@@ -33,6 +33,8 @@
 
 #include <asm/mach/time.h>
 
+#include <asm/arch/at91_rtc.h>
+
 
 #define AT91_RTC_FREQ          1
 #define AT91_RTC_EPOCH         1900UL  /* just like arch/arm/common/rtctime.c */
index 1460f6b..e7851e3 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * An I2C driver for the Ricoh RS5C372 RTC
+ * An I2C driver for Ricoh RS5C372 and RV5C38[67] RTCs
  *
  * Copyright (C) 2005 Pavel Mironchik <pmironchik@optifacio.net>
  * Copyright (C) 2006 Tower Technologies
@@ -13,7 +13,7 @@
 #include <linux/rtc.h>
 #include <linux/bcd.h>
 
-#define DRV_VERSION "0.3"
+#define DRV_VERSION "0.4"
 
 /* Addresses to scan */
 static unsigned short normal_i2c[] = { /* 0x32,*/ I2C_CLIENT_END };
@@ -21,6 +21,13 @@ static unsigned short normal_i2c[] = { /* 0x32,*/ I2C_CLIENT_END };
 /* Insmod parameters */
 I2C_CLIENT_INSMOD;
 
+
+/*
+ * Ricoh has a family of I2C based RTCs, which differ only slightly from
+ * each other.  Differences center on pinout (e.g. how many interrupts,
+ * output clock, etc) and how the control registers are used.  The '372
+ * is significant only because that's the one this driver first supported.
+ */
 #define RS5C372_REG_SECS       0
 #define RS5C372_REG_MINS       1
 #define RS5C372_REG_HOURS      2
@@ -29,59 +36,142 @@ I2C_CLIENT_INSMOD;
 #define RS5C372_REG_MONTH      5
 #define RS5C372_REG_YEAR       6
 #define RS5C372_REG_TRIM       7
+#      define RS5C372_TRIM_XSL         0x80
+#      define RS5C372_TRIM_MASK        0x7F
+
+#define RS5C_REG_ALARM_A_MIN   8                       /* or ALARM_W */
+#define RS5C_REG_ALARM_A_HOURS 9
+#define RS5C_REG_ALARM_A_WDAY  10
+
+#define RS5C_REG_ALARM_B_MIN   11                      /* or ALARM_D */
+#define RS5C_REG_ALARM_B_HOURS 12
+#define RS5C_REG_ALARM_B_WDAY  13                      /* (ALARM_B only) */
+
+#define RS5C_REG_CTRL1         14
+#      define RS5C_CTRL1_AALE          (1 << 7)        /* or WALE */
+#      define RS5C_CTRL1_BALE          (1 << 6)        /* or DALE */
+#      define RV5C387_CTRL1_24         (1 << 5)
+#      define RS5C372A_CTRL1_SL1       (1 << 5)
+#      define RS5C_CTRL1_CT_MASK       (7 << 0)
+#      define RS5C_CTRL1_CT0           (0 << 0)        /* no periodic irq */
+#      define RS5C_CTRL1_CT4           (4 << 0)        /* 1 Hz level irq */
+#define RS5C_REG_CTRL2         15
+#      define RS5C372_CTRL2_24         (1 << 5)
+#      define RS5C_CTRL2_XSTP          (1 << 4)
+#      define RS5C_CTRL2_CTFG          (1 << 2)
+#      define RS5C_CTRL2_AAFG          (1 << 1)        /* or WAFG */
+#      define RS5C_CTRL2_BAFG          (1 << 0)        /* or DAFG */
+
+
+/* to read (style 1) or write registers starting at R */
+#define RS5C_ADDR(R)           (((R) << 4) | 0)
+
+
+/* Chip variants this driver distinguishes; rtc_undef (0) means "not set". */
+enum rtc_type {
+       rtc_undef = 0,
+       rtc_rs5c372a,
+       rtc_rs5c372b,
+       rtc_rv5c386,
+       rtc_rv5c387a,
+};
 
-#define RS5C372_TRIM_XSL       0x80
-#define RS5C372_TRIM_MASK      0x7F
+/* REVISIT:  this assumes that:
+ *  - we're in the 21st century, so it's safe to ignore the century
+ *    bit for rv5c38[67] (REG_MONTH bit 7);
+ *  - we should use ALARM_A not ALARM_B (may be wrong on some boards)
+ */
+/* Per-chip driver state, stored as the i2c client data. */
+struct rs5c372 {
+       /* i2c client used for all transfers to the chip */
+       struct i2c_client       *client;
+       /* registered RTC class device; its name is used in log messages */
+       struct rtc_device       *rtc;
+       /* which chip variant this is */
+       enum rtc_type           type;
+       /* set when hours are kept in 24-hour form (see rs5c_reg2hr) */
+       unsigned                time24:1;
+       /* set when the chip is wired to an IRQ; gates the AIE/UIE ioctls */
+       unsigned                has_irq:1;
+       /* raw bytes from the last register read (rs5c_get_regs) */
+       char                    buf[17];
+       /* register 0 within buf; the probe code points it at buf[1] */
+       char                    *regs;
+
+       /* on conversion to a "new style" i2c driver, this vanishes */
+       struct i2c_client       dev;
+};
 
-#define RS5C372_REG_BASE       0
+/*
+ * Read sizeof rs5c->buf bytes from the chip into rs5c->buf with a single
+ * i2c read message, then dump regs[0..15] at debug level.
+ *
+ * Returns 0 on success or -EIO if the transfer did not complete.
+ *
+ * NOTE(review): buf/regs are plain char, so on configurations where char
+ * is signed a register value >= 0x80 would be sign-extended when passed
+ * to the %02x conversions below -- confirm whether that matters.
+ */
+static int rs5c_get_regs(struct rs5c372 *rs5c)
+{
+       struct i2c_client       *client = rs5c->client;
+       struct i2c_msg          msgs[] = {
+               { client->addr, I2C_M_RD, sizeof rs5c->buf, rs5c->buf },
+       };
+
+       /* This implements the third reading method from the datasheet, using
+        * an internal address that's reset after each transaction (by STOP)
+        * to 0x0f ... so we read extra registers, and skip the first one.
+        *
+        * The first method doesn't work with the iop3xx adapter driver, on at
+        * least 80219 chips; this works around that bug.
+        */
+       if ((i2c_transfer(client->adapter, msgs, 1)) != 1) {
+               pr_debug("%s: can't read registers\n", rs5c->rtc->name);
+               return -EIO;
+       }
 
-static int rs5c372_attach(struct i2c_adapter *adapter);
-static int rs5c372_detach(struct i2c_client *client);
-static int rs5c372_probe(struct i2c_adapter *adapter, int address, int kind);
+       dev_dbg(&client->dev,
+               "%02x %02x %02x (%02x) %02x %02x %02x (%02x), "
+               "%02x %02x %02x, %02x %02x %02x; %02x %02x\n",
+               rs5c->regs[0],  rs5c->regs[1],  rs5c->regs[2],  rs5c->regs[3],
+               rs5c->regs[4],  rs5c->regs[5],  rs5c->regs[6],  rs5c->regs[7],
+               rs5c->regs[8],  rs5c->regs[9],  rs5c->regs[10], rs5c->regs[11],
+               rs5c->regs[12], rs5c->regs[13], rs5c->regs[14], rs5c->regs[15]);
 
-struct rs5c372 {
-       u8 reg_addr;
-       u8 regs[17];
-       struct i2c_msg msg[1];
-       struct i2c_client client;
-       struct rtc_device *rtc;
-};
+       return 0;
+}
 
-static struct i2c_driver rs5c372_driver = {
-       .driver         = {
-               .name   = "rs5c372",
-       },
-       .attach_adapter = &rs5c372_attach,
-       .detach_client  = &rs5c372_detach,
-};
+/*
+ * Convert an hours register value to an hour in 0..23.
+ *
+ * With rs5c->time24 set the low six bits are plain BCD.  Otherwise the low
+ * five bits are a BCD hour where 12 is treated as 0, and bit 5 (0x20) adds
+ * twelve hours.
+ */
+static unsigned rs5c_reg2hr(struct rs5c372 *rs5c, unsigned reg)
+{
+       unsigned        hour;
 
-static int rs5c372_get_datetime(struct i2c_client *client, struct rtc_time *tm)
+       if (rs5c->time24)
+               return BCD2BIN(reg & 0x3f);
+
+       hour = BCD2BIN(reg & 0x1f);
+       if (hour == 12)
+               hour = 0;
+       if (reg & 0x20)
+               hour += 12;
+       return hour;
+}
+
+/*
+ * Convert an hour (0..23) to the hours register encoding; the inverse of
+ * rs5c_reg2hr().
+ *
+ * With rs5c->time24 set this is plain BCD.  Otherwise hours 13..23 become
+ * BCD 1..11 with bit 5 (0x20) set, 12 becomes BCD 12 with bit 5 set,
+ * 0 becomes BCD 12 with bit 5 clear, and 1..11 are plain BCD.
+ */
+static unsigned rs5c_hr2reg(struct rs5c372 *rs5c, unsigned hour)
 {
+       if (rs5c->time24)
+               return BIN2BCD(hour);
+
+       if (hour > 12)
+               return 0x20 | BIN2BCD(hour - 12);
+       if (hour == 12)
+               return 0x20 | BIN2BCD(12);
+       if (hour == 0)
+               return BIN2BCD(12);
+       return BIN2BCD(hour);
+}
 
-       struct rs5c372 *rs5c372 = i2c_get_clientdata(client);
-       u8 *buf = &(rs5c372->regs[1]);
+static int rs5c372_get_datetime(struct i2c_client *client, struct rtc_time *tm)
+{
+       struct rs5c372  *rs5c = i2c_get_clientdata(client);
+       int             status = rs5c_get_regs(rs5c);
 
-       /* this implements the 3rd reading method, according
-        * to the datasheet. rs5c372 defaults to internal
-        * address 0xF, so 0x0 is in regs[1]
-        */
+       if (status < 0)
+               return status;
 
-       if ((i2c_transfer(client->adapter, rs5c372->msg, 1)) != 1) {
-               dev_err(&client->dev, "%s: read error\n", __FUNCTION__);
-               return -EIO;
-       }
+       tm->tm_sec = BCD2BIN(rs5c->regs[RS5C372_REG_SECS] & 0x7f);
+       tm->tm_min = BCD2BIN(rs5c->regs[RS5C372_REG_MINS] & 0x7f);
+       tm->tm_hour = rs5c_reg2hr(rs5c, rs5c->regs[RS5C372_REG_HOURS]);
 
-       tm->tm_sec = BCD2BIN(buf[RS5C372_REG_SECS] & 0x7f);
-       tm->tm_min = BCD2BIN(buf[RS5C372_REG_MINS] & 0x7f);
-       tm->tm_hour = BCD2BIN(buf[RS5C372_REG_HOURS] & 0x3f);
-       tm->tm_wday = BCD2BIN(buf[RS5C372_REG_WDAY] & 0x07);
-       tm->tm_mday = BCD2BIN(buf[RS5C372_REG_DAY] & 0x3f);
+       tm->tm_wday = BCD2BIN(rs5c->regs[RS5C372_REG_WDAY] & 0x07);
+       tm->tm_mday = BCD2BIN(rs5c->regs[RS5C372_REG_DAY] & 0x3f);
 
        /* tm->tm_mon is zero-based */
-       tm->tm_mon = BCD2BIN(buf[RS5C372_REG_MONTH] & 0x1f) - 1;
+       tm->tm_mon = BCD2BIN(rs5c->regs[RS5C372_REG_MONTH] & 0x1f) - 1;
 
        /* year is 1900 + tm->tm_year */
-       tm->tm_year = BCD2BIN(buf[RS5C372_REG_YEAR]) + 100;
+       tm->tm_year = BCD2BIN(rs5c->regs[RS5C372_REG_YEAR]) + 100;
 
        dev_dbg(&client->dev, "%s: tm is secs=%d, mins=%d, hours=%d, "
                "mday=%d, mon=%d, year=%d, wday=%d\n",
@@ -89,22 +179,25 @@ static int rs5c372_get_datetime(struct i2c_client *client, struct rtc_time *tm)
                tm->tm_sec, tm->tm_min, tm->tm_hour,
                tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday);
 
-       return 0;
+       /* rtc might need initialization */
+       return rtc_valid_tm(tm);
 }
 
 static int rs5c372_set_datetime(struct i2c_client *client, struct rtc_time *tm)
 {
-       unsigned char buf[8] = { RS5C372_REG_BASE };
+       struct rs5c372  *rs5c = i2c_get_clientdata(client);
+       unsigned char   buf[8];
 
-       dev_dbg(&client->dev,
-               "%s: secs=%d, mins=%d, hours=%d "
+       dev_dbg(&client->dev, "%s: tm is secs=%d, mins=%d, hours=%d "
                "mday=%d, mon=%d, year=%d, wday=%d\n",
-               __FUNCTION__, tm->tm_sec, tm->tm_min, tm->tm_hour,
+               __FUNCTION__,
+               tm->tm_sec, tm->tm_min, tm->tm_hour,
                tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday);
 
+       buf[0] = RS5C_ADDR(RS5C372_REG_SECS);
        buf[1] = BIN2BCD(tm->tm_sec);
        buf[2] = BIN2BCD(tm->tm_min);
-       buf[3] = BIN2BCD(tm->tm_hour);
+       buf[3] = rs5c_hr2reg(rs5c, tm->tm_hour);
        buf[4] = BIN2BCD(tm->tm_wday);
        buf[5] = BIN2BCD(tm->tm_mday);
        buf[6] = BIN2BCD(tm->tm_mon + 1);
@@ -118,21 +211,43 @@ static int rs5c372_set_datetime(struct i2c_client *client, struct rtc_time *tm)
        return 0;
 }
 
+#if defined(CONFIG_RTC_INTF_PROC) || defined(CONFIG_RTC_INTF_PROC_MODULE)
+#define        NEED_TRIM
+#endif
+
+#if defined(CONFIG_RTC_INTF_SYSFS) || defined(CONFIG_RTC_INTF_SYSFS_MODULE)
+#define        NEED_TRIM
+#endif
+
+#ifdef NEED_TRIM
+/*
+ * Report the oscillator frequency and the trim value, both taken from the
+ * cached trim register (rs5c372->regs[RS5C372_REG_TRIM]); no i2c transfer
+ * is made here.
+ *
+ * @osc:  if non-NULL, receives 32000 when RS5C372_TRIM_XSL is set and
+ *        32768 otherwise.
+ * @trim: if non-NULL, receives the decoded trim.  The value is signed, so
+ *        it is written straight from the int computation; routing it
+ *        through the u8 register copy would turn negative trims into
+ *        large positive numbers.
+ *
+ * Always returns 0.
+ */
 static int rs5c372_get_trim(struct i2c_client *client, int *osc, int *trim)
 {
        struct rs5c372 *rs5c372 = i2c_get_clientdata(client);
-       u8 tmp = rs5c372->regs[RS5C372_REG_TRIM + 1];
+       u8 tmp = rs5c372->regs[RS5C372_REG_TRIM];
 
        if (osc)
                *osc = (tmp & RS5C372_TRIM_XSL) ? 32000 : 32768;
 
        if (trim) {
-               *trim = tmp & RS5C372_TRIM_MASK;
-               dev_dbg(&client->dev, "%s: raw trim=%x\n", __FUNCTION__, *trim);
+               dev_dbg(&client->dev, "%s: raw trim=%x\n", __FUNCTION__, tmp);
+               tmp &= RS5C372_TRIM_MASK;
+               if (tmp & 0x3e) {
+                       int t = tmp & 0x3f;
+
+                       /* bit 6 selects the negative direction */
+                       if (tmp & 0x40)
+                               t = (~t | (s8)0xc0) + 1;
+                       else
+                               t = t - 1;
+
+                       /* keep the sign: store the int, not a u8 */
+                       *trim = t * 2;
+               } else {
+                       *trim = 0;
+               }
        }
 
        return 0;
 }
+#endif
 
 static int rs5c372_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
@@ -144,25 +259,190 @@ static int rs5c372_rtc_set_time(struct device *dev, struct rtc_time *tm)
        return rs5c372_set_datetime(to_i2c_client(dev), tm);
 }
 
+#if defined(CONFIG_RTC_INTF_DEV) || defined(CONFIG_RTC_INTF_DEV_MODULE)
+
+/*
+ * Handle the interrupt-enable ioctls by rewriting control register 1:
+ * RTC_AIE_ON/OFF set or clear RS5C_CTRL1_AALE, RTC_UIE_ON/OFF select
+ * RS5C_CTRL1_CT4 or clear the RS5C_CTRL1_CT_MASK field.
+ *
+ * Returns 0 on success; -ENOIOCTLCMD for commands not handled here
+ * (unknown command, no IRQ wired up, or UIE on an rs5c372a whose cached
+ * CTRL1 has SL1 set); the error from rs5c_get_regs(); or -EIO if the
+ * register write fails.  The cached CTRL1 is only updated after a
+ * successful write.
+ */
+static int
+rs5c_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
+{
+       struct i2c_client       *client = to_i2c_client(dev);
+       struct rs5c372          *rs5c = i2c_get_clientdata(client);
+       unsigned char           buf[2];
+       int                     status;
+
+       /* cached copy, used here only for the 372a SL1 check */
+       buf[1] = rs5c->regs[RS5C_REG_CTRL1];
+       switch (cmd) {
+       case RTC_UIE_OFF:
+       case RTC_UIE_ON:
+               /* some 372a modes use a different IRQ pin for 1Hz irqs */
+               if (rs5c->type == rtc_rs5c372a
+                               && (buf[1] & RS5C372A_CTRL1_SL1))
+                       return -ENOIOCTLCMD;
+               /* fall through: the update irq needs an IRQ line too */
+       case RTC_AIE_OFF:
+       case RTC_AIE_ON:
+               /* these irq management calls only make sense for chips
+                * which are wired up to an IRQ.
+                */
+               if (!rs5c->has_irq)
+                       return -ENOIOCTLCMD;
+               break;
+       default:
+               return -ENOIOCTLCMD;
+       }
+
+       status = rs5c_get_regs(rs5c);
+       if (status < 0)
+               return status;
+
+       /* modify the value just read back, not the older cached one */
+       buf[1] = rs5c->regs[RS5C_REG_CTRL1];
+       buf[0] = RS5C_ADDR(RS5C_REG_CTRL1);
+       switch (cmd) {
+       case RTC_AIE_OFF:       /* alarm off */
+               buf[1] &= ~RS5C_CTRL1_AALE;
+               break;
+       case RTC_AIE_ON:        /* alarm on */
+               buf[1] |= RS5C_CTRL1_AALE;
+               break;
+       case RTC_UIE_OFF:       /* update off */
+               buf[1] &= ~RS5C_CTRL1_CT_MASK;
+               break;
+       case RTC_UIE_ON:        /* update on */
+               buf[1] &= ~RS5C_CTRL1_CT_MASK;
+               buf[1] |= RS5C_CTRL1_CT4;
+               break;
+       }
+       if ((i2c_master_send(client, buf, 2)) != 2) {
+               printk(KERN_WARNING "%s: can't update alarm\n",
+                       rs5c->rtc->name);
+               status = -EIO;
+       } else
+               rs5c->regs[RS5C_REG_CTRL1] = buf[1];
+       return status;
+}
+
+#else
+#define        rs5c_rtc_ioctl  NULL
+#endif
+
+
+/* NOTE:  Since RTC_WKALM_{RD,SET} were originally defined for EFI,
+ * which only exposes a polled programming interface; and since
+ * these calls map directly to those EFI requests; we don't demand
+ * we have an IRQ for this chip when we go through this API.
+ *
+ * The older x86_pc derived RTC_ALM_{READ,SET} calls require irqs
+ * though, managed through RTC_AIE_{ON,OFF} requests.
+ */
+
+/*
+ * Report alarm A from freshly read registers: minutes and hours come from
+ * the chip, seconds are reported as 0 and every other time field as -1.
+ * t->enabled mirrors RS5C_CTRL1_AALE and t->pending mirrors
+ * RS5C_CTRL2_AAFG.
+ *
+ * Returns 0, or the error from rs5c_get_regs().
+ */
+static int rs5c_read_alarm(struct device *dev, struct rtc_wkalrm *t)
+{
+       struct rs5c372  *rs5c = i2c_get_clientdata(to_i2c_client(dev));
+       int             err;
+
+       err = rs5c_get_regs(rs5c);
+       if (err < 0)
+               return err;
+
+       /* status flags */
+       t->pending = !!(rs5c->regs[RS5C_REG_CTRL2] & RS5C_CTRL2_AAFG);
+       t->enabled = !!(rs5c->regs[RS5C_REG_CTRL1] & RS5C_CTRL1_AALE);
+
+       /* only minutes and hours are read back from the chip */
+       t->time.tm_hour = rs5c_reg2hr(rs5c, rs5c->regs[RS5C_REG_ALARM_A_HOURS]);
+       t->time.tm_min = BCD2BIN(rs5c->regs[RS5C_REG_ALARM_A_MIN] & 0x7f);
+       t->time.tm_sec = 0;
+
+       /* the remaining fields are all set to -1 */
+       t->time.tm_isdst = -1;
+       t->time.tm_yday = -1;
+       t->time.tm_wday = -1;
+       t->time.tm_year = -1;
+       t->time.tm_mon = -1;
+       t->time.tm_mday = -1;
+
+       return 0;
+}
+
+/*
+ * Program alarm A with the minutes and hours from t->time and, if
+ * t->enabled is set, turn its interrupt enable bit (RS5C_CTRL1_AALE) on.
+ *
+ * Only alarms with tm_mday, tm_mon and tm_year all equal to -1 are
+ * accepted; anything else returns -EINVAL.  If the alarm is currently
+ * enabled it is disabled before the new time is written.
+ *
+ * Returns 0 on success, the error from rs5c_get_regs(), or -EIO when any
+ * of the register writes fails -- including the final enable, so that a
+ * caller asking for an enabled alarm is not told it succeeded when the
+ * enable bit never reached the chip.  The cached CTRL1 value is updated
+ * only after the corresponding write succeeded.
+ */
+static int rs5c_set_alarm(struct device *dev, struct rtc_wkalrm *t)
+{
+       struct i2c_client       *client = to_i2c_client(dev);
+       struct rs5c372          *rs5c = i2c_get_clientdata(client);
+       int                     status;
+       unsigned char           buf[4];
+
+       /* only handle up to 24 hours in the future, like RTC_ALM_SET */
+       if (t->time.tm_mday != -1
+                       || t->time.tm_mon != -1
+                       || t->time.tm_year != -1)
+               return -EINVAL;
+
+       /* REVISIT: round up tm_sec */
+
+       /* if needed, disable irq (clears pending status) */
+       status = rs5c_get_regs(rs5c);
+       if (status < 0)
+               return status;
+       if (rs5c->regs[RS5C_REG_CTRL1] & RS5C_CTRL1_AALE) {
+               buf[0] = RS5C_ADDR(RS5C_REG_CTRL1);
+               buf[1] = rs5c->regs[RS5C_REG_CTRL1] & ~RS5C_CTRL1_AALE;
+               if (i2c_master_send(client, buf, 2) != 2) {
+                       pr_debug("%s: can't disable alarm\n", rs5c->rtc->name);
+                       return -EIO;
+               }
+               rs5c->regs[RS5C_REG_CTRL1] = buf[1];
+       }
+
+       /* set alarm */
+       buf[0] = RS5C_ADDR(RS5C_REG_ALARM_A_MIN);
+       buf[1] = BIN2BCD(t->time.tm_min);
+       buf[2] = rs5c_hr2reg(rs5c, t->time.tm_hour);
+       buf[3] = 0x7f;  /* any/all days */
+       if ((i2c_master_send(client, buf, 4)) != 4) {
+               pr_debug("%s: can't set alarm time\n", rs5c->rtc->name);
+               return -EIO;
+       }
+
+       /* ... and maybe enable its irq */
+       if (t->enabled) {
+               buf[0] = RS5C_ADDR(RS5C_REG_CTRL1);
+               buf[1] = rs5c->regs[RS5C_REG_CTRL1] | RS5C_CTRL1_AALE;
+               if ((i2c_master_send(client, buf, 2)) != 2) {
+                       printk(KERN_WARNING "%s: can't enable alarm\n",
+                               rs5c->rtc->name);
+                       /* leave the cache alone: AALE was not written */
+                       return -EIO;
+               }
+               rs5c->regs[RS5C_REG_CTRL1] = buf[1];
+       }
+
+       return 0;
+}
+
+#if defined(CONFIG_RTC_INTF_PROC) || defined(CONFIG_RTC_INTF_PROC_MODULE)
+
 static int rs5c372_rtc_proc(struct device *dev, struct seq_file *seq)
 {
        int err, osc, trim;
 
        err = rs5c372_get_trim(to_i2c_client(dev), &osc, &trim);
        if (err == 0) {
-               seq_printf(seq, "%d.%03d KHz\n", osc / 1000, osc % 1000);
-               seq_printf(seq, "trim\t: %d\n", trim);
+               seq_printf(seq, "crystal\t\t: %d.%03d KHz\n",
+                               osc / 1000, osc % 1000);
+               seq_printf(seq, "trim\t\t: %d\n", trim);
        }
 
        return 0;
 }
 
+#else
+#define        rs5c372_rtc_proc        NULL
+#endif
+
 static const struct rtc_class_ops rs5c372_rtc_ops = {
        .proc           = rs5c372_rtc_proc,
+       .ioctl          = rs5c_rtc_ioctl,
        .read_time      = rs5c372_rtc_read_time,
        .set_time       = rs5c372_rtc_set_time,
+       .read_alarm     = rs5c_read_alarm,
+       .set_alarm      = rs5c_set_alarm,
 };
 
+#if defined(CONFIG_RTC_INTF_SYSFS) || defined(CONFIG_RTC_INTF_SYSFS_MODULE)
+
 static ssize_t rs5c372_sysfs_show_trim(struct device *dev,
                                struct device_attribute *attr, char *buf)
 {
@@ -172,7 +452,7 @@ static ssize_t rs5c372_sysfs_show_trim(struct device *dev,
        if (err)
                return err;
 
-       return sprintf(buf, "0x%2x\n", trim);
+       return sprintf(buf, "%d\n", trim);
 }
 static DEVICE_ATTR(trim, S_IRUGO, rs5c372_sysfs_show_trim, NULL);
 
@@ -189,16 +469,35 @@ static ssize_t rs5c372_sysfs_show_osc(struct device *dev,
 }
 static DEVICE_ATTR(osc, S_IRUGO, rs5c372_sysfs_show_osc, NULL);
 
-static int rs5c372_attach(struct i2c_adapter *adapter)
+static int rs5c_sysfs_register(struct device *dev)
 {
-       return i2c_probe(adapter, &addr_data, rs5c372_probe);
+       int err;
+
+       err = device_create_file(dev, &dev_attr_trim);
+       if (err)
+               return err;
+       err = device_create_file(dev, &dev_attr_osc);
+       if (err)
+               device_remove_file(dev, &dev_attr_trim);
+
+       return err;
+}
+
+#else
+static int rs5c_sysfs_register(struct device *dev)
+{
+       return 0;
 }
+#endif /* SYSFS */
+
+static struct i2c_driver rs5c372_driver;
 
 static int rs5c372_probe(struct i2c_adapter *adapter, int address, int kind)
 {
        int err = 0;
        struct i2c_client *client;
        struct rs5c372 *rs5c372;
+       struct rtc_time tm;
 
        dev_dbg(adapter->class_dev.dev, "%s\n", __FUNCTION__);
 
@@ -211,7 +510,15 @@ static int rs5c372_probe(struct i2c_adapter *adapter, int address, int kind)
                err = -ENOMEM;
                goto exit;
        }
-       client = &rs5c372->client;
+
+       /* we read registers 0x0f then 0x00-0x0f; skip the first one */
+       rs5c372->regs=&rs5c372->buf[1];
+
+       /* On conversion to a "new style" i2c driver, we'll be handed
+        * the i2c_client (we won't create it)
+        */
+       client = &rs5c372->dev;
+       rs5c372->client = client;
 
        /* I2C client */
        client->addr = address;
@@ -222,16 +529,99 @@ static int rs5c372_probe(struct i2c_adapter *adapter, int address, int kind)
 
        i2c_set_clientdata(client, rs5c372);
 
-       rs5c372->msg[0].addr = address;
-       rs5c372->msg[0].flags = I2C_M_RD;
-       rs5c372->msg[0].len = sizeof(rs5c372->regs);
-       rs5c372->msg[0].buf = rs5c372->regs;
-
        /* Inform the i2c layer */
        if ((err = i2c_attach_client(client)))
                goto exit_kfree;
 
-       dev_info(&client->dev, "chip found, driver version " DRV_VERSION "\n");
+       err = rs5c_get_regs(rs5c372);
+       if (err < 0)
+               goto exit_detach;
+
+       /* For "new style" drivers, irq is in i2c_client and chip type
+        * info comes from i2c_client.dev.platform_data.  Meanwhile:
+        *
+        * STICK BOARD-SPECIFIC SETUP CODE RIGHT HERE
+        */
+       if (rs5c372->type == rtc_undef) {
+               rs5c372->type = rtc_rs5c372b;
+               dev_warn(&client->dev, "assuming rs5c372b\n");
+       }
+
+       /* clock may be set for am/pm or 24 hr time */
+       switch (rs5c372->type) {
+       case rtc_rs5c372a:
+       case rtc_rs5c372b:
+               /* alarm uses ALARM_A; and nINTRA on 372a, nINTR on 372b.
+                * so does periodic irq, except some 372a modes.
+                */
+               if (rs5c372->regs[RS5C_REG_CTRL2] & RS5C372_CTRL2_24)
+                       rs5c372->time24 = 1;
+               break;
+       case rtc_rv5c386:
+       case rtc_rv5c387a:
+               if (rs5c372->regs[RS5C_REG_CTRL1] & RV5C387_CTRL1_24)
+                       rs5c372->time24 = 1;
+               /* alarm uses ALARM_W; and nINTRB for alarm and periodic
+                * irq, on both 386 and 387
+                */
+               break;
+       default:
+               dev_err(&client->dev, "unknown RTC type\n");
+               goto exit_detach;
+       }
+
+       /* if the oscillator lost power and no other software (like
+        * the bootloader) set it up, do it here.
+        */
+       if (rs5c372->regs[RS5C_REG_CTRL2] & RS5C_CTRL2_XSTP) {
+               unsigned char buf[3];
+
+               rs5c372->regs[RS5C_REG_CTRL2] &= ~RS5C_CTRL2_XSTP;
+
+               buf[0] = RS5C_ADDR(RS5C_REG_CTRL1);
+               buf[1] = rs5c372->regs[RS5C_REG_CTRL1];
+               buf[2] = rs5c372->regs[RS5C_REG_CTRL2];
+
+               /* use 24hr mode */
+               switch (rs5c372->type) {
+               case rtc_rs5c372a:
+               case rtc_rs5c372b:
+                       buf[2] |= RS5C372_CTRL2_24;
+                       rs5c372->time24 = 1;
+                       break;
+               case rtc_rv5c386:
+               case rtc_rv5c387a:
+                       buf[1] |= RV5C387_CTRL1_24;
+                       rs5c372->time24 = 1;
+                       break;
+               default:
+                       /* impossible */
+                       break;
+               }
+
+               if ((i2c_master_send(client, buf, 3)) != 3) {
+                       dev_err(&client->dev, "setup error\n");
+                       goto exit_detach;
+               }
+               rs5c372->regs[RS5C_REG_CTRL1] = buf[1];
+               rs5c372->regs[RS5C_REG_CTRL2] = buf[2];
+       }
+
+       if (rs5c372_get_datetime(client, &tm) < 0)
+               dev_warn(&client->dev, "clock needs to be set\n");
+
+       dev_info(&client->dev, "%s found, %s, driver version " DRV_VERSION "\n",
+                       ({ char *s; switch (rs5c372->type) {
+                       case rtc_rs5c372a:      s = "rs5c372a"; break;
+                       case rtc_rs5c372b:      s = "rs5c372b"; break;
+                       case rtc_rv5c386:       s = "rv5c386"; break;
+                       case rtc_rv5c387a:      s = "rv5c387a"; break;
+                       default:                s = "chip"; break;
+                       }; s;}),
+                       rs5c372->time24 ? "24hr" : "am/pm"
+                       );
+
+       /* FIXME when client->irq exists, use it to register alarm irq */
 
        rs5c372->rtc = rtc_device_register(rs5c372_driver.driver.name,
                                &client->dev, &rs5c372_rtc_ops, THIS_MODULE);
@@ -241,18 +631,12 @@ static int rs5c372_probe(struct i2c_adapter *adapter, int address, int kind)
                goto exit_detach;
        }
 
-       err = device_create_file(&client->dev, &dev_attr_trim);
+       err = rs5c_sysfs_register(&client->dev);
        if (err)
                goto exit_devreg;
-       err = device_create_file(&client->dev, &dev_attr_osc);
-       if (err)
-               goto exit_trim;
 
        return 0;
 
-exit_trim:
-       device_remove_file(&client->dev, &dev_attr_trim);
-
 exit_devreg:
        rtc_device_unregister(rs5c372->rtc);
 
@@ -266,6 +650,11 @@ exit:
        return err;
 }
 
+static int rs5c372_attach(struct i2c_adapter *adapter)
+{
+       return i2c_probe(adapter, &addr_data, rs5c372_probe);
+}
+
 static int rs5c372_detach(struct i2c_client *client)
 {
        int err;
@@ -274,6 +663,8 @@ static int rs5c372_detach(struct i2c_client *client)
        if (rs5c372->rtc)
                rtc_device_unregister(rs5c372->rtc);
 
+       /* REVISIT properly destroy the sysfs files ... */
+
        if ((err = i2c_detach_client(client)))
                return err;
 
@@ -281,6 +672,14 @@ static int rs5c372_detach(struct i2c_client *client)
        return 0;
 }
 
+static struct i2c_driver rs5c372_driver = {
+       .driver         = {
+               .name   = "rtc-rs5c372",
+       },
+       .attach_adapter = &rs5c372_attach,
+       .detach_client  = &rs5c372_detach,
+};
+
 static __init int rs5c372_init(void)
 {
        return i2c_add_driver(&rs5c372_driver);
index 24ee8be..6377db1 100644 (file)
@@ -217,6 +217,7 @@ static const struct quirk_printer_struct quirk_printers[] = {
        { 0x0409, 0xbef4, USBLP_QUIRK_BIDIR }, /* NEC Picty760 (HP OEM) */
        { 0x0409, 0xf0be, USBLP_QUIRK_BIDIR }, /* NEC Picty920 (HP OEM) */
        { 0x0409, 0xf1be, USBLP_QUIRK_BIDIR }, /* NEC Picty800 (HP OEM) */
+       { 0x0482, 0x0010, USBLP_QUIRK_BIDIR }, /* Kyocera Mita FS 820, by zut <kernel@zut.de> */
        { 0, 0 }
 };
 
index c505b76..5e628ae 100644 (file)
@@ -268,6 +268,7 @@ static void ep_device_release(struct device *dev)
        struct ep_device *ep_dev = to_ep_device(dev);
 
        dev_dbg(dev, "%s called for %s\n", __FUNCTION__, dev->bus_id);
+       endpoint_free_minor(ep_dev);
        kfree(ep_dev);
 }
 
@@ -349,7 +350,6 @@ void usb_remove_ep_files(struct usb_host_endpoint *endpoint)
                sprintf(name, "ep_%02x", endpoint->desc.bEndpointAddress);
                sysfs_remove_link(&ep_dev->dev.parent->kobj, name);
                sysfs_remove_group(&ep_dev->dev.kobj, &ep_dev_attr_grp);
-               endpoint_free_minor(ep_dev);
                device_unregister(&ep_dev->dev);
                endpoint->ep_dev = NULL;
                destroy_endpoint_class();
index 15d77c3..cdcfd42 100644 (file)
@@ -42,6 +42,7 @@
 #include <linux/usb_gadget.h>
 #include <linux/usb/otg.h>
 #include <linux/dma-mapping.h>
+#include <linux/clk.h>
 
 #include <asm/byteorder.h>
 #include <asm/io.h>
 /* bulk DMA seems to be behaving for both IN and OUT */
 #define        USE_DMA
 
+/* FIXME: OMAP2 currently has some problem in DMA mode */
+#ifdef CONFIG_ARCH_OMAP2
+#undef USE_DMA
+#endif
+
 /* ISO too */
 #define        USE_ISO
 
@@ -99,7 +105,7 @@ static unsigned fifo_mode = 0;
  * boot parameter "omap_udc:fifo_mode=42"
  */
 module_param (fifo_mode, uint, 0);
-MODULE_PARM_DESC (fifo_mode, "endpoint setup (0 == default)");
+MODULE_PARM_DESC (fifo_mode, "endpoint configuration");
 
 #ifdef USE_DMA
 static unsigned use_dma = 1;
@@ -122,7 +128,7 @@ static const char driver_desc [] = DRIVER_DESC;
 /*-------------------------------------------------------------------------*/
 
 /* there's a notion of "current endpoint" for modifying endpoint
- * state, and PIO access to its FIFO.  
+ * state, and PIO access to its FIFO.
  */
 
 static void use_ep(struct omap_ep *ep, u16 select)
@@ -391,7 +397,7 @@ done(struct omap_ep *ep, struct omap_req *req, int status)
 #define FIFO_EMPTY     (UDC_NON_ISO_FIFO_EMPTY | UDC_ISO_FIFO_EMPTY)
 #define FIFO_UNREADABLE (UDC_EP_HALTED | FIFO_EMPTY)
 
-static inline int 
+static inline int
 write_packet(u8 *buf, struct omap_req *req, unsigned max)
 {
        unsigned        len;
@@ -456,7 +462,7 @@ static int write_fifo(struct omap_ep *ep, struct omap_req *req)
        return is_last;
 }
 
-static inline int 
+static inline int
 read_packet(u8 *buf, struct omap_req *req, unsigned avail)
 {
        unsigned        len;
@@ -542,9 +548,9 @@ static inline dma_addr_t dma_csac(unsigned lch)
        /* omap 3.2/3.3 erratum: sometimes 0 is returned if CSAC/CDAC is
         * read before the DMA controller finished disabling the channel.
         */
-       csac = omap_readw(OMAP_DMA_CSAC(lch));
+       csac = OMAP_DMA_CSAC_REG(lch);
        if (csac == 0)
-               csac = omap_readw(OMAP_DMA_CSAC(lch));
+               csac = OMAP_DMA_CSAC_REG(lch);
        return csac;
 }
 
@@ -555,9 +561,9 @@ static inline dma_addr_t dma_cdac(unsigned lch)
        /* omap 3.2/3.3 erratum: sometimes 0 is returned if CSAC/CDAC is
         * read before the DMA controller finished disabling the channel.
         */
-       cdac = omap_readw(OMAP_DMA_CDAC(lch));
+       cdac = OMAP_DMA_CDAC_REG(lch);
        if (cdac == 0)
-               cdac = omap_readw(OMAP_DMA_CDAC(lch));
+               cdac = OMAP_DMA_CDAC_REG(lch);
        return cdac;
 }
 
@@ -582,7 +588,7 @@ static u16 dma_src_len(struct omap_ep *ep, dma_addr_t start)
 }
 
 #define DMA_DEST_LAST(x) (cpu_is_omap15xx() \
-               ? omap_readw(OMAP_DMA_CSAC(x)) /* really: CPC */ \
+               ? OMAP_DMA_CSAC_REG(x) /* really: CPC */ \
                : dma_cdac(x))
 
 static u16 dma_dest_len(struct omap_ep *ep, dma_addr_t start)
@@ -620,17 +626,19 @@ static void next_in_dma(struct omap_ep *ep, struct omap_req *req)
                        || (cpu_is_omap15xx() && length < ep->maxpacket)) {
                txdma_ctrl = UDC_TXN_EOT | length;
                omap_set_dma_transfer_params(ep->lch, OMAP_DMA_DATA_TYPE_S8,
-                               length, 1, sync_mode);
+                               length, 1, sync_mode, 0, 0);
        } else {
                length = min(length / ep->maxpacket,
                                (unsigned) UDC_TXN_TSC + 1);
-               txdma_ctrl = length;
+               txdma_ctrl = length;
                omap_set_dma_transfer_params(ep->lch, OMAP_DMA_DATA_TYPE_S16,
-                               ep->ep.maxpacket >> 1, length, sync_mode);
+                               ep->ep.maxpacket >> 1, length, sync_mode,
+                               0, 0);
                length *= ep->maxpacket;
        }
        omap_set_dma_src_params(ep->lch, OMAP_DMA_PORT_EMIFF,
-               OMAP_DMA_AMODE_POST_INC, req->req.dma + req->req.actual);
+               OMAP_DMA_AMODE_POST_INC, req->req.dma + req->req.actual,
+               0, 0);
 
        omap_start_dma(ep->lch);
        ep->dma_counter = dma_csac(ep->lch);
@@ -675,9 +683,11 @@ static void next_out_dma(struct omap_ep *ep, struct omap_req *req)
        req->dma_bytes = packets * ep->ep.maxpacket;
        omap_set_dma_transfer_params(ep->lch, OMAP_DMA_DATA_TYPE_S16,
                        ep->ep.maxpacket >> 1, packets,
-                       OMAP_DMA_SYNC_ELEMENT);
+                       OMAP_DMA_SYNC_ELEMENT,
+                       0, 0);
        omap_set_dma_dest_params(ep->lch, OMAP_DMA_PORT_EMIFF,
-               OMAP_DMA_AMODE_POST_INC, req->req.dma + req->req.actual);
+               OMAP_DMA_AMODE_POST_INC, req->req.dma + req->req.actual,
+               0, 0);
        ep->dma_counter = DMA_DEST_LAST(ep->lch);
 
        UDC_RXDMA_REG(ep->dma_channel) = UDC_RXN_STOP | (packets - 1);
@@ -820,7 +830,8 @@ static void dma_channel_claim(struct omap_ep *ep, unsigned channel)
                        omap_set_dma_dest_params(ep->lch,
                                OMAP_DMA_PORT_TIPB,
                                OMAP_DMA_AMODE_CONSTANT,
-                               (unsigned long) io_v2p((u32)&UDC_DATA_DMA_REG));
+                               (unsigned long) io_v2p((u32)&UDC_DATA_DMA_REG),
+                               0, 0);
                }
        } else {
                status = omap_request_dma(OMAP_DMA_USB_W2FC_RX0 - 1 + channel,
@@ -831,7 +842,8 @@ static void dma_channel_claim(struct omap_ep *ep, unsigned channel)
                        omap_set_dma_src_params(ep->lch,
                                OMAP_DMA_PORT_TIPB,
                                OMAP_DMA_AMODE_CONSTANT,
-                               (unsigned long) io_v2p((u32)&UDC_DATA_DMA_REG));
+                               (unsigned long) io_v2p((u32)&UDC_DATA_DMA_REG),
+                               0, 0);
                        /* EMIFF */
                        omap_set_dma_dest_burst_mode(ep->lch,
                                                OMAP_DMA_DATA_BURST_4);
@@ -846,7 +858,7 @@ static void dma_channel_claim(struct omap_ep *ep, unsigned channel)
 
                /* channel type P: hw synch (fifo) */
                if (!cpu_is_omap15xx())
-                       omap_writew(2, OMAP_DMA_LCH_CTRL(ep->lch));
+                       OMAP1_DMA_LCH_CTRL_REG(ep->lch) = 2;
        }
 
 just_restart:
@@ -893,7 +905,7 @@ static void dma_channel_release(struct omap_ep *ep)
        else
                req = NULL;
 
-       active = ((1 << 7) & omap_readl(OMAP_DMA_CCR(ep->lch))) != 0;
+       active = ((1 << 7) & OMAP_DMA_CCR_REG(ep->lch)) != 0;
 
        DBG("%s release %s %cxdma%d %p\n", ep->ep.name,
                        active ? "active" : "idle",
@@ -1117,7 +1129,7 @@ static int omap_ep_dequeue(struct usb_ep *_ep, struct usb_request *_req)
                 */
                dma_channel_release(ep);
                dma_channel_claim(ep, channel);
-       } else 
+       } else
                done(ep, req, -ECONNRESET);
        spin_unlock_irqrestore(&ep->udc->lock, flags);
        return 0;
@@ -1153,7 +1165,7 @@ static int omap_ep_set_halt(struct usb_ep *_ep, int value)
 
                /* IN endpoints must already be idle */
                if ((ep->bEndpointAddress & USB_DIR_IN)
-                               && !list_empty(&ep->queue)) { 
+                               && !list_empty(&ep->queue)) {
                        status = -EAGAIN;
                        goto done;
                }
@@ -1298,6 +1310,23 @@ static void pullup_disable(struct omap_udc *udc)
        UDC_SYSCON1_REG &= ~UDC_PULLUP_EN;
 }
 
+static struct omap_udc *udc;
+
+static void omap_udc_enable_clock(int enable)
+{
+       if (udc == NULL || udc->dc_clk == NULL || udc->hhc_clk == NULL)
+               return;
+
+       if (enable) {
+               clk_enable(udc->dc_clk);
+               clk_enable(udc->hhc_clk);
+               udelay(100);
+       } else {
+               clk_disable(udc->hhc_clk);
+               clk_disable(udc->dc_clk);
+       }
+}
+
 /*
  * Called by whatever detects VBUS sessions:  external transceiver
  * driver, or maybe GPIO0 VBUS IRQ.  May request 48 MHz clock.
@@ -1318,10 +1347,22 @@ static int omap_vbus_session(struct usb_gadget *gadget, int is_active)
                else
                        FUNC_MUX_CTRL_0_REG &= ~VBUS_CTRL_1510;
        }
+       if (udc->dc_clk != NULL && is_active) {
+               if (!udc->clk_requested) {
+                       omap_udc_enable_clock(1);
+                       udc->clk_requested = 1;
+               }
+       }
        if (can_pullup(udc))
                pullup_enable(udc);
        else
                pullup_disable(udc);
+       if (udc->dc_clk != NULL && !is_active) {
+               if (udc->clk_requested) {
+                       omap_udc_enable_clock(0);
+                       udc->clk_requested = 0;
+               }
+       }
        spin_unlock_irqrestore(&udc->lock, flags);
        return 0;
 }
@@ -1441,7 +1482,7 @@ static void ep0_irq(struct omap_udc *udc, u16 irq_src)
                }
        }
 
-       /* IN/OUT packets mean we're in the DATA or STATUS stage.  
+       /* IN/OUT packets mean we're in the DATA or STATUS stage.
         * This driver uses only uses protocol stalls (ep0 never halts),
         * and if we got this far the gadget driver already had a
         * chance to stall.  Tries to be forgiving of host oddities.
@@ -1509,7 +1550,7 @@ static void ep0_irq(struct omap_udc *udc, u16 irq_src)
                                } else if (stat == 0)
                                        UDC_CTRL_REG = UDC_SET_FIFO_EN;
                                UDC_EP_NUM_REG = 0;
-                               
+
                                /* activate status stage */
                                if (stat == 1) {
                                        done(ep0, req, 0);
@@ -1866,7 +1907,7 @@ static void pio_out_timer(unsigned long _ep)
 
        spin_lock_irqsave(&ep->udc->lock, flags);
        if (!list_empty(&ep->queue) && ep->ackwait) {
-               use_ep(ep, 0);
+               use_ep(ep, UDC_EP_SEL);
                stat_flg = UDC_STAT_FLG_REG;
 
                if ((stat_flg & UDC_ACK) && (!(stat_flg & UDC_FIFO_EN)
@@ -1876,12 +1917,12 @@ static void pio_out_timer(unsigned long _ep)
                        VDBG("%s: lose, %04x\n", ep->ep.name, stat_flg);
                        req = container_of(ep->queue.next,
                                        struct omap_req, queue);
-                       UDC_EP_NUM_REG = ep->bEndpointAddress | UDC_EP_SEL;
                        (void) read_fifo(ep, req);
                        UDC_EP_NUM_REG = ep->bEndpointAddress;
                        UDC_CTRL_REG = UDC_SET_FIFO_EN;
                        ep->ackwait = 1 + ep->double_buf;
-               }
+               } else
+                       deselect_ep();
        }
        mod_timer(&ep->timer, PIO_OUT_TIMEOUT);
        spin_unlock_irqrestore(&ep->udc->lock, flags);
@@ -2028,7 +2069,17 @@ static irqreturn_t omap_udc_iso_irq(int irq, void *_dev)
 
 /*-------------------------------------------------------------------------*/
 
-static struct omap_udc *udc;
+static inline int machine_needs_vbus_session(void)
+{
+       return (machine_is_omap_innovator()
+               || machine_is_omap_osk()
+               || machine_is_omap_apollon()
+#ifndef CONFIG_MACH_OMAP_H4_OTG
+               || machine_is_omap_h4()
+#endif
+               || machine_is_sx1()
+               );
+}
 
 int usb_gadget_register_driver (struct usb_gadget_driver *driver)
 {
@@ -2070,6 +2121,9 @@ int usb_gadget_register_driver (struct usb_gadget_driver *driver)
        udc->gadget.dev.driver = &driver->driver;
        spin_unlock_irqrestore(&udc->lock, flags);
 
+       if (udc->dc_clk != NULL)
+               omap_udc_enable_clock(1);
+
        status = driver->bind (&udc->gadget);
        if (status) {
                DBG("bind to %s --> %d\n", driver->driver.name, status);
@@ -2103,10 +2157,12 @@ int usb_gadget_register_driver (struct usb_gadget_driver *driver)
        /* boards that don't have VBUS sensing can't autogate 48MHz;
         * can't enter deep sleep while a gadget driver is active.
         */
-       if (machine_is_omap_innovator() || machine_is_omap_osk())
+       if (machine_needs_vbus_session())
                omap_vbus_session(&udc->gadget, 1);
 
 done:
+       if (udc->dc_clk != NULL)
+               omap_udc_enable_clock(0);
        return status;
 }
 EXPORT_SYMBOL(usb_gadget_register_driver);
@@ -2121,7 +2177,10 @@ int usb_gadget_unregister_driver (struct usb_gadget_driver *driver)
        if (!driver || driver != udc->driver || !driver->unbind)
                return -EINVAL;
 
-       if (machine_is_omap_innovator() || machine_is_omap_osk())
+       if (udc->dc_clk != NULL)
+               omap_udc_enable_clock(1);
+
+       if (machine_needs_vbus_session())
                omap_vbus_session(&udc->gadget, 0);
 
        if (udc->transceiver)
@@ -2137,6 +2196,8 @@ int usb_gadget_unregister_driver (struct usb_gadget_driver *driver)
        udc->gadget.dev.driver = NULL;
        udc->driver = NULL;
 
+       if (udc->dc_clk != NULL)
+               omap_udc_enable_clock(0);
        DBG("unregistered driver '%s'\n", driver->driver.name);
        return status;
 }
@@ -2219,7 +2280,7 @@ static char *trx_mode(unsigned m, int enabled)
        case 0:         return enabled ? "*6wire" : "unused";
        case 1:         return "4wire";
        case 2:         return "3wire";
-       case 3:         return "6wire";
+       case 3:         return "6wire";
        default:        return "unknown";
        }
 }
@@ -2228,11 +2289,18 @@ static int proc_otg_show(struct seq_file *s)
 {
        u32             tmp;
        u32             trans;
+       char            *ctrl_name;
 
        tmp = OTG_REV_REG;
-       trans = USB_TRANSCEIVER_CTRL_REG;
-       seq_printf(s, "\nOTG rev %d.%d, transceiver_ctrl %05x\n",
-               tmp >> 4, tmp & 0xf, trans);
+       if (cpu_is_omap24xx()) {
+               ctrl_name = "control_devconf";
+               trans = CONTROL_DEVCONF_REG;
+       } else {
+               ctrl_name = "tranceiver_ctrl";
+               trans = USB_TRANSCEIVER_CTRL_REG;
+       }
+       seq_printf(s, "\nOTG rev %d.%d, %s %05x\n",
+               tmp >> 4, tmp & 0xf, ctrl_name, trans);
        tmp = OTG_SYSCON_1_REG;
        seq_printf(s, "otg_syscon1 %08x usb2 %s, usb1 %s, usb0 %s,"
                        FOURBITS "\n", tmp,
@@ -2307,7 +2375,7 @@ static int proc_udc_show(struct seq_file *s, void *_)
                driver_desc,
                use_dma ?  " (dma)" : "");
 
-       tmp = UDC_REV_REG & 0xff; 
+       tmp = UDC_REV_REG & 0xff;
        seq_printf(s,
                "UDC rev %d.%d, fifo mode %d, gadget %s\n"
                "hmc %d, transceiver %s\n",
@@ -2315,11 +2383,16 @@ static int proc_udc_show(struct seq_file *s, void *_)
                fifo_mode,
                udc->driver ? udc->driver->driver.name : "(none)",
                HMC,
-               udc->transceiver ? udc->transceiver->label : "(none)");
-       seq_printf(s, "ULPD control %04x req %04x status %04x\n",
-               __REG16(ULPD_CLOCK_CTRL),
-               __REG16(ULPD_SOFT_REQ),
-               __REG16(ULPD_STATUS_REQ));
+               udc->transceiver
+                       ? udc->transceiver->label
+                       : ((cpu_is_omap1710() || cpu_is_omap24xx())
+                               ? "external" : "(none)"));
+       if (cpu_class_is_omap1()) {
+               seq_printf(s, "ULPD control %04x req %04x status %04x\n",
+                       __REG16(ULPD_CLOCK_CTRL),
+                       __REG16(ULPD_SOFT_REQ),
+                       __REG16(ULPD_STATUS_REQ));
+       }
 
        /* OTG controller registers */
        if (!cpu_is_omap15xx())
@@ -2504,9 +2577,10 @@ omap_ep_setup(char *name, u8 addr, u8 type,
                dbuf = 1;
        } else {
                /* double-buffering "not supported" on 15xx,
-                * and ignored for PIO-IN on 16xx
+                * and ignored for PIO-IN on newer chips
+                * (for more reliable behavior)
                 */
-               if (!use_dma || cpu_is_omap15xx())
+               if (!use_dma || cpu_is_omap15xx() || cpu_is_omap24xx())
                        dbuf = 0;
 
                switch (maxp) {
@@ -2549,7 +2623,7 @@ omap_ep_setup(char *name, u8 addr, u8 type,
        ep->bEndpointAddress = addr;
        ep->bmAttributes = type;
        ep->double_buf = dbuf;
-       ep->udc = udc; 
+       ep->udc = udc;
 
        ep->ep.name = ep->name;
        ep->ep.ops = &omap_ep_ops;
@@ -2709,15 +2783,37 @@ static int __init omap_udc_probe(struct platform_device *pdev)
        struct otg_transceiver  *xceiv = NULL;
        const char              *type = NULL;
        struct omap_usb_config  *config = pdev->dev.platform_data;
+       struct clk              *dc_clk;
+       struct clk              *hhc_clk;
 
        /* NOTE:  "knows" the order of the resources! */
-       if (!request_mem_region(pdev->resource[0].start, 
+       if (!request_mem_region(pdev->resource[0].start,
                        pdev->resource[0].end - pdev->resource[0].start + 1,
                        driver_name)) {
                DBG("request_mem_region failed\n");
                return -EBUSY;
        }
 
+       if (cpu_is_omap16xx()) {
+               dc_clk = clk_get(&pdev->dev, "usb_dc_ck");
+               hhc_clk = clk_get(&pdev->dev, "usb_hhc_ck");
+               BUG_ON(IS_ERR(dc_clk) || IS_ERR(hhc_clk));
+               /* can't use omap_udc_enable_clock yet */
+               clk_enable(dc_clk);
+               clk_enable(hhc_clk);
+               udelay(100);
+       }
+
+       if (cpu_is_omap24xx()) {
+               dc_clk = clk_get(&pdev->dev, "usb_fck");
+               hhc_clk = clk_get(&pdev->dev, "usb_l4_ick");
+               BUG_ON(IS_ERR(dc_clk) || IS_ERR(hhc_clk));
+               /* can't use omap_udc_enable_clock yet */
+               clk_enable(dc_clk);
+               clk_enable(hhc_clk);
+               udelay(100);
+       }
+
        INFO("OMAP UDC rev %d.%d%s\n",
                UDC_REV_REG >> 4, UDC_REV_REG & 0xf,
                config->otg ? ", Mini-AB" : "");
@@ -2727,7 +2823,7 @@ static int __init omap_udc_probe(struct platform_device *pdev)
                hmc = HMC_1510;
                type = "(unknown)";
 
-               if (machine_is_omap_innovator()) {
+               if (machine_is_omap_innovator() || machine_is_sx1()) {
                        /* just set up software VBUS detect, and then
                         * later rig it so we always report VBUS.
                         * FIXME without really sensing VBUS, we can't
@@ -2756,6 +2852,15 @@ static int __init omap_udc_probe(struct platform_device *pdev)
                }
 
                hmc = HMC_1610;
+
+               if (cpu_is_omap24xx()) {
+                       /* this could be transceiverless in one of the
+                        * "we don't need to know" modes.
+                        */
+                       type = "external";
+                       goto known;
+               }
+
                switch (hmc) {
                case 0:                 /* POWERUP DEFAULT == 0 */
                case 4:
@@ -2794,6 +2899,7 @@ bad_on_1710:
                        goto cleanup0;
                }
        }
+known:
        INFO("hmc mode %d, %s transceiver\n", hmc, type);
 
        /* a "gadget" abstracts/virtualizes the controller */
@@ -2818,8 +2924,8 @@ bad_on_1710:
        status = request_irq(pdev->resource[1].start, omap_udc_irq,
                        IRQF_SAMPLE_RANDOM, driver_name, udc);
        if (status != 0) {
-               ERR( "can't get irq %ld, err %d\n",
-                       pdev->resource[1].start, status);
+               ERR("can't get irq %d, err %d\n",
+                       (int) pdev->resource[1].start, status);
                goto cleanup1;
        }
 
@@ -2827,24 +2933,41 @@ bad_on_1710:
        status = request_irq(pdev->resource[2].start, omap_udc_pio_irq,
                        IRQF_SAMPLE_RANDOM, "omap_udc pio", udc);
        if (status != 0) {
-               ERR( "can't get irq %ld, err %d\n",
-                       pdev->resource[2].start, status);
+               ERR("can't get irq %d, err %d\n",
+                       (int) pdev->resource[2].start, status);
                goto cleanup2;
        }
 #ifdef USE_ISO
        status = request_irq(pdev->resource[3].start, omap_udc_iso_irq,
                        IRQF_DISABLED, "omap_udc iso", udc);
        if (status != 0) {
-               ERR("can't get irq %ld, err %d\n",
-                       pdev->resource[3].start, status);
+               ERR("can't get irq %d, err %d\n",
+                       (int) pdev->resource[3].start, status);
                goto cleanup3;
        }
 #endif
+       if (cpu_is_omap16xx()) {
+               udc->dc_clk = dc_clk;
+               udc->hhc_clk = hhc_clk;
+               clk_disable(hhc_clk);
+               clk_disable(dc_clk);
+       }
+
+       if (cpu_is_omap24xx()) {
+               udc->dc_clk = dc_clk;
+               udc->hhc_clk = hhc_clk;
+               /* FIXME OMAP2 don't release hhc & dc clock */
+#if 0
+               clk_disable(hhc_clk);
+               clk_disable(dc_clk);
+#endif
+       }
 
        create_proc_file();
-       device_add(&udc->gadget.dev);
-       return 0;
-
+       status = device_add(&udc->gadget.dev);
+       if (!status)
+               return status;
+       /* On failure, fall through to the cleanup labels below */
 #ifdef USE_ISO
 cleanup3:
        free_irq(pdev->resource[2].start, udc);
@@ -2860,8 +2983,17 @@ cleanup1:
 cleanup0:
        if (xceiv)
                put_device(xceiv->dev);
+
+       if (cpu_is_omap16xx() || cpu_is_omap24xx()) {
+               clk_disable(hhc_clk);
+               clk_disable(dc_clk);
+               clk_put(hhc_clk);
+               clk_put(dc_clk);
+       }
+
        release_mem_region(pdev->resource[0].start,
                        pdev->resource[0].end - pdev->resource[0].start + 1);
+
        return status;
 }
 
@@ -2891,6 +3023,13 @@ static int __exit omap_udc_remove(struct platform_device *pdev)
        free_irq(pdev->resource[2].start, udc);
        free_irq(pdev->resource[1].start, udc);
 
+       if (udc->dc_clk) {
+               if (udc->clk_requested)
+                       omap_udc_enable_clock(0);
+               clk_put(udc->hhc_clk);
+               clk_put(udc->dc_clk);
+       }
+
        release_mem_region(pdev->resource[0].start,
                        pdev->resource[0].end - pdev->resource[0].start + 1);
 
index 652ee46..1dc398b 100644 (file)
@@ -175,6 +175,9 @@ struct omap_udc {
        unsigned                        ep0_reset_config:1;
        unsigned                        ep0_setup:1;
        struct completion               *done;
+       struct clk                      *dc_clk;
+       struct clk                      *hhc_clk;
+       unsigned                        clk_requested:1;
 };
 
 /*-------------------------------------------------------------------------*/
index acd101c..e0d4c23 100644 (file)
@@ -209,24 +209,16 @@ static int resume_detect_interrupts_are_broken(struct uhci_hcd *uhci)
 
 static int remote_wakeup_is_broken(struct uhci_hcd *uhci)
 {
-       static struct dmi_system_id broken_wakeup_table[] = {
-               {
-                       .ident = "Asus A7V8X",
-                       .matches = {
-                               DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK"),
-                               DMI_MATCH(DMI_BOARD_NAME, "A7V8X"),
-                               DMI_MATCH(DMI_BOARD_VERSION, "REV 1.xx"),
-                       }
-               },
-               { }
-       };
        int port;
+       char *sys_info;
+       static char bad_Asus_board[] = "A7V8X";
 
        /* One of Asus's motherboards has a bug which causes it to
         * wake up immediately from suspend-to-RAM if any of the ports
         * are connected.  In such cases we will not set EGSM.
         */
-       if (dmi_check_system(broken_wakeup_table)) {
+       sys_info = dmi_get_system_info(DMI_BOARD_NAME);
+       if (sys_info && !strcmp(sys_info, bad_Asus_board)) {
                for (port = 0; port < uhci->rh_numports; ++port) {
                        if (inw(uhci->io_addr + USBPORTSC1 + port * 2) &
                                        USBPORTSC_CCS)
@@ -265,7 +257,9 @@ __acquires(uhci->lock)
        int_enable = USBINTR_RESUME;
        if (remote_wakeup_is_broken(uhci))
                egsm_enable = 0;
-       if (resume_detect_interrupts_are_broken(uhci) || !egsm_enable)
+       if (resume_detect_interrupts_are_broken(uhci) || !egsm_enable ||
+                       !device_may_wakeup(
+                               &uhci_to_hcd(uhci)->self.root_hub->dev))
                uhci->working_RD = int_enable = 0;
 
        outw(int_enable, uhci->io_addr + USBINTR);
index bf26c3c..9148694 100644 (file)
@@ -403,7 +403,7 @@ sisusbcon_putc(struct vc_data *c, int ch, int y, int x)
 
 
        sisusb_copy_memory(sisusb, (char *)SISUSB_VADDR(x, y),
-                               (u32)SISUSB_HADDR(x, y), 2, &written);
+                               (long)SISUSB_HADDR(x, y), 2, &written);
 
        mutex_unlock(&sisusb->lock);
 }
@@ -438,7 +438,7 @@ sisusbcon_putcs(struct vc_data *c, const unsigned short *s,
        }
 
        sisusb_copy_memory(sisusb, (char *)SISUSB_VADDR(x, y),
-                               (u32)SISUSB_HADDR(x, y), count * 2, &written);
+                               (long)SISUSB_HADDR(x, y), count * 2, &written);
 
        mutex_unlock(&sisusb->lock);
 }
@@ -492,7 +492,7 @@ sisusbcon_clear(struct vc_data *c, int y, int x, int height, int width)
 
 
        sisusb_copy_memory(sisusb, (unsigned char *)SISUSB_VADDR(x, y),
-                               (u32)SISUSB_HADDR(x, y), length, &written);
+                               (long)SISUSB_HADDR(x, y), length, &written);
 
        mutex_unlock(&sisusb->lock);
 }
@@ -564,7 +564,7 @@ sisusbcon_bmove(struct vc_data *c, int sy, int sx,
 
 
        sisusb_copy_memory(sisusb, (unsigned char *)SISUSB_VADDR(dx, dy),
-                               (u32)SISUSB_HADDR(dx, dy), length, &written);
+                               (long)SISUSB_HADDR(dx, dy), length, &written);
 
        mutex_unlock(&sisusb->lock);
 }
@@ -612,7 +612,7 @@ sisusbcon_switch(struct vc_data *c)
                                                                length);
 
        sisusb_copy_memory(sisusb, (unsigned char *)c->vc_origin,
-                               (u32)SISUSB_HADDR(0, 0),
+                               (long)SISUSB_HADDR(0, 0),
                                length, &written);
 
        mutex_unlock(&sisusb->lock);
@@ -939,7 +939,7 @@ sisusbcon_scroll_area(struct vc_data *c, struct sisusb_usb_data *sisusb,
        }
 
        sisusb_copy_memory(sisusb, (char *)SISUSB_VADDR(0, t),
-                               (u32)SISUSB_HADDR(0, t), length, &written);
+                               (long)SISUSB_HADDR(0, t), length, &written);
 
        mutex_unlock(&sisusb->lock);
 
index 95e682e..f538013 100644 (file)
@@ -920,7 +920,7 @@ static int ax88772_bind(struct usbnet *dev, struct usb_interface *intf)
                goto out2;
 
        if ((ret = asix_write_cmd(dev, AX_CMD_SW_PHY_SELECT,
-                               0x0000, 0, 0, buf)) < 0) {
+                               1, 0, 0, buf)) < 0) {
                dbg("Select PHY #1 failed: %d", ret);
                goto out2;
        }
index 2f4d303..c8999ae 100644 (file)
@@ -170,7 +170,7 @@ config USB_SERIAL_FTDI_SIO
 
 config USB_SERIAL_FUNSOFT
        tristate "USB Fundamental Software Dongle Driver"
-       depends on USB_SERIAL
+       depends on USB_SERIAL && !(SPARC || SPARC64)
        ---help---
          Say Y here if you want to use the Fundamental Software dongle.
 
index 819266b..5ca04e8 100644 (file)
@@ -625,6 +625,9 @@ static int option_send_setup(struct usb_serial_port *port)
 
        dbg("%s", __FUNCTION__);
 
+       if (port->number != 0)
+               return 0;
+
        portdata = usb_get_serial_port_data(port);
 
        if (port->tty) {
index 5fe7ff4..cddef3e 100644 (file)
@@ -728,7 +728,7 @@ UNUSUAL_DEV( 0x05ac, 0x1204, 0x0000, 0x9999,
                "Apple",
                "iPod",
                US_SC_DEVICE, US_PR_DEVICE, NULL,
-               US_FL_FIX_CAPACITY ),
+               US_FL_FIX_CAPACITY | US_FL_NOT_LOCKABLE ),
 
 UNUSUAL_DEV( 0x05ac, 0x1205, 0x0000, 0x9999,
                "Apple",
@@ -1358,6 +1358,21 @@ UNUSUAL_DEV(  0x1370, 0x6828, 0x0110, 0x0110,
                US_SC_DEVICE, US_PR_DEVICE, NULL,
                US_FL_IGNORE_RESIDUE ),
 
+/* Reported by Francesco Foresti <frafore@tiscali.it> */
+UNUSUAL_DEV(  0x14cd, 0x6600, 0x0201, 0x0201,
+               "Super Top",
+               "IDE DEVICE",
+               US_SC_DEVICE, US_PR_DEVICE, NULL,
+               US_FL_IGNORE_RESIDUE ),
+
+/* Reported by Robert Schedel <r.schedel@yahoo.de>
+ * Note: this is a 'super top' device like the above 14cd/6600 device */
+UNUSUAL_DEV(  0x1652, 0x6600, 0x0201, 0x0201,
+               "Teac",
+               "HD-35PUK-B",
+               US_SC_DEVICE, US_PR_DEVICE, NULL,
+               US_FL_IGNORE_RESIDUE ),
+
 /* patch submitted by Davide Perini <perini.davide@dpsoftware.org>
  * and Renato Perini <rperini@email.it>
  */
index bbfc862..b9b2b27 100644 (file)
@@ -53,7 +53,7 @@ static inline int adfs_readname(char *buf, char *ptr, int maxlen)
 {
        char *old_buf = buf;
 
-       while (*ptr >= ' ' && maxlen--) {
+       while ((unsigned char)*ptr >= ' ' && maxlen--) {
                if (*ptr == '/')
                        *buf++ = '.';
                else
index 34e6d7b..869f519 100644 (file)
 #include <linux/time.h>
 #include <linux/smp_lock.h>
 #include <linux/namei.h>
+#include <linux/poll.h>
 
-static int return_EIO(void)
+
+static loff_t bad_file_llseek(struct file *file, loff_t offset, int origin)
+{
+       return -EIO;
+}
+
+static ssize_t bad_file_read(struct file *filp, char __user *buf,
+                       size_t size, loff_t *ppos)
+{
+        return -EIO;
+}
+
+static ssize_t bad_file_write(struct file *filp, const char __user *buf,
+                       size_t siz, loff_t *ppos)
+{
+        return -EIO;
+}
+
+static ssize_t bad_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+                       unsigned long nr_segs, loff_t pos)
+{
+       return -EIO;
+}
+
+static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+                       unsigned long nr_segs, loff_t pos)
+{
+       return -EIO;
+}
+
+static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+       return -EIO;
+}
+
+static unsigned int bad_file_poll(struct file *filp, poll_table *wait)
+{
+       return POLLERR;
+}
+
+static int bad_file_ioctl (struct inode *inode, struct file *filp,
+                       unsigned int cmd, unsigned long arg)
+{
+       return -EIO;
+}
+
+static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd,
+                       unsigned long arg)
+{
+       return -EIO;
+}
+
+static long bad_file_compat_ioctl(struct file *file, unsigned int cmd,
+                       unsigned long arg)
+{
+       return -EIO;
+}
+
+static int bad_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       return -EIO;
+}
+
+static int bad_file_open(struct inode *inode, struct file *filp)
+{
+       return -EIO;
+}
+
+static int bad_file_flush(struct file *file, fl_owner_t id)
+{
+       return -EIO;
+}
+
+static int bad_file_release(struct inode *inode, struct file *filp)
+{
+       return -EIO;
+}
+
+static int bad_file_fsync(struct file *file, struct dentry *dentry,
+                       int datasync)
+{
+       return -EIO;
+}
+
+static int bad_file_aio_fsync(struct kiocb *iocb, int datasync)
+{
+       return -EIO;
+}
+
+static int bad_file_fasync(int fd, struct file *filp, int on)
+{
+       return -EIO;
+}
+
+static int bad_file_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+       return -EIO;
+}
+
+static ssize_t bad_file_sendfile(struct file *in_file, loff_t *ppos,
+                       size_t count, read_actor_t actor, void *target)
+{
+       return -EIO;
+}
+
+static ssize_t bad_file_sendpage(struct file *file, struct page *page,
+                       int off, size_t len, loff_t *pos, int more)
+{
+       return -EIO;
+}
+
+static unsigned long bad_file_get_unmapped_area(struct file *file,
+                               unsigned long addr, unsigned long len,
+                               unsigned long pgoff, unsigned long flags)
+{
+       return -EIO;
+}
+
+static int bad_file_check_flags(int flags)
 {
        return -EIO;
 }
 
-#define EIO_ERROR ((void *) (return_EIO))
+static int bad_file_dir_notify(struct file *file, unsigned long arg)
+{
+       return -EIO;
+}
+
+static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl)
+{
+       return -EIO;
+}
+
+static ssize_t bad_file_splice_write(struct pipe_inode_info *pipe,
+                       struct file *out, loff_t *ppos, size_t len,
+                       unsigned int flags)
+{
+       return -EIO;
+}
+
+static ssize_t bad_file_splice_read(struct file *in, loff_t *ppos,
+                       struct pipe_inode_info *pipe, size_t len,
+                       unsigned int flags)
+{
+       return -EIO;
+}
 
 static const struct file_operations bad_file_ops =
 {
-       .llseek         = EIO_ERROR,
-       .aio_read       = EIO_ERROR,
-       .read           = EIO_ERROR,
-       .write          = EIO_ERROR,
-       .aio_write      = EIO_ERROR,
-       .readdir        = EIO_ERROR,
-       .poll           = EIO_ERROR,
-       .ioctl          = EIO_ERROR,
-       .mmap           = EIO_ERROR,
-       .open           = EIO_ERROR,
-       .flush          = EIO_ERROR,
-       .release        = EIO_ERROR,
-       .fsync          = EIO_ERROR,
-       .aio_fsync      = EIO_ERROR,
-       .fasync         = EIO_ERROR,
-       .lock           = EIO_ERROR,
-       .sendfile       = EIO_ERROR,
-       .sendpage       = EIO_ERROR,
-       .get_unmapped_area = EIO_ERROR,
+       .llseek         = bad_file_llseek,
+       .read           = bad_file_read,
+       .write          = bad_file_write,
+       .aio_read       = bad_file_aio_read,
+       .aio_write      = bad_file_aio_write,
+       .readdir        = bad_file_readdir,
+       .poll           = bad_file_poll,
+       .ioctl          = bad_file_ioctl,
+       .unlocked_ioctl = bad_file_unlocked_ioctl,
+       .compat_ioctl   = bad_file_compat_ioctl,
+       .mmap           = bad_file_mmap,
+       .open           = bad_file_open,
+       .flush          = bad_file_flush,
+       .release        = bad_file_release,
+       .fsync          = bad_file_fsync,
+       .aio_fsync      = bad_file_aio_fsync,
+       .fasync         = bad_file_fasync,
+       .lock           = bad_file_lock,
+       .sendfile       = bad_file_sendfile,
+       .sendpage       = bad_file_sendpage,
+       .get_unmapped_area = bad_file_get_unmapped_area,
+       .check_flags    = bad_file_check_flags,
+       .dir_notify     = bad_file_dir_notify,
+       .flock          = bad_file_flock,
+       .splice_write   = bad_file_splice_write,
+       .splice_read    = bad_file_splice_read,
 };
 
+static int bad_inode_create (struct inode *dir, struct dentry *dentry,
+               int mode, struct nameidata *nd)
+{
+       return -EIO;
+}
+
+static struct dentry *bad_inode_lookup(struct inode *dir,
+                       struct dentry *dentry, struct nameidata *nd)
+{
+       return ERR_PTR(-EIO);
+}
+
+static int bad_inode_link (struct dentry *old_dentry, struct inode *dir,
+               struct dentry *dentry)
+{
+       return -EIO;
+}
+
+static int bad_inode_unlink(struct inode *dir, struct dentry *dentry)
+{
+       return -EIO;
+}
+
+static int bad_inode_symlink (struct inode *dir, struct dentry *dentry,
+               const char *symname)
+{
+       return -EIO;
+}
+
+static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry,
+                       int mode)
+{
+       return -EIO;
+}
+
+static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry)
+{
+       return -EIO;
+}
+
+static int bad_inode_mknod (struct inode *dir, struct dentry *dentry,
+                       int mode, dev_t rdev)
+{
+       return -EIO;
+}
+
+static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry,
+               struct inode *new_dir, struct dentry *new_dentry)
+{
+       return -EIO;
+}
+
+static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
+               int buflen)
+{
+       return -EIO;
+}
+
+static int bad_inode_permission(struct inode *inode, int mask,
+                       struct nameidata *nd)
+{
+       return -EIO;
+}
+
+static int bad_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
+                       struct kstat *stat)
+{
+       return -EIO;
+}
+
+static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs)
+{
+       return -EIO;
+}
+
+static int bad_inode_setxattr(struct dentry *dentry, const char *name,
+               const void *value, size_t size, int flags)
+{
+       return -EIO;
+}
+
+static ssize_t bad_inode_getxattr(struct dentry *dentry, const char *name,
+                       void *buffer, size_t size)
+{
+       return -EIO;
+}
+
+static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer,
+                       size_t buffer_size)
+{
+       return -EIO;
+}
+
+static int bad_inode_removexattr(struct dentry *dentry, const char *name)
+{
+       return -EIO;
+}
+
 static struct inode_operations bad_inode_ops =
 {
-       .create         = EIO_ERROR,
-       .lookup         = EIO_ERROR,
-       .link           = EIO_ERROR,
-       .unlink         = EIO_ERROR,
-       .symlink        = EIO_ERROR,
-       .mkdir          = EIO_ERROR,
-       .rmdir          = EIO_ERROR,
-       .mknod          = EIO_ERROR,
-       .rename         = EIO_ERROR,
-       .readlink       = EIO_ERROR,
+       .create         = bad_inode_create,
+       .lookup         = bad_inode_lookup,
+       .link           = bad_inode_link,
+       .unlink         = bad_inode_unlink,
+       .symlink        = bad_inode_symlink,
+       .mkdir          = bad_inode_mkdir,
+       .rmdir          = bad_inode_rmdir,
+       .mknod          = bad_inode_mknod,
+       .rename         = bad_inode_rename,
+       .readlink       = bad_inode_readlink,
        /* follow_link must be no-op, otherwise unmounting this inode
           won't work */
-       .truncate       = EIO_ERROR,
-       .permission     = EIO_ERROR,
-       .getattr        = EIO_ERROR,
-       .setattr        = EIO_ERROR,
-       .setxattr       = EIO_ERROR,
-       .getxattr       = EIO_ERROR,
-       .listxattr      = EIO_ERROR,
-       .removexattr    = EIO_ERROR,
+       /* put_link returns void */
+       /* truncate returns void */
+       .permission     = bad_inode_permission,
+       .getattr        = bad_inode_getattr,
+       .setattr        = bad_inode_setattr,
+       .setxattr       = bad_inode_setxattr,
+       .getxattr       = bad_inode_getxattr,
+       .listxattr      = bad_inode_listxattr,
+       .removexattr    = bad_inode_removexattr,
+       /* truncate_range returns void */
 };
 
 
@@ -88,7 +336,7 @@ static struct inode_operations bad_inode_ops =
  *     on it to fail from this point on.
  */
  
-void make_bad_inode(struct inode * inode) 
+void make_bad_inode(struct inode *inode)
 {
        remove_inode_hash(inode);
 
@@ -113,7 +361,7 @@ EXPORT_SYMBOL(make_bad_inode);
  *     Returns true if the inode in question has been marked as bad.
  */
  
-int is_bad_inode(struct inode * inode) 
+int is_bad_inode(struct inode *inode)
 {
        return (inode->i_op == &bad_inode_ops); 
 }
index d3adfd3..7cb2872 100644 (file)
@@ -854,13 +854,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
                         * default mmap base, as well as whatever program they
                         * might try to exec.  This is because the brk will
                         * follow the loader, and is not movable.  */
-                       if (current->flags & PF_RANDOMIZE)
-                               load_bias = randomize_range(0x10000,
-                                                           ELF_ET_DYN_BASE,
-                                                           0);
-                       else
-                               load_bias = ELF_ET_DYN_BASE;
-                       load_bias = ELF_PAGESTART(load_bias - vaddr);
+                       load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
                }
 
                error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
index b823814..2e0021e 100644 (file)
@@ -275,6 +275,25 @@ static void ufs_change_blocknr(struct inode *inode, unsigned int baseblk,
        UFSD("EXIT\n");
 }
 
+static void ufs_clear_frags(struct inode *inode, sector_t beg, unsigned int n,
+                           int sync)
+{
+       struct buffer_head *bh;
+       sector_t end = beg + n;
+
+       for (; beg < end; ++beg) {
+               bh = sb_getblk(inode->i_sb, beg);
+               lock_buffer(bh);
+               memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+               set_buffer_uptodate(bh);
+               mark_buffer_dirty(bh);
+               unlock_buffer(bh);
+               if (IS_SYNC(inode) || sync)
+                       sync_dirty_buffer(bh);
+               brelse(bh);
+       }
+}
+
 unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
                           unsigned goal, unsigned count, int * err, struct page *locked_page)
 {
@@ -350,6 +369,8 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
                        *p = cpu_to_fs32(sb, result);
                        *err = 0;
                        UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
+                       ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
+                                       locked_page != NULL);
                }
                unlock_super(sb);
                UFSD("EXIT, result %u\n", result);
@@ -363,6 +384,8 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
        if (result) {
                *err = 0;
                UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
+               ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
+                               locked_page != NULL);
                unlock_super(sb);
                UFSD("EXIT, result %u\n", result);
                return result;
@@ -398,6 +421,8 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
                *p = cpu_to_fs32(sb, result);
                *err = 0;
                UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
+               ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
+                               locked_page != NULL);
                unlock_super(sb);
                if (newcount < request)
                        ufs_free_fragments (inode, result + newcount, request - newcount);
index ee1eaa6..2fbab0a 100644 (file)
@@ -156,36 +156,6 @@ out:
        return ret;
 }
 
-static void ufs_clear_frag(struct inode *inode, struct buffer_head *bh)
-{
-       lock_buffer(bh);
-       memset(bh->b_data, 0, inode->i_sb->s_blocksize);
-       set_buffer_uptodate(bh);
-       mark_buffer_dirty(bh);
-       unlock_buffer(bh);
-       if (IS_SYNC(inode))
-               sync_dirty_buffer(bh);
-}
-
-static struct buffer_head *
-ufs_clear_frags(struct inode *inode, sector_t beg,
-               unsigned int n, sector_t want)
-{
-       struct buffer_head *res = NULL, *bh;
-       sector_t end = beg + n;
-
-       for (; beg < end; ++beg) {
-               bh = sb_getblk(inode->i_sb, beg);
-               ufs_clear_frag(inode, bh);
-               if (want != beg)
-                       brelse(bh);
-               else
-                       res = bh;
-       }
-       BUG_ON(!res);
-       return res;
-}
-
 /**
  * ufs_inode_getfrag() - allocate new fragment(s)
  * @inode - pointer to inode
@@ -302,7 +272,7 @@ repeat:
        }
 
        if (!phys) {
-               result = ufs_clear_frags(inode, tmp, required, tmp + blockoff);
+               result = sb_getblk(sb, tmp + blockoff);
        } else {
                *phys = tmp + blockoff;
                result = NULL;
@@ -403,8 +373,7 @@ repeat:
 
 
        if (!phys) {
-               result = ufs_clear_frags(inode, tmp, uspi->s_fpb,
-                                        tmp + blockoff);
+               result = sb_getblk(sb, tmp + blockoff);
        } else {
                *phys = tmp + blockoff;
                *new = 1;
@@ -471,13 +440,13 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head
 #define GET_INODE_DATABLOCK(x) \
        ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new, bh_result->b_page)
 #define GET_INODE_PTR(x) \
-       ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL, bh_result->b_page)
+       ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL, NULL)
 #define GET_INDIRECT_DATABLOCK(x) \
        ufs_inode_getblock(inode, bh, x, fragment,      \
-                         &err, &phys, &new, bh_result->b_page);
+                         &err, &phys, &new, bh_result->b_page)
 #define GET_INDIRECT_PTR(x) \
        ufs_inode_getblock(inode, bh, x, fragment,      \
-                         &err, NULL, NULL, bh_result->b_page);
+                         &err, NULL, NULL, NULL)
 
        if (ptr < UFS_NDIR_FRAGMENT) {
                bh = GET_INODE_DATABLOCK(ptr);
index 8ce79a6..e7686d0 100644 (file)
@@ -13,7 +13,8 @@
 #define ASK_VGA                0xfffd          /* ask for it at bootup */
 
 /* Physical address where the kernel should be loaded. */
-#define LOAD_PHYSICAL_ADDR ((0x100000 + CONFIG_PHYSICAL_ALIGN - 1) \
+#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
+                               + (CONFIG_PHYSICAL_ALIGN - 1)) \
                                & ~(CONFIG_PHYSICAL_ALIGN - 1))
 
 #endif /* _LINUX_BOOT_H */
index 28fdce1..bc8b461 100644 (file)
@@ -11,7 +11,7 @@
 #include <asm/types.h>
 #include <linux/ioctl.h>
 
-#define KVM_API_VERSION 1
+#define KVM_API_VERSION 2
 
 /*
  * Architectural interrupt line count, and the size of the bitmap needed
@@ -45,6 +45,7 @@ enum kvm_exit_reason {
        KVM_EXIT_DEBUG            = 4,
        KVM_EXIT_HLT              = 5,
        KVM_EXIT_MMIO             = 6,
+       KVM_EXIT_IRQ_WINDOW_OPEN  = 7,
 };
 
 /* for KVM_RUN */
@@ -53,11 +54,19 @@ struct kvm_run {
        __u32 vcpu;
        __u32 emulated;  /* skip current instruction */
        __u32 mmio_completed; /* mmio request completed */
+       __u8 request_interrupt_window;
+       __u8 padding1[3];
 
        /* out */
        __u32 exit_type;
        __u32 exit_reason;
        __u32 instruction_length;
+       __u8 ready_for_interrupt_injection;
+       __u8 if_flag;
+       __u16 padding2;
+       __u64 cr8;
+       __u64 apic_base;
+
        union {
                /* KVM_EXIT_UNKNOWN */
                struct {
index 156c40f..b78bbf4 100644 (file)
@@ -3,6 +3,7 @@
 
 #define ADFS_SUPER_MAGIC       0xadf5
 #define AFFS_SUPER_MAGIC       0xadff
+#define AFS_SUPER_MAGIC                0x5346414F
 #define AUTOFS_SUPER_MAGIC     0x0187
 #define CODA_SUPER_MAGIC       0x73757245
 #define EFS_SUPER_MAGIC                0x414A53
index add51ce..5423559 100644 (file)
@@ -245,7 +245,7 @@ extern int swap_duplicate(swp_entry_t);
 extern int valid_swaphandles(swp_entry_t, unsigned long *);
 extern void swap_free(swp_entry_t);
 extern void free_swap_and_cache(swp_entry_t);
-extern int swap_type_of(dev_t, sector_t);
+extern int swap_type_of(dev_t, sector_t, struct block_device **);
 extern unsigned int count_swap_pages(int, int);
 extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t);
 extern sector_t swapdev_block(int, pgoff_t);
index 2b1cdaa..bc27d72 100644 (file)
@@ -538,6 +538,11 @@ asmlinkage void __init start_kernel(void)
        parse_args("Booting kernel", command_line, __start___param,
                   __stop___param - __start___param,
                   &unknown_bootoption);
+       if (!irqs_disabled()) {
+               printk(KERN_WARNING "start_kernel(): bug: interrupts were "
+                               "enabled *very* early, fixing it\n");
+               local_irq_disable();
+       }
        sort_main_extable();
        trap_init();
        rcu_init();
index dbce132..d0f2260 100644 (file)
@@ -1148,10 +1148,10 @@ static int mod_sysfs_setup(struct module *mod,
        kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
        return 0;
 
-out_unreg_drivers:
-       kobject_unregister(mod->drivers_dir);
 out_unreg_param:
        module_param_sysfs_remove(mod);
+out_unreg_drivers:
+       kobject_unregister(mod->drivers_dir);
 out_unreg:
        kobject_del(&mod->mkobj.kobj);
        kobject_put(&mod->mkobj.kobj);
@@ -2327,8 +2327,22 @@ void print_modules(void)
        printk("\n");
 }
 
+static char *make_driver_name(struct device_driver *drv)
+{
+       char *driver_name;
+
+       driver_name = kmalloc(strlen(drv->name) + strlen(drv->bus->name) + 2,
+                             GFP_KERNEL);
+       if (!driver_name)
+               return NULL;
+
+       sprintf(driver_name, "%s:%s", drv->bus->name, drv->name);
+       return driver_name;
+}
+
 void module_add_driver(struct module *mod, struct device_driver *drv)
 {
+       char *driver_name;
        int no_warn;
 
        if (!mod || !drv)
@@ -2336,17 +2350,31 @@ void module_add_driver(struct module *mod, struct device_driver *drv)
 
        /* Don't check return codes; these calls are idempotent */
        no_warn = sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module");
-       no_warn = sysfs_create_link(mod->drivers_dir, &drv->kobj, drv->name);
+       driver_name = make_driver_name(drv);
+       if (driver_name) {
+               no_warn = sysfs_create_link(mod->drivers_dir, &drv->kobj,
+                                           driver_name);
+               kfree(driver_name);
+       }
 }
 EXPORT_SYMBOL(module_add_driver);
 
 void module_remove_driver(struct device_driver *drv)
 {
+       char *driver_name;
+
        if (!drv)
                return;
+
        sysfs_remove_link(&drv->kobj, "module");
-       if (drv->owner && drv->owner->drivers_dir)
-               sysfs_remove_link(drv->owner->drivers_dir, drv->name);
+       if (drv->owner && drv->owner->drivers_dir) {
+               driver_name = make_driver_name(drv);
+               if (driver_name) {
+                       sysfs_remove_link(drv->owner->drivers_dir,
+                                         driver_name);
+                       kfree(driver_name);
+               }
+       }
 }
 EXPORT_SYMBOL(module_remove_driver);
 
index f406655..718945d 100644 (file)
@@ -143,9 +143,15 @@ int parse_args(const char *name,
 
        while (*args) {
                int ret;
+               int irq_was_disabled;
 
                args = next_arg(args, &param, &val);
+               irq_was_disabled = irqs_disabled();
                ret = parse_one(param, val, params, num, unknown);
+               if (irq_was_disabled && !irqs_disabled()) {
+                       printk(KERN_WARNING "parse_args(): option '%s' enabled "
+                                       "irq's!\n", param);
+               }
                switch (ret) {
                case -ENOENT:
                        printk(KERN_ERR "%s: Unknown parameter `%s'\n",
index f133d4a..3581f8f 100644 (file)
@@ -165,14 +165,15 @@ static int swsusp_swap_check(void) /* This is called before saving image */
 {
        int res;
 
-       res = swap_type_of(swsusp_resume_device, swsusp_resume_block);
+       res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
+                       &resume_bdev);
        if (res < 0)
                return res;
 
        root_swap = res;
-       resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_WRITE);
-       if (IS_ERR(resume_bdev))
-               return PTR_ERR(resume_bdev);
+       res = blkdev_get(resume_bdev, FMODE_WRITE, O_RDWR);
+       if (res)
+               return res;
 
        res = set_blocksize(resume_bdev, PAGE_SIZE);
        if (res < 0)
index 89443b8..f7b7a78 100644 (file)
@@ -57,7 +57,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
        memset(&data->handle, 0, sizeof(struct snapshot_handle));
        if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
                data->swap = swsusp_resume_device ?
-                               swap_type_of(swsusp_resume_device, 0) : -1;
+                       swap_type_of(swsusp_resume_device, 0, NULL) : -1;
                data->mode = O_RDONLY;
        } else {
                data->swap = -1;
@@ -268,7 +268,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
                         * so we need to recode them
                         */
                        if (old_decode_dev(arg)) {
-                               data->swap = swap_type_of(old_decode_dev(arg), 0);
+                               data->swap = swap_type_of(old_decode_dev(arg),
+                                                       0, NULL);
                                if (data->swap < 0)
                                        error = -ENODEV;
                        } else {
@@ -365,7 +366,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
                        swdev = old_decode_dev(swap_area.dev);
                        if (swdev) {
                                offset = swap_area.offset;
-                               data->swap = swap_type_of(swdev, offset);
+                               data->swap = swap_type_of(swdev, offset, NULL);
                                if (data->swap < 0)
                                        error = -ENODEV;
                        } else {
index fb5e03d..11550b2 100644 (file)
@@ -63,7 +63,7 @@ static int __init profile_setup(char * str)
                printk(KERN_INFO
                        "kernel sleep profiling enabled (shift: %ld)\n",
                        prof_shift);
-       } else if (!strncmp(str, sleepstr, strlen(sleepstr))) {
+       } else if (!strncmp(str, schedstr, strlen(schedstr))) {
                prof_on = SCHED_PROFILING;
                if (str[strlen(schedstr)] == ',')
                        str += strlen(schedstr) + 1;
index 6969cfb..b278b8d 100644 (file)
@@ -60,12 +60,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
                return 0;
        }
 
-       /*
-        * swapoff can easily use up all memory, so kill those first.
-        */
-       if (p->flags & PF_SWAPOFF)
-               return ULONG_MAX;
-
        /*
         * The memory size of the process is the basis for the badness.
         */
@@ -76,6 +70,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
         */
        task_unlock(p);
 
+       /*
+        * swapoff can easily use up all memory, so kill those first.
+        */
+       if (p->flags & PF_SWAPOFF)
+               return ULONG_MAX;
+
        /*
         * Processes which fork a lot of child processes are likely
         * a good choice. We add half the vmsize of the children if they
index 8c1a116..a49f96b 100644 (file)
@@ -711,6 +711,9 @@ static void __drain_pages(unsigned int cpu)
        for_each_zone(zone) {
                struct per_cpu_pageset *pset;
 
+               if (!populated_zone(zone))
+                       continue;
+
                pset = zone_pcp(zone, cpu);
                for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
                        struct per_cpu_pages *pcp;
@@ -3321,6 +3324,10 @@ void *__init alloc_large_system_hash(const char *tablename,
                        numentries >>= (scale - PAGE_SHIFT);
                else
                        numentries <<= (PAGE_SHIFT - scale);
+
+               /* Make sure we've got at least a 0-order allocation.. */
+               if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+                       numentries = PAGE_SIZE / bucketsize;
        }
        numentries = roundup_pow_of_two(numentries);
 
index 0d4e574..c610062 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3281,7 +3281,7 @@ retry:
                                        flags | GFP_THISNODE, nid);
        }
 
-       if (!obj) {
+       if (!obj && !(flags & __GFP_NO_GROW)) {
                /*
                 * This allocation will be performed within the constraints
                 * of the current cpuset / memory policy requirements.
@@ -3310,7 +3310,7 @@ retry:
                                         */
                                        goto retry;
                        } else {
-                               kmem_freepages(cache, obj);
+                               /* cache_grow already freed obj */
                                obj = NULL;
                        }
                }
index b9fc0e5..a2d9bb4 100644 (file)
@@ -434,7 +434,7 @@ void free_swap_and_cache(swp_entry_t entry)
  *
  * This is needed for the suspend to disk (aka swsusp).
  */
-int swap_type_of(dev_t device, sector_t offset)
+int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 {
        struct block_device *bdev = NULL;
        int i;
@@ -450,6 +450,9 @@ int swap_type_of(dev_t device, sector_t offset)
                        continue;
 
                if (!bdev) {
+                       if (bdev_p)
+                               *bdev_p = sis->bdev;
+
                        spin_unlock(&swap_lock);
                        return i;
                }
@@ -459,6 +462,9 @@ int swap_type_of(dev_t device, sector_t offset)
                        se = list_entry(sis->extent_list.next,
                                        struct swap_extent, list);
                        if (se->start_block == offset) {
+                               if (bdev_p)
+                                       *bdev_p = sis->bdev;
+
                                spin_unlock(&swap_lock);
                                bdput(bdev);
                                return i;
index 40fea49..7430df6 100644 (file)
@@ -1406,6 +1406,16 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
        return ret;
 }
 
+static unsigned long count_lru_pages(void)
+{
+       struct zone *zone;
+       unsigned long ret = 0;
+
+       for_each_zone(zone)
+               ret += zone->nr_active + zone->nr_inactive;
+       return ret;
+}
+
 /*
  * Try to free `nr_pages' of memory, system-wide, and return the number of
  * freed pages.
@@ -1420,7 +1430,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
        unsigned long ret = 0;
        int pass;
        struct reclaim_state reclaim_state;
-       struct zone *zone;
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
                .may_swap = 0,
@@ -1431,10 +1440,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 
        current->reclaim_state = &reclaim_state;
 
-       lru_pages = 0;
-       for_each_zone(zone)
-               lru_pages += zone->nr_active + zone->nr_inactive;
-
+       lru_pages = count_lru_pages();
        nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
        /* If slab caches are huge, it's better to hit them first */
        while (nr_slab >= lru_pages) {
@@ -1461,13 +1467,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
        for (pass = 0; pass < 5; pass++) {
                int prio;
 
-               /* Needed for shrinking slab caches later on */
-               if (!lru_pages)
-                       for_each_zone(zone) {
-                               lru_pages += zone->nr_active;
-                               lru_pages += zone->nr_inactive;
-                       }
-
                /* Force reclaiming mapped pages in the passes #3 and #4 */
                if (pass > 2) {
                        sc.may_swap = 1;
@@ -1483,7 +1482,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
                                goto out;
 
                        reclaim_state.reclaimed_slab = 0;
-                       shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
+                       shrink_slab(sc.nr_scanned, sc.gfp_mask,
+                                       count_lru_pages());
                        ret += reclaim_state.reclaimed_slab;
                        if (ret >= nr_pages)
                                goto out;
@@ -1491,20 +1491,19 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
                        if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
                                congestion_wait(WRITE, HZ / 10);
                }
-
-               lru_pages = 0;
        }
 
        /*
         * If ret = 0, we could not shrink LRUs, but there may be something
         * in slab caches
         */
-       if (!ret)
+       if (!ret) {
                do {
                        reclaim_state.reclaimed_slab = 0;
-                       shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+                       shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
                        ret += reclaim_state.reclaimed_slab;
                } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+       }
 
 out:
        current->reclaim_state = NULL;
index 0b2fcc4..a8ffc32 100644 (file)
@@ -925,6 +925,8 @@ ConfigInfoView::ConfigInfoView(QWidget* parent, const char *name)
                configSettings->endGroup();
                connect(configApp, SIGNAL(aboutToQuit()), SLOT(saveSettings()));
        }
+
+       has_dbg_info = 0;
 }
 
 void ConfigInfoView::saveSettings(void)
@@ -953,10 +955,13 @@ void ConfigInfoView::setInfo(struct menu *m)
        if (menu == m)
                return;
        menu = m;
-       if (!menu)
+       if (!menu) {
+               has_dbg_info = 0;
                clear();
-       else
+       } else {
+               has_dbg_info = 1;
                menuInfo();
+       }
 }
 
 void ConfigInfoView::setSource(const QString& name)
@@ -991,6 +996,9 @@ void ConfigInfoView::symbolInfo(void)
 {
        QString str;
 
+       if (!has_dbg_info)
+               return;
+
        str += "<big>Symbol: <b>";
        str += print_filter(sym->name);
        str += "</b></big><br><br>value: ";
index 6fc1c5f..a397edb 100644 (file)
@@ -273,6 +273,8 @@ protected:
        struct symbol *sym;
        struct menu *menu;
        bool _showDebug;
+
+       int has_dbg_info;
 };
 
 class ConfigSearchWindow : public QDialog {