Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux...
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 25 Oct 2011 07:34:10 +0000 (09:34 +0200)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 25 Oct 2011 07:34:10 +0000 (09:34 +0200)
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k:
  m68k: Finally remove leftover markers sections
  m68k/mac: Fix mac_irq_pending() for PSC MACE and SCC
  m68k/mac: Fix compiler warning in via_read_time()
  zorro: Fix four checkpatch warnings

38 files changed:
arch/x86/include/asm/xen/page.h
arch/x86/kernel/kprobes.c
arch/x86/pci/xen.c
arch/x86/xen/Kconfig
arch/x86/xen/enlighten.c
arch/x86/xen/mmu.c
arch/x86/xen/p2m.c
arch/x86/xen/setup.c
drivers/block/xen-blkback/blkback.c
drivers/pci/xen-pcifront.c
drivers/xen/Kconfig
drivers/xen/Makefile
drivers/xen/balloon.c
drivers/xen/events.c
drivers/xen/gntdev.c
drivers/xen/grant-table.c
drivers/xen/pci.c
drivers/xen/swiotlb-xen.c
drivers/xen/xen-pciback/conf_space.c
drivers/xen/xen-pciback/conf_space_header.c
drivers/xen/xen-pciback/conf_space_quirks.c
drivers/xen/xen-pciback/passthrough.c
drivers/xen/xen-pciback/pci_stub.c
drivers/xen/xen-pciback/pciback.h
drivers/xen/xen-pciback/pciback_ops.c
drivers/xen/xen-pciback/vpci.c
drivers/xen/xen-pciback/xenbus.c
drivers/xen/xen-selfballoon.c
drivers/xen/xenbus/xenbus_comms.c
drivers/xen/xenbus/xenbus_probe.c
drivers/xen/xenbus/xenbus_probe_backend.c
drivers/xen/xenbus/xenbus_probe_frontend.c
drivers/xen/xenbus/xenbus_xs.c
include/xen/balloon.h
include/xen/grant_table.h
include/xen/interface/io/xs_wire.h
include/xen/interface/physdev.h
include/xen/page.h

arch/x86/include/asm/xen/page.h
index 7ff4669..c34f96c 100644
@@ -12,6 +12,7 @@
 #include <asm/pgtable.h>
 
 #include <xen/interface/xen.h>
+#include <xen/grant_table.h>
 #include <xen/features.h>
 
 /* Xen machine address */
@@ -48,14 +49,11 @@ extern unsigned long set_phys_range_identity(unsigned long pfn_s,
                                             unsigned long pfn_e);
 
 extern int m2p_add_override(unsigned long mfn, struct page *page,
-                           bool clear_pte);
+                           struct gnttab_map_grant_ref *kmap_op);
 extern int m2p_remove_override(struct page *page, bool clear_pte);
 extern struct page *m2p_find_override(unsigned long mfn);
 extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
 
-#ifdef CONFIG_XEN_DEBUG_FS
-extern int p2m_dump_show(struct seq_file *m, void *v);
-#endif
 static inline unsigned long pfn_to_mfn(unsigned long pfn)
 {
        unsigned long mfn;
arch/x86/kernel/kprobes.c
index f1a6244..794bc95 100644
@@ -75,8 +75,10 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
        /*
         * Undefined/reserved opcodes, conditional jump, Opcode Extension
         * Groups, and some special opcodes can not boost.
+        * This is non-const to keep gcc from statically optimizing it out, as
+        * variable_test_bit makes gcc think only *(unsigned long*) is used.
         */
-static const u32 twobyte_is_boostable[256 / 32] = {
+static u32 twobyte_is_boostable[256 / 32] = {
        /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
        /*      ----------------------------------------------          */
        W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
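
The W() rows above pack one "boostable" bit per two-byte opcode into u32
words. A minimal sketch of how such a bitmap is consulted, with
illustrative names rather than the kernel's exact helpers:

    #include <stdint.h>

    static uint32_t bitmap[256 / 32];           /* one bit per opcode */

    static int bit_is_set(uint8_t opcode)
    {
            /* word index = opcode / 32, bit index = opcode % 32 */
            return (bitmap[opcode >> 5] >> (opcode & 31)) & 1;
    }

Dropping const matters because, per the new comment, variable_test_bit()
tells gcc that only *(unsigned long *) is used, so a const table could be
optimized out entirely.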
arch/x86/pci/xen.c
index 1017c7b..492ade8 100644
@@ -175,8 +175,10 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
                                               "pcifront-msi-x" :
                                               "pcifront-msi",
                                                DOMID_SELF);
-               if (irq < 0)
+               if (irq < 0) {
+                       ret = irq;
                        goto free;
+               }
                i++;
        }
        kfree(v);
@@ -221,8 +223,10 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
                if (msg.data != XEN_PIRQ_MSI_DATA ||
                    xen_irq_from_pirq(pirq) < 0) {
                        pirq = xen_allocate_pirq_msi(dev, msidesc);
-                       if (pirq < 0)
+                       if (pirq < 0) {
+                               irq = -ENODEV;
                                goto error;
+                       }
                        xen_msi_compose_msg(dev, pirq, &msg);
                        __write_msi_msg(msidesc, &msg);
                        dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq);
@@ -244,10 +248,12 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 error:
        dev_err(&dev->dev,
                "Xen PCI frontend has not registered MSI/MSI-X support!\n");
-       return -ENODEV;
+       return irq;
 }
 
 #ifdef CONFIG_XEN_DOM0
+static bool __read_mostly pci_seg_supported = true;
+
 static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
        int ret = 0;
@@ -265,10 +271,11 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 
                memset(&map_irq, 0, sizeof(map_irq));
                map_irq.domid = domid;
-               map_irq.type = MAP_PIRQ_TYPE_MSI;
+               map_irq.type = MAP_PIRQ_TYPE_MSI_SEG;
                map_irq.index = -1;
                map_irq.pirq = -1;
-               map_irq.bus = dev->bus->number;
+               map_irq.bus = dev->bus->number |
+                             (pci_domain_nr(dev->bus) << 16);
                map_irq.devfn = dev->devfn;
 
                if (type == PCI_CAP_ID_MSIX) {
@@ -285,7 +292,20 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
                        map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
                }
 
-               ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+               ret = -EINVAL;
+               if (pci_seg_supported)
+                       ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq,
+                                                   &map_irq);
+               if (ret == -EINVAL && !pci_domain_nr(dev->bus)) {
+                       map_irq.type = MAP_PIRQ_TYPE_MSI;
+                       map_irq.index = -1;
+                       map_irq.pirq = -1;
+                       map_irq.bus = dev->bus->number;
+                       ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq,
+                                                   &map_irq);
+                       if (ret != -EINVAL)
+                               pci_seg_supported = false;
+               }
                if (ret) {
                        dev_warn(&dev->dev, "xen map irq failed %d for %d domain\n",
                                 ret, domid);
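
The MAP_PIRQ_TYPE_MSI_SEG hunk above packs the PCI segment (domain) into
the upper 16 bits of map_irq.bus. A hedged sketch of that encoding and its
inverse (helper names are illustrative):

    #include <stdint.h>

    static uint32_t encode_seg_bus(uint16_t seg, uint8_t bus)
    {
            return (uint32_t)bus | ((uint32_t)seg << 16);
    }

    static uint16_t decode_seg(uint32_t v) { return v >> 16; }
    static uint8_t  decode_bus(uint32_t v) { return v & 0xff; }

Hypervisors that predate the segment-aware type return -EINVAL; the
fallback retries segment 0 with the legacy MAP_PIRQ_TYPE_MSI and latches
pci_seg_supported = false so later devices skip the doomed attempt.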
arch/x86/xen/Kconfig
index 5cc821c..26c731a 100644
@@ -25,8 +25,7 @@ config XEN_PRIVILEGED_GUEST
 
 config XEN_PVHVM
        def_bool y
-       depends on XEN
-       depends on X86_LOCAL_APIC
+       depends on XEN && PCI && X86_LOCAL_APIC
 
 config XEN_MAX_DOMAIN_MEMORY
        int
@@ -49,11 +48,3 @@ config XEN_DEBUG_FS
        help
          Enable statistics output and various tuning options in debugfs.
          Enabling this option may incur a significant performance overhead.
-
-config XEN_DEBUG
-       bool "Enable Xen debug checks"
-       depends on XEN
-       default n
-       help
-         Enable various WARN_ON checks in the Xen MMU code.
-         Enabling this option WILL incur a significant performance overhead.
arch/x86/xen/enlighten.c
index 2d69617..da8afd5 100644
@@ -251,6 +251,7 @@ static void __init xen_init_cpuid_mask(void)
                        ~((1 << X86_FEATURE_APIC) |  /* disable local APIC */
                          (1 << X86_FEATURE_ACPI));  /* disable ACPI */
        ax = 1;
+       cx = 0;
        xen_cpuid(&ax, &bx, &cx, &dx);
 
        xsave_mask =
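
Zeroing cx before the call matters because CPUID reads ECX as an input
(sub-leaf index) for some leaves, so it must not be left undefined. A
self-contained sketch of a wrapper that always supplies ECX (x86-only,
illustrative; not the kernel's xen_cpuid):

    #include <stdint.h>

    static inline void cpuid_count(uint32_t leaf, uint32_t subleaf,
                                   uint32_t *a, uint32_t *b,
                                   uint32_t *c, uint32_t *d)
    {
            asm volatile("cpuid"
                         : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
                         : "0" (leaf), "2" (subleaf));
    }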
arch/x86/xen/mmu.c
index 3dd53f9..87f6673 100644
@@ -495,41 +495,6 @@ static pte_t xen_make_pte(pteval_t pte)
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 
-#ifdef CONFIG_XEN_DEBUG
-pte_t xen_make_pte_debug(pteval_t pte)
-{
-       phys_addr_t addr = (pte & PTE_PFN_MASK);
-       phys_addr_t other_addr;
-       bool io_page = false;
-       pte_t _pte;
-
-       if (pte & _PAGE_IOMAP)
-               io_page = true;
-
-       _pte = xen_make_pte(pte);
-
-       if (!addr)
-               return _pte;
-
-       if (io_page &&
-           (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
-               other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
-               WARN_ONCE(addr != other_addr,
-                       "0x%lx is using VM_IO, but it is 0x%lx!\n",
-                       (unsigned long)addr, (unsigned long)other_addr);
-       } else {
-               pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
-               other_addr = (_pte.pte & PTE_PFN_MASK);
-               WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
-                       "0x%lx is missing VM_IO (and wasn't fixed)!\n",
-                       (unsigned long)addr);
-       }
-
-       return _pte;
-}
-PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
-#endif
-
 static pgd_t xen_make_pgd(pgdval_t pgd)
 {
        pgd = pte_pfn_to_mfn(pgd);
@@ -1992,9 +1957,6 @@ void __init xen_ident_map_ISA(void)
 
 static void __init xen_post_allocator_init(void)
 {
-#ifdef CONFIG_XEN_DEBUG
-       pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
-#endif
        pv_mmu_ops.set_pte = xen_set_pte;
        pv_mmu_ops.set_pmd = xen_set_pmd;
        pv_mmu_ops.set_pud = xen_set_pud;
@@ -2404,17 +2366,3 @@ out:
        return err;
 }
 EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
-
-#ifdef CONFIG_XEN_DEBUG_FS
-static int p2m_dump_open(struct inode *inode, struct file *filp)
-{
-       return single_open(filp, p2m_dump_show, NULL);
-}
-
-static const struct file_operations p2m_dump_fops = {
-       .open           = p2m_dump_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-#endif /* CONFIG_XEN_DEBUG_FS */
arch/x86/xen/p2m.c
index 58efeb9..1b267e7 100644
 #include <asm/xen/page.h>
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
+#include <xen/grant_table.h>
 
+#include "multicalls.h"
 #include "xen-ops.h"
 
 static void __init m2p_override_init(void);
@@ -676,7 +678,8 @@ static unsigned long mfn_hash(unsigned long mfn)
 }
 
 /* Add an MFN override for a particular page */
-int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte)
+int m2p_add_override(unsigned long mfn, struct page *page,
+               struct gnttab_map_grant_ref *kmap_op)
 {
        unsigned long flags;
        unsigned long pfn;
@@ -692,16 +695,28 @@ int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte)
                                        "m2p_add_override: pfn %lx not mapped", pfn))
                        return -EINVAL;
        }
-
-       page->private = mfn;
+       WARN_ON(PagePrivate(page));
+       SetPagePrivate(page);
+       set_page_private(page, mfn);
        page->index = pfn_to_mfn(pfn);
 
        if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
                return -ENOMEM;
 
-       if (clear_pte && !PageHighMem(page))
-               /* Just zap old mapping for now */
-               pte_clear(&init_mm, address, ptep);
+       if (kmap_op != NULL) {
+               if (!PageHighMem(page)) {
+                       struct multicall_space mcs =
+                               xen_mc_entry(sizeof(*kmap_op));
+
+                       MULTI_grant_table_op(mcs.mc,
+                                       GNTTABOP_map_grant_ref, kmap_op, 1);
+
+                       xen_mc_issue(PARAVIRT_LAZY_MMU);
+               }
+               /* let's use dev_bus_addr to record the old mfn instead */
+               kmap_op->dev_bus_addr = page->index;
+               page->index = (unsigned long) kmap_op;
+       }
        spin_lock_irqsave(&m2p_override_lock, flags);
        list_add(&page->lru,  &m2p_overrides[mfn_hash(mfn)]);
        spin_unlock_irqrestore(&m2p_override_lock, flags);
@@ -735,13 +750,56 @@ int m2p_remove_override(struct page *page, bool clear_pte)
        spin_lock_irqsave(&m2p_override_lock, flags);
        list_del(&page->lru);
        spin_unlock_irqrestore(&m2p_override_lock, flags);
-       set_phys_to_machine(pfn, page->index);
+       WARN_ON(!PagePrivate(page));
+       ClearPagePrivate(page);
 
-       if (clear_pte && !PageHighMem(page))
-               set_pte_at(&init_mm, address, ptep,
-                               pfn_pte(pfn, PAGE_KERNEL));
-               /* No tlb flush necessary because the caller already
-                * left the pte unmapped. */
+       if (clear_pte) {
+               struct gnttab_map_grant_ref *map_op =
+                       (struct gnttab_map_grant_ref *) page->index;
+               set_phys_to_machine(pfn, map_op->dev_bus_addr);
+               if (!PageHighMem(page)) {
+                       struct multicall_space mcs;
+                       struct gnttab_unmap_grant_ref *unmap_op;
+
+                       /*
+                        * It might be that we queued all the m2p grant table
+                        * hypercalls in a multicall, then m2p_remove_override
+                        * get called before the multicall has actually been
+                        * gets called before the multicall has actually been
+                        * issued. In this case the handle is going to be -1 because
+                        */
+                       if (map_op->handle == -1)
+                               xen_mc_flush();
+                       /*
+                        * Now if map_op->handle is negative it means that the
+                        * hypercall actually returned an error.
+                        */
+                       if (map_op->handle == GNTST_general_error) {
+                               printk(KERN_WARNING "m2p_remove_override: "
+                                               "pfn %lx mfn %lx, failed to modify kernel mappings",
+                                               pfn, mfn);
+                               return -1;
+                       }
+
+                       mcs = xen_mc_entry(
+                                       sizeof(struct gnttab_unmap_grant_ref));
+                       unmap_op = mcs.args;
+                       unmap_op->host_addr = map_op->host_addr;
+                       unmap_op->handle = map_op->handle;
+                       unmap_op->dev_bus_addr = 0;
+
+                       MULTI_grant_table_op(mcs.mc,
+                                       GNTTABOP_unmap_grant_ref, unmap_op, 1);
+
+                       xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+                       set_pte_at(&init_mm, address, ptep,
+                                       pfn_pte(pfn, PAGE_KERNEL));
+                       __flush_tlb_single(address);
+                       map_op->host_addr = 0;
+               }
+       } else
+               set_phys_to_machine(pfn, page->index);
 
        return 0;
 }
@@ -758,7 +816,7 @@ struct page *m2p_find_override(unsigned long mfn)
        spin_lock_irqsave(&m2p_override_lock, flags);
 
        list_for_each_entry(p, bucket, lru) {
-               if (p->private == mfn) {
+               if (page_private(p) == mfn) {
                        ret = p;
                        break;
                }
@@ -782,17 +840,21 @@ unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
 EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
 
 #ifdef CONFIG_XEN_DEBUG_FS
-
-int p2m_dump_show(struct seq_file *m, void *v)
+#include <linux/debugfs.h>
+#include "debugfs.h"
+static int p2m_dump_show(struct seq_file *m, void *v)
 {
        static const char * const level_name[] = { "top", "middle",
-                                               "entry", "abnormal" };
-       static const char * const type_name[] = { "identity", "missing",
-                                               "pfn", "abnormal"};
+                                               "entry", "abnormal", "error"};
 #define TYPE_IDENTITY 0
 #define TYPE_MISSING 1
 #define TYPE_PFN 2
 #define TYPE_UNKNOWN 3
+       static const char * const type_name[] = {
+                               [TYPE_IDENTITY] = "identity",
+                               [TYPE_MISSING] = "missing",
+                               [TYPE_PFN] = "pfn",
+                               [TYPE_UNKNOWN] = "abnormal"};
        unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0;
        unsigned int uninitialized_var(prev_level);
        unsigned int uninitialized_var(prev_type);
@@ -856,4 +918,32 @@ int p2m_dump_show(struct seq_file *m, void *v)
 #undef TYPE_PFN
 #undef TYPE_UNKNOWN
 }
-#endif
+
+static int p2m_dump_open(struct inode *inode, struct file *filp)
+{
+       return single_open(filp, p2m_dump_show, NULL);
+}
+
+static const struct file_operations p2m_dump_fops = {
+       .open           = p2m_dump_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static struct dentry *d_mmu_debug;
+
+static int __init xen_p2m_debugfs(void)
+{
+       struct dentry *d_xen = xen_init_debugfs();
+
+       if (d_xen == NULL)
+               return -ENOMEM;
+
+       d_mmu_debug = debugfs_create_dir("mmu", d_xen);
+
+       debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
+       return 0;
+}
+fs_initcall(xen_p2m_debugfs);
+#endif /* CONFIG_XEN_DEBUG_FS */
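
With p2m_dump_show() now private to p2m.c, the dump is reachable through
the file created above; assuming debugfs is mounted in the usual place,
reading it should look like:

    # cat /sys/kernel/debug/xen/mmu/p2m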
arch/x86/xen/setup.c
index 46d6d21..38d0af4 100644
@@ -37,7 +37,10 @@ extern void xen_syscall_target(void);
 extern void xen_syscall32_target(void);
 
 /* Amount of extra memory space we add to the e820 ranges */
-phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
+struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
+
+/* Number of pages released from the initial allocation. */
+unsigned long xen_released_pages;
 
 /* 
  * The maximum amount of extra memory compared to the base size.  The
@@ -51,48 +54,47 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
  */
 #define EXTRA_MEM_RATIO                (10)
 
-static void __init xen_add_extra_mem(unsigned long pages)
+static void __init xen_add_extra_mem(u64 start, u64 size)
 {
        unsigned long pfn;
+       int i;
 
-       u64 size = (u64)pages * PAGE_SIZE;
-       u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
-
-       if (!pages)
-               return;
-
-       e820_add_region(extra_start, size, E820_RAM);
-       sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-
-       memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA");
+       for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+               /* Add new region. */
+               if (xen_extra_mem[i].size == 0) {
+                       xen_extra_mem[i].start = start;
+                       xen_extra_mem[i].size  = size;
+                       break;
+               }
+               /* Append to existing region. */
+               if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
+                       xen_extra_mem[i].size += size;
+                       break;
+               }
+       }
+       if (i == XEN_EXTRA_MEM_MAX_REGIONS)
+               printk(KERN_WARNING "Warning: not enough extra memory regions\n");
 
-       xen_extra_mem_size += size;
+       memblock_x86_reserve_range(start, start + size, "XEN EXTRA");
 
-       xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
+       xen_max_p2m_pfn = PFN_DOWN(start + size);
 
-       for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++)
+       for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++)
                __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 }
 
-static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
-                                             phys_addr_t end_addr)
+static unsigned long __init xen_release_chunk(unsigned long start,
+                                             unsigned long end)
 {
        struct xen_memory_reservation reservation = {
                .address_bits = 0,
                .extent_order = 0,
                .domid        = DOMID_SELF
        };
-       unsigned long start, end;
        unsigned long len = 0;
        unsigned long pfn;
        int ret;
 
-       start = PFN_UP(start_addr);
-       end = PFN_DOWN(end_addr);
-
-       if (end <= start)
-               return 0;
-
        for(pfn = start; pfn < end; pfn++) {
                unsigned long mfn = pfn_to_mfn(pfn);
 
@@ -117,72 +119,52 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
        return len;
 }
 
-static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
-                                                    const struct e820map *e820)
+static unsigned long __init xen_set_identity_and_release(
+       const struct e820entry *list, size_t map_size, unsigned long nr_pages)
 {
-       phys_addr_t max_addr = PFN_PHYS(max_pfn);
-       phys_addr_t last_end = ISA_END_ADDRESS;
+       phys_addr_t start = 0;
        unsigned long released = 0;
-       int i;
-
-       /* Free any unused memory above the low 1Mbyte. */
-       for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
-               phys_addr_t end = e820->map[i].addr;
-               end = min(max_addr, end);
-
-               if (last_end < end)
-                       released += xen_release_chunk(last_end, end);
-               last_end = max(last_end, e820->map[i].addr + e820->map[i].size);
-       }
-
-       if (last_end < max_addr)
-               released += xen_release_chunk(last_end, max_addr);
-
-       printk(KERN_INFO "released %lu pages of unused memory\n", released);
-       return released;
-}
-
-static unsigned long __init xen_set_identity(const struct e820entry *list,
-                                            ssize_t map_size)
-{
-       phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
-       phys_addr_t start_pci = last;
-       const struct e820entry *entry;
        unsigned long identity = 0;
+       const struct e820entry *entry;
        int i;
 
+       /*
+        * Combine non-RAM regions and gaps until a RAM region (or the
+        * end of the map) is reached, then set the 1:1 map and
+        * release the pages (if available) in those non-RAM regions.
+        *
+        * The combined non-RAM regions are rounded to a whole number
+        * of pages so any partial pages are accessible via the 1:1
+        * mapping.  This is needed for some BIOSes that put (for
+        * example) the DMI tables in a reserved region that begins on
+        * a non-page boundary.
+        */
        for (i = 0, entry = list; i < map_size; i++, entry++) {
-               phys_addr_t start = entry->addr;
-               phys_addr_t end = start + entry->size;
+               phys_addr_t end = entry->addr + entry->size;
 
-               if (start < last)
-                       start = last;
+               if (entry->type == E820_RAM || i == map_size - 1) {
+                       unsigned long start_pfn = PFN_DOWN(start);
+                       unsigned long end_pfn = PFN_UP(end);
 
-               if (end <= start)
-                       continue;
+                       if (entry->type == E820_RAM)
+                               end_pfn = PFN_UP(entry->addr);
 
-               /* Skip over the 1MB region. */
-               if (last > end)
-                       continue;
+                       if (start_pfn < end_pfn) {
+                               if (start_pfn < nr_pages)
+                                       released += xen_release_chunk(
+                                               start_pfn, min(end_pfn, nr_pages));
 
-               if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) {
-                       if (start > start_pci)
                                identity += set_phys_range_identity(
-                                               PFN_UP(start_pci), PFN_DOWN(start));
-
-                       /* Without saving 'last' we would gooble RAM too
-                        * at the end of the loop. */
-                       last = end;
-                       start_pci = end;
-                       continue;
+                                       start_pfn, end_pfn);
+                       }
+                       start = end;
                }
-               start_pci = min(start, start_pci);
-               last = end;
        }
-       if (last > start_pci)
-               identity += set_phys_range_identity(
-                                       PFN_UP(start_pci), PFN_DOWN(last));
-       return identity;
+
+       printk(KERN_INFO "Released %lu pages of unused memory\n", released);
+       printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
+
+       return released;
 }
 
 static unsigned long __init xen_get_max_pages(void)
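
A worked example of the rounding described in the comment above, with
illustrative numbers: a reserved region that starts mid-page is widened
outward so the partial page stays reachable via the 1:1 map.

    #include <stdio.h>

    #define PAGE_SHIFT  12
    #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
    #define PFN_UP(x)   (((x) + (1UL << PAGE_SHIFT) - 1) >> PAGE_SHIFT)

    int main(void)
    {
            /* e.g. a BIOS-reserved region holding DMI tables */
            unsigned long start = 0x9fc00, end = 0xa0000;

            printf("identity-map pfns 0x%lx..0x%lx\n",
                   PFN_DOWN(start), PFN_UP(end));      /* 0x9f..0xa0 */
            return 0;
    }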
@@ -197,21 +179,32 @@ static unsigned long __init xen_get_max_pages(void)
        return min(max_pages, MAX_DOMAIN_PAGES);
 }
 
+static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
+{
+       u64 end = start + size;
+
+       /* Align RAM regions to page boundaries. */
+       if (type == E820_RAM) {
+               start = PAGE_ALIGN(start);
+               end &= ~((u64)PAGE_SIZE - 1);
+       }
+
+       e820_add_region(start, end - start, type);
+}
+
 /**
  * machine_specific_memory_setup - Hook for machine specific memory setup.
  **/
 char * __init xen_memory_setup(void)
 {
        static struct e820entry map[E820MAX] __initdata;
-       static struct e820entry map_raw[E820MAX] __initdata;
 
        unsigned long max_pfn = xen_start_info->nr_pages;
        unsigned long long mem_end;
        int rc;
        struct xen_memory_map memmap;
+       unsigned long max_pages;
        unsigned long extra_pages = 0;
-       unsigned long extra_limit;
-       unsigned long identity_pages = 0;
        int i;
        int op;
 
@@ -237,58 +230,65 @@ char * __init xen_memory_setup(void)
        }
        BUG_ON(rc);
 
-       memcpy(map_raw, map, sizeof(map));
-       e820.nr_map = 0;
-       xen_extra_mem_start = mem_end;
-       for (i = 0; i < memmap.nr_entries; i++) {
-               unsigned long long end;
-
-               /* Guard against non-page aligned E820 entries. */
-               if (map[i].type == E820_RAM)
-                       map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE;
-
-               end = map[i].addr + map[i].size;
-               if (map[i].type == E820_RAM && end > mem_end) {
-                       /* RAM off the end - may be partially included */
-                       u64 delta = min(map[i].size, end - mem_end);
-
-                       map[i].size -= delta;
-                       end -= delta;
-
-                       extra_pages += PFN_DOWN(delta);
-                       /*
-                        * Set RAM below 4GB that is not for us to be unusable.
-                        * This prevents "System RAM" address space from being
-                        * used as potential resource for I/O address (happens
-                        * when 'allocate_resource' is called).
-                        */
-                       if (delta &&
-                               (xen_initial_domain() && end < 0x100000000ULL))
-                               e820_add_region(end, delta, E820_UNUSABLE);
+       /* Make sure the Xen-supplied memory map is well-ordered. */
+       sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
+
+       max_pages = xen_get_max_pages();
+       if (max_pages > max_pfn)
+               extra_pages += max_pages - max_pfn;
+
+       /*
+        * Set P2M for all non-RAM pages and E820 gaps to be identity
+        * type PFNs.  Any RAM pages that would be made inaccessible by
+        * this are first released.
+        */
+       xen_released_pages = xen_set_identity_and_release(
+               map, memmap.nr_entries, max_pfn);
+       extra_pages += xen_released_pages;
+
+       /*
+        * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
+        * factor of the base size.  On non-highmem systems, the base
+        * size is the full initial memory allocation; on highmem it
+        * is limited to the max size of lowmem, so that it doesn't
+        * get completely filled.
+        *
+        * In principle there could be a problem in lowmem systems if
+        * the initial memory is also very large with respect to
+        * lowmem, but we won't try to deal with that here.
+        */
+       extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+                         extra_pages);
+
+       i = 0;
+       while (i < memmap.nr_entries) {
+               u64 addr = map[i].addr;
+               u64 size = map[i].size;
+               u32 type = map[i].type;
+
+               if (type == E820_RAM) {
+                       if (addr < mem_end) {
+                               size = min(size, mem_end - addr);
+                       } else if (extra_pages) {
+                               size = min(size, (u64)extra_pages * PAGE_SIZE);
+                               extra_pages -= size / PAGE_SIZE;
+                               xen_add_extra_mem(addr, size);
+                       } else
+                               type = E820_UNUSABLE;
                }
 
-               if (map[i].size > 0 && end > xen_extra_mem_start)
-                       xen_extra_mem_start = end;
+               xen_align_and_add_e820_region(addr, size, type);
 
-               /* Add region if any remains */
-               if (map[i].size > 0)
-                       e820_add_region(map[i].addr, map[i].size, map[i].type);
+               map[i].addr += size;
+               map[i].size -= size;
+               if (map[i].size == 0)
+                       i++;
        }
-       /* Align the balloon area so that max_low_pfn does not get set
-        * to be at the _end_ of the PCI gap at the far end (fee01000).
-        * Note that xen_extra_mem_start gets set in the loop above to be
-        * past the last E820 region. */
-       if (xen_initial_domain() && (xen_extra_mem_start < (1ULL<<32)))
-               xen_extra_mem_start = (1ULL<<32);
 
        /*
         * In domU, the ISA region is normal, usable memory, but we
         * reserve ISA memory anyway because too many things poke
         * about in there.
-        *
-        * In Dom0, the host E820 information can leave gaps in the
-        * ISA range, which would cause us to release those pages.  To
-        * avoid this, we unconditionally reserve them here.
         */
        e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
                        E820_RESERVED);
@@ -305,44 +305,6 @@ char * __init xen_memory_setup(void)
 
        sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 
-       extra_limit = xen_get_max_pages();
-       if (max_pfn + extra_pages > extra_limit) {
-               if (extra_limit > max_pfn)
-                       extra_pages = extra_limit - max_pfn;
-               else
-                       extra_pages = 0;
-       }
-
-       extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);
-
-       /*
-        * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
-        * factor of the base size.  On non-highmem systems, the base
-        * size is the full initial memory allocation; on highmem it
-        * is limited to the max size of lowmem, so that it doesn't
-        * get completely filled.
-        *
-        * In principle there could be a problem in lowmem systems if
-        * the initial memory is also very large with respect to
-        * lowmem, but we won't try to deal with that here.
-        */
-       extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
-                         max_pfn + extra_pages);
-
-       if (extra_limit >= max_pfn)
-               extra_pages = extra_limit - max_pfn;
-       else
-               extra_pages = 0;
-
-       xen_add_extra_mem(extra_pages);
-
-       /*
-        * Set P2M for all non-RAM pages and E820 gaps to be identity
-        * type PFNs. We supply it with the non-sanitized version
-        * of the E820.
-        */
-       identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
-       printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages);
        return "Xen";
 }
 
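
A worked example of the EXTRA_MEM_RATIO clamp above, assuming MAXMEM is
not the limiting factor: a 512 MiB domain can be granted at most ten times
its base size in ballooned-out extra memory.

    #include <stdio.h>

    #define EXTRA_MEM_RATIO 10

    int main(void)
    {
            unsigned long max_pfn = 131072;      /* 512 MiB in 4 KiB pages */
            unsigned long extra_pages = 2000000; /* released + max_pages slack */
            unsigned long clamp = EXTRA_MEM_RATIO * max_pfn;

            if (extra_pages > clamp)
                    extra_pages = clamp;
            /* prints: 1310720 extra pages (~5120 MiB) */
            printf("%lu extra pages (~%lu MiB)\n",
                   extra_pages, extra_pages * 4 / 1024);
            return 0;
    }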
drivers/block/xen-blkback/blkback.c
index 2330a9a..1540792 100644
@@ -396,7 +396,7 @@ static int xen_blkbk_map(struct blkif_request *req,
                        continue;
 
                ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr),
-                       blkbk->pending_page(pending_req, i), false);
+                       blkbk->pending_page(pending_req, i), NULL);
                if (ret) {
                        pr_alert(DRV_PFX "Failed to install M2P override for %lx (ret: %d)\n",
                                 (unsigned long)map[i].dev_bus_addr, ret);
drivers/pci/xen-pcifront.c
index 6fa215a..90832a9 100644
@@ -400,9 +400,8 @@ static int pcifront_claim_resource(struct pci_dev *dev, void *data)
                        dev_info(&pdev->xdev->dev, "claiming resource %s/%d\n",
                                pci_name(dev), i);
                        if (pci_claim_resource(dev, i)) {
-                               dev_err(&pdev->xdev->dev, "Could not claim "
-                                       "resource %s/%d! Device offline. Try "
-                                       "giving less than 4GB to domain.\n",
+                               dev_err(&pdev->xdev->dev, "Could not claim resource %s/%d! "
+                                       "Device offline. Try using e820_host=1 in the guest config.\n",
                                        pci_name(dev), i);
                        }
                }
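
The reworded message points at the guest configuration rather than at
shrinking the domain. Assuming the standard xm/xl domain config syntax,
the suggested workaround would be a single line:

    # in the domain's config file (hedged example)
    e820_host = 1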
drivers/xen/Kconfig
index 5f7ff8e..8795480 100644
@@ -137,16 +137,6 @@ config XEN_GRANT_DEV_ALLOC
          to other domains. This can be used to implement frontend drivers
          or as part of an inter-domain shared memory channel.
 
-config XEN_PLATFORM_PCI
-       tristate "xen platform pci device driver"
-       depends on XEN_PVHVM && PCI
-       default m
-       help
-         Driver for the Xen PCI Platform device: it is responsible for
-         initializing xenbus and grant_table when running in a Xen HVM
-         domain. As a consequence this driver is required to run any Xen PV
-         frontend on Xen HVM.
-
 config SWIOTLB_XEN
        def_bool y
        depends on PCI
drivers/xen/Makefile
index 72bbb27..974fffd 100644
@@ -14,7 +14,7 @@ obj-$(CONFIG_XEN_GNTDEV)              += xen-gntdev.o
 obj-$(CONFIG_XEN_GRANT_DEV_ALLOC)      += xen-gntalloc.o
 obj-$(CONFIG_XENFS)                    += xenfs/
 obj-$(CONFIG_XEN_SYS_HYPERVISOR)       += sys-hypervisor.o
-obj-$(CONFIG_XEN_PLATFORM_PCI)         += xen-platform-pci.o
+obj-$(CONFIG_XEN_PVHVM)                        += platform-pci.o
 obj-$(CONFIG_XEN_TMEM)                 += tmem.o
 obj-$(CONFIG_SWIOTLB_XEN)              += swiotlb-xen.o
 obj-$(CONFIG_XEN_DOM0)                 += pci.o
@@ -23,5 +23,3 @@ obj-$(CONFIG_XEN_PCIDEV_BACKEND)      += xen-pciback/
 xen-evtchn-y                           := evtchn.o
 xen-gntdev-y                           := gntdev.o
 xen-gntalloc-y                         := gntalloc.o
-
-xen-platform-pci-y                     := platform-pci.o
drivers/xen/balloon.c
index 5dfd8f8..5876e1a 100644
@@ -501,20 +501,24 @@ EXPORT_SYMBOL_GPL(balloon_set_new_target);
  * alloc_xenballooned_pages - get pages that have been ballooned out
  * @nr_pages: Number of pages to get
  * @pages: pages returned
+ * @highmem: highmem or lowmem pages
  * @return 0 on success, error otherwise
  */
-int alloc_xenballooned_pages(int nr_pages, struct page** pages)
+int alloc_xenballooned_pages(int nr_pages, struct page **pages, bool highmem)
 {
        int pgno = 0;
        struct page* page;
        mutex_lock(&balloon_mutex);
        while (pgno < nr_pages) {
-               page = balloon_retrieve(true);
-               if (page) {
+               page = balloon_retrieve(highmem);
+               if (page && PageHighMem(page) == highmem) {
                        pages[pgno++] = page;
                } else {
                        enum bp_state st;
-                       st = decrease_reservation(nr_pages - pgno, GFP_HIGHUSER);
+                       if (page)
+                               balloon_append(page);
+                       st = decrease_reservation(nr_pages - pgno,
+                                       highmem ? GFP_HIGHUSER : GFP_USER);
                        if (st != BP_DONE)
                                goto out_undo;
                }
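
A hedged usage sketch of the new signature (kernel context assumed, error
handling elided): callers that need kernel mappings now request lowmem
pages explicitly.

    struct page *pages[8];
    int rc;

    /* false = lowmem, so the pages are directly addressable */
    rc = alloc_xenballooned_pages(8, pages, false);
    if (rc == 0)
            free_xenballooned_pages(8, pages);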
@@ -555,17 +559,40 @@ void free_xenballooned_pages(int nr_pages, struct page** pages)
 }
 EXPORT_SYMBOL(free_xenballooned_pages);
 
-static int __init balloon_init(void)
+static void __init balloon_add_region(unsigned long start_pfn,
+                                     unsigned long pages)
 {
        unsigned long pfn, extra_pfn_end;
        struct page *page;
 
+       /*
+        * If the amount of usable memory has been limited (e.g., with
+        * the 'mem' command line parameter), don't add pages beyond
+        * this limit.
+        */
+       extra_pfn_end = min(max_pfn, start_pfn + pages);
+
+       for (pfn = start_pfn; pfn < extra_pfn_end; pfn++) {
+               page = pfn_to_page(pfn);
+               /* totalram_pages and totalhigh_pages do not
+                  include the boot-time balloon extension, so
+                  don't subtract from it. */
+               __balloon_append(page);
+       }
+}
+
+static int __init balloon_init(void)
+{
+       int i;
+
        if (!xen_domain())
                return -ENODEV;
 
        pr_info("xen/balloon: Initialising balloon driver.\n");
 
-       balloon_stats.current_pages = xen_pv_domain() ? min(xen_start_info->nr_pages, max_pfn) : max_pfn;
+       balloon_stats.current_pages = xen_pv_domain()
+               ? min(xen_start_info->nr_pages - xen_released_pages, max_pfn)
+               : max_pfn;
        balloon_stats.target_pages  = balloon_stats.current_pages;
        balloon_stats.balloon_low   = 0;
        balloon_stats.balloon_high  = 0;
@@ -584,24 +611,13 @@ static int __init balloon_init(void)
 #endif
 
        /*
-        * Initialise the balloon with excess memory space.  We need
-        * to make sure we don't add memory which doesn't exist or
-        * logically exist.  The E820 map can be trimmed to be smaller
-        * than the amount of physical memory due to the mem= command
-        * line parameter.  And if this is a 32-bit non-HIGHMEM kernel
-        * on a system with memory which requires highmem to access,
-        * don't try to use it.
+        * Initialize the balloon with pages from the extra memory
+        * regions (see arch/x86/xen/setup.c).
         */
-       extra_pfn_end = min(min(max_pfn, e820_end_of_ram_pfn()),
-                           (unsigned long)PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size));
-       for (pfn = PFN_UP(xen_extra_mem_start);
-            pfn < extra_pfn_end;
-            pfn++) {
-               page = pfn_to_page(pfn);
-               /* totalram_pages and totalhigh_pages do not include the boot-time
-                  balloon extension, so don't subtract from it. */
-               __balloon_append(page);
-       }
+       for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++)
+               if (xen_extra_mem[i].size)
+                       balloon_add_region(PFN_UP(xen_extra_mem[i].start),
+                                          PFN_DOWN(xen_extra_mem[i].size));
 
        return 0;
 }
drivers/xen/events.c
index 7523719..7a55b29 100644
@@ -432,7 +432,8 @@ static int __must_check xen_allocate_irq_dynamic(void)
 
        irq = irq_alloc_desc_from(first, -1);
 
-       xen_irq_init(irq);
+       if (irq >= 0)
+               xen_irq_init(irq);
 
        return irq;
 }
@@ -713,7 +714,7 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
        mutex_lock(&irq_mapping_update_lock);
 
        irq = xen_allocate_irq_dynamic();
-       if (irq == -1)
+       if (irq < 0)
                goto out;
 
        irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq,
@@ -729,7 +730,7 @@ out:
 error_irq:
        mutex_unlock(&irq_mapping_update_lock);
        xen_free_irq(irq);
-       return -1;
+       return ret;
 }
 #endif
 
@@ -779,7 +780,7 @@ int xen_irq_from_pirq(unsigned pirq)
        mutex_lock(&irq_mapping_update_lock);
 
        list_for_each_entry(info, &xen_irq_list_head, list) {
-               if (info == NULL || info->type != IRQT_PIRQ)
+               if (info->type != IRQT_PIRQ)
                        continue;
                irq = info->irq;
                if (info->u.pirq.pirq == pirq)
@@ -872,11 +873,32 @@ static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
        return err ? : bind_evtchn_to_irq(bind_interdomain.local_port);
 }
 
+static int find_virq(unsigned int virq, unsigned int cpu)
+{
+       struct evtchn_status status;
+       int port, rc = -ENOENT;
+
+       memset(&status, 0, sizeof(status));
+       for (port = 0; port <= NR_EVENT_CHANNELS; port++) {
+               status.dom = DOMID_SELF;
+               status.port = port;
+               rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status);
+               if (rc < 0)
+                       continue;
+               if (status.status != EVTCHNSTAT_virq)
+                       continue;
+               if (status.u.virq == virq && status.vcpu == cpu) {
+                       rc = port;
+                       break;
+               }
+       }
+       return rc;
+}
 
 int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
 {
        struct evtchn_bind_virq bind_virq;
-       int evtchn, irq;
+       int evtchn, irq, ret;
 
        mutex_lock(&irq_mapping_update_lock);
 
@@ -892,10 +914,16 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
 
                bind_virq.virq = virq;
                bind_virq.vcpu = cpu;
-               if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
-                                               &bind_virq) != 0)
-                       BUG();
-               evtchn = bind_virq.port;
+               ret = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+                                               &bind_virq);
+               if (ret == 0)
+                       evtchn = bind_virq.port;
+               else {
+                       if (ret == -EEXIST)
+                               ret = find_virq(virq, cpu);
+                       BUG_ON(ret < 0);
+                       evtchn = ret;
+               }
 
                xen_irq_info_virq_init(cpu, irq, evtchn, virq);
 
@@ -1670,6 +1698,7 @@ void __init xen_init_IRQ(void)
 
        evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq),
                                    GFP_KERNEL);
+       BUG_ON(!evtchn_to_irq);
        for (i = 0; i < NR_EVENT_CHANNELS; i++)
                evtchn_to_irq[i] = -1;
 
drivers/xen/gntdev.c
index f914b26..880798a 100644
@@ -83,6 +83,7 @@ struct grant_map {
        struct ioctl_gntdev_grant_ref *grants;
        struct gnttab_map_grant_ref   *map_ops;
        struct gnttab_unmap_grant_ref *unmap_ops;
+       struct gnttab_map_grant_ref   *kmap_ops;
        struct page **pages;
 };
 
@@ -116,19 +117,22 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
        add->grants    = kzalloc(sizeof(add->grants[0])    * count, GFP_KERNEL);
        add->map_ops   = kzalloc(sizeof(add->map_ops[0])   * count, GFP_KERNEL);
        add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL);
+       add->kmap_ops  = kzalloc(sizeof(add->kmap_ops[0])  * count, GFP_KERNEL);
        add->pages     = kzalloc(sizeof(add->pages[0])     * count, GFP_KERNEL);
        if (NULL == add->grants    ||
            NULL == add->map_ops   ||
            NULL == add->unmap_ops ||
+           NULL == add->kmap_ops  ||
            NULL == add->pages)
                goto err;
 
-       if (alloc_xenballooned_pages(count, add->pages))
+       if (alloc_xenballooned_pages(count, add->pages, false /* lowmem */))
                goto err;
 
        for (i = 0; i < count; i++) {
                add->map_ops[i].handle = -1;
                add->unmap_ops[i].handle = -1;
+               add->kmap_ops[i].handle = -1;
        }
 
        add->index = 0;
@@ -142,6 +146,7 @@ err:
        kfree(add->grants);
        kfree(add->map_ops);
        kfree(add->unmap_ops);
+       kfree(add->kmap_ops);
        kfree(add);
        return NULL;
 }
@@ -243,10 +248,35 @@ static int map_grant_pages(struct grant_map *map)
                        gnttab_set_unmap_op(&map->unmap_ops[i], addr,
                                map->flags, -1 /* handle */);
                }
+       } else {
+               /*
+                * Set up the map_ops corresponding to the pte entries pointing
+                * to the kernel linear addresses of the struct pages.
+                * These ptes are completely different from the user ptes dealt
+                * with by find_grant_ptes.
+                */
+               for (i = 0; i < map->count; i++) {
+                       unsigned level;
+                       unsigned long address = (unsigned long)
+                               pfn_to_kaddr(page_to_pfn(map->pages[i]));
+                       pte_t *ptep;
+                       u64 pte_maddr = 0;
+                       BUG_ON(PageHighMem(map->pages[i]));
+
+                       ptep = lookup_address(address, &level);
+                       pte_maddr = arbitrary_virt_to_machine(ptep).maddr;
+                       gnttab_set_map_op(&map->kmap_ops[i], pte_maddr,
+                               map->flags |
+                               GNTMAP_host_map |
+                               GNTMAP_contains_pte,
+                               map->grants[i].ref,
+                               map->grants[i].domid);
+               }
        }
 
        pr_debug("map %d+%d\n", map->index, map->count);
-       err = gnttab_map_refs(map->map_ops, map->pages, map->count);
+       err = gnttab_map_refs(map->map_ops, use_ptemod ? map->kmap_ops : NULL,
+                       map->pages, map->count);
        if (err)
                return err;
 
@@ -462,13 +492,11 @@ static int gntdev_release(struct inode *inode, struct file *flip)
 
        pr_debug("priv %p\n", priv);
 
-       spin_lock(&priv->lock);
        while (!list_empty(&priv->maps)) {
                map = list_entry(priv->maps.next, struct grant_map, next);
                list_del(&map->next);
                gntdev_put_map(map);
        }
-       spin_unlock(&priv->lock);
 
        if (use_ptemod)
                mmu_notifier_unregister(&priv->mn, priv->mm);
@@ -532,10 +560,11 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
        map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
        if (map) {
                list_del(&map->next);
-               gntdev_put_map(map);
                err = 0;
        }
        spin_unlock(&priv->lock);
+       if (map)
+               gntdev_put_map(map);
        return err;
 }
 
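
The unmap-ioctl hunk above narrows the spinlocked region to the list
manipulation: dropping the last reference in gntdev_put_map() can sleep
while tearing down grant mappings, so it must not run under priv->lock.
A restatement of the resulting pattern, using names from the diff:

    spin_lock(&priv->lock);
    map = gntdev_find_map_index(priv, index, count);
    if (map)
            list_del(&map->next);     /* unlink under the lock */
    spin_unlock(&priv->lock);
    if (map)
            gntdev_put_map(map);      /* may sleep: call it unlocked */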
drivers/xen/grant-table.c
index 4f44b34..8c71ab8 100644
@@ -448,7 +448,8 @@ unsigned int gnttab_max_grant_frames(void)
 EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
 
 int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
-                   struct page **pages, unsigned int count)
+                       struct gnttab_map_grant_ref *kmap_ops,
+                       struct page **pages, unsigned int count)
 {
        int i, ret;
        pte_t *pte;
@@ -488,8 +489,7 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
                         */
                        return -EOPNOTSUPP;
                }
-               ret = m2p_add_override(mfn, pages[i],
-                                      map_ops[i].flags & GNTMAP_contains_pte);
+               ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
                if (ret)
                        return ret;
        }
drivers/xen/pci.c
index cef4baf..6605707 100644
@@ -18,6 +18,7 @@
  */
 
 #include <linux/pci.h>
+#include <linux/acpi.h>
 #include <xen/xen.h>
 #include <xen/interface/physdev.h>
 #include <xen/interface/xen.h>
 #include <asm/xen/hypercall.h>
 #include "../pci/pci.h"
 
+static bool __read_mostly pci_seg_supported = true;
+
 static int xen_add_device(struct device *dev)
 {
        int r;
        struct pci_dev *pci_dev = to_pci_dev(dev);
+#ifdef CONFIG_PCI_IOV
+       struct pci_dev *physfn = pci_dev->physfn;
+#endif
+
+       if (pci_seg_supported) {
+               struct physdev_pci_device_add add = {
+                       .seg = pci_domain_nr(pci_dev->bus),
+                       .bus = pci_dev->bus->number,
+                       .devfn = pci_dev->devfn
+               };
+#ifdef CONFIG_ACPI
+               acpi_handle handle;
+#endif
+
+#ifdef CONFIG_PCI_IOV
+               if (pci_dev->is_virtfn) {
+                       add.flags = XEN_PCI_DEV_VIRTFN;
+                       add.physfn.bus = physfn->bus->number;
+                       add.physfn.devfn = physfn->devfn;
+               } else
+#endif
+               if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn))
+                       add.flags = XEN_PCI_DEV_EXTFN;
+
+#ifdef CONFIG_ACPI
+               handle = DEVICE_ACPI_HANDLE(&pci_dev->dev);
+               if (!handle)
+                       handle = DEVICE_ACPI_HANDLE(pci_dev->bus->bridge);
+#ifdef CONFIG_PCI_IOV
+               if (!handle && pci_dev->is_virtfn)
+                       handle = DEVICE_ACPI_HANDLE(physfn->bus->bridge);
+#endif
+               if (handle) {
+                       acpi_status status;
+
+                       do {
+                               unsigned long long pxm;
+
+                               status = acpi_evaluate_integer(handle, "_PXM",
+                                                              NULL, &pxm);
+                               if (ACPI_SUCCESS(status)) {
+                                       add.optarr[0] = pxm;
+                                       add.flags |= XEN_PCI_DEV_PXM;
+                                       break;
+                               }
+                               status = acpi_get_parent(handle, &handle);
+                       } while (ACPI_SUCCESS(status));
+               }
+#endif /* CONFIG_ACPI */
+
+               r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add);
+               if (r != -ENOSYS)
+                       return r;
+               pci_seg_supported = false;
+       }
 
+       if (pci_domain_nr(pci_dev->bus))
+               r = -ENOSYS;
 #ifdef CONFIG_PCI_IOV
-       if (pci_dev->is_virtfn) {
+       else if (pci_dev->is_virtfn) {
                struct physdev_manage_pci_ext manage_pci_ext = {
                        .bus            = pci_dev->bus->number,
                        .devfn          = pci_dev->devfn,
                        .is_virtfn      = 1,
-                       .physfn.bus     = pci_dev->physfn->bus->number,
-                       .physfn.devfn   = pci_dev->physfn->devfn,
+                       .physfn.bus     = physfn->bus->number,
+                       .physfn.devfn   = physfn->devfn,
                };
 
                r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
                        &manage_pci_ext);
-       } else
+       }
 #endif
-       if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) {
+       else if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) {
                struct physdev_manage_pci_ext manage_pci_ext = {
                        .bus            = pci_dev->bus->number,
                        .devfn          = pci_dev->devfn,
@@ -71,13 +131,27 @@ static int xen_remove_device(struct device *dev)
 {
        int r;
        struct pci_dev *pci_dev = to_pci_dev(dev);
-       struct physdev_manage_pci manage_pci;
 
-       manage_pci.bus = pci_dev->bus->number;
-       manage_pci.devfn = pci_dev->devfn;
+       if (pci_seg_supported) {
+               struct physdev_pci_device device = {
+                       .seg = pci_domain_nr(pci_dev->bus),
+                       .bus = pci_dev->bus->number,
+                       .devfn = pci_dev->devfn
+               };
 
-       r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove,
-               &manage_pci);
+               r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_remove,
+                                         &device);
+       } else if (pci_domain_nr(pci_dev->bus))
+               r = -ENOSYS;
+       else {
+               struct physdev_manage_pci manage_pci = {
+                       .bus = pci_dev->bus->number,
+                       .devfn = pci_dev->devfn
+               };
+
+               r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove,
+                                         &manage_pci);
+       }
 
        return r;
 }
@@ -96,13 +170,16 @@ static int xen_pci_notifier(struct notifier_block *nb,
                r = xen_remove_device(dev);
                break;
        default:
-               break;
+               return NOTIFY_DONE;
        }
-
-       return r;
+       if (r)
+               dev_err(dev, "Failed to %s - passthrough or MSI/MSI-X might fail!\n",
+                       action == BUS_NOTIFY_ADD_DEVICE ? "add" :
+                       (action == BUS_NOTIFY_DEL_DEVICE ? "delete" : "?"));
+       return NOTIFY_OK;
 }
 
-struct notifier_block device_nb = {
+static struct notifier_block device_nb = {
        .notifier_call = xen_pci_notifier,
 };
 
drivers/xen/swiotlb-xen.c
index 6e8c15a..c984768 100644
@@ -38,6 +38,7 @@
 #include <xen/swiotlb-xen.h>
 #include <xen/page.h>
 #include <xen/xen-ops.h>
+#include <xen/hvc-console.h>
 /*
  * Used to do a quick range check in swiotlb_tbl_unmap_single and
  * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
@@ -146,8 +147,10 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
 void __init xen_swiotlb_init(int verbose)
 {
        unsigned long bytes;
-       int rc;
+       int rc = -ENOMEM;
        unsigned long nr_tbl;
+       char *m = NULL;
+       unsigned int repeat = 3;
 
        nr_tbl = swioltb_nr_tbl();
        if (nr_tbl)
@@ -156,16 +159,17 @@ void __init xen_swiotlb_init(int verbose)
                xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
                xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE);
        }
-
+retry:
        bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT;
 
        /*
         * Get IO TLB memory from any location.
         */
        xen_io_tlb_start = alloc_bootmem(bytes);
-       if (!xen_io_tlb_start)
-               panic("Cannot allocate SWIOTLB buffer");
-
+       if (!xen_io_tlb_start) {
+               m = "Cannot allocate Xen-SWIOTLB buffer!\n";
+               goto error;
+       }
        xen_io_tlb_end = xen_io_tlb_start + bytes;
        /*
         * And replace that memory with pages under 4GB.
@@ -173,17 +177,28 @@ void __init xen_swiotlb_init(int verbose)
        rc = xen_swiotlb_fixup(xen_io_tlb_start,
                               bytes,
                               xen_io_tlb_nslabs);
-       if (rc)
+       if (rc) {
+               free_bootmem(__pa(xen_io_tlb_start), bytes);
+               m = "Failed to get contiguous memory for DMA from Xen!\n"\
+                   "You either don't have the permissions, do not have"\
+                   " enough free memory under 4GB, or the hypervisor memory"\
+                   " is too fragmented!";
                goto error;
-
+       }
        start_dma_addr = xen_virt_to_bus(xen_io_tlb_start);
        swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose);
 
        return;
 error:
-       panic("DMA(%d): Failed to exchange pages allocated for DMA with Xen! "\
-             "We either don't have the permission or you do not have enough"\
-             "free memory under 4GB!\n", rc);
+       if (repeat--) {
+               xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */
+                                       (xen_io_tlb_nslabs >> 1));
+               printk(KERN_INFO "Xen-SWIOTLB: Lowering to %luMB\n",
+                     (xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20);
+               goto retry;
+       }
+       xen_raw_printk("%s (rc:%d)", m, rc);
+       panic("%s (rc:%d)", m, rc);
 }
 
 void *
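
The retry loop added above halves the IO TLB, with a 2 MiB floor, for at
most three attempts. A standalone model of that sizing (IO_TLB_SHIFT = 11,
i.e. 2 KiB slabs, is an assumption carried over from mainline swiotlb):

    #include <stdio.h>

    #define IO_TLB_SHIFT 11                  /* assumed 2 KiB slab size */

    int main(void)
    {
            unsigned long nslabs = (64UL << 20) >> IO_TLB_SHIFT; /* 64 MB */
            int repeat = 3;

            while (repeat--) {
                    nslabs = nslabs / 2 > 1024 ? nslabs / 2 : 1024;
                    /* prints 32, 16, then 8 MB */
                    printf("would retry with %lu MB\n",
                           (nslabs << IO_TLB_SHIFT) >> 20);
            }
            return 0;
    }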
@@ -194,6 +209,8 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
        int order = get_order(size);
        u64 dma_mask = DMA_BIT_MASK(32);
        unsigned long vstart;
+       phys_addr_t phys;
+       dma_addr_t dev_addr;
 
        /*
        * Ignore region specifiers - the kernel's ideas of
@@ -209,18 +226,26 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
        vstart = __get_free_pages(flags, order);
        ret = (void *)vstart;
 
+       if (!ret)
+               return ret;
+
        if (hwdev && hwdev->coherent_dma_mask)
-               dma_mask = dma_alloc_coherent_mask(hwdev, flags);
+               dma_mask = hwdev->coherent_dma_mask;
 
-       if (ret) {
+       phys = virt_to_phys(ret);
+       dev_addr = xen_phys_to_bus(phys);
+       if (((dev_addr + size - 1 <= dma_mask)) &&
+           !range_straddles_page_boundary(phys, size))
+               *dma_handle = dev_addr;
+       else {
                if (xen_create_contiguous_region(vstart, order,
                                                 fls64(dma_mask)) != 0) {
                        free_pages(vstart, order);
                        return NULL;
                }
-               memset(ret, 0, size);
                *dma_handle = virt_to_machine(ret).maddr;
        }
+       memset(ret, 0, size);
        return ret;
 }
 EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent);
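The rewritten xen_swiotlb_alloc_coherent() only exchanges pages with the hypervisor when the memory it already obtained is not directly usable: the natural machine address must fit under the device's coherent mask and must not straddle a page boundary (as checked by range_straddles_page_boundary()). A small standalone illustration of the mask test (names hypothetical); subtracting 1 compares the address of the last byte rather than the one-past-the-end address:

#include <stdbool.h>
#include <stdint.h>

/* True when every byte of [dev_addr, dev_addr + size) is under mask. */
static bool dma_addr_fits(uint64_t dev_addr, uint64_t size, uint64_t mask)
{
        return dev_addr + size - 1 <= mask;
}

/* e.g. dma_addr_fits(0xfffff000, 0x2000, UINT32_MAX) is false:
 * the last byte lands at 0x100000fff, beyond a 32-bit mask. */

Note also that the memset() moved out of the branch, so the buffer is zeroed on both paths, and that the allocation is now checked for NULL before being used.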
@@ -230,11 +255,21 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
                          dma_addr_t dev_addr)
 {
        int order = get_order(size);
+       phys_addr_t phys;
+       u64 dma_mask = DMA_BIT_MASK(32);
 
        if (dma_release_from_coherent(hwdev, order, vaddr))
                return;
 
-       xen_destroy_contiguous_region((unsigned long)vaddr, order);
+       if (hwdev && hwdev->coherent_dma_mask)
+               dma_mask = hwdev->coherent_dma_mask;
+
+       phys = virt_to_phys(vaddr);
+
+       if (((dev_addr + size - 1 > dma_mask)) ||
+           range_straddles_page_boundary(phys, size))
+               xen_destroy_contiguous_region((unsigned long)vaddr, order);
+
        free_pages((unsigned long)vaddr, order);
 }
 EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent);
@@ -278,9 +313,10 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
        /*
         * Ensure that the address returned is DMA'ble
         */
-       if (!dma_capable(dev, dev_addr, size))
-               panic("map_single: bounce buffer is not DMA'ble");
-
+       if (!dma_capable(dev, dev_addr, size)) {
+               swiotlb_tbl_unmap_single(dev, map, size, dir);
+               dev_addr = 0;
+       }
        return dev_addr;
 }
 EXPORT_SYMBOL_GPL(xen_swiotlb_map_page);
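This hunk removes another panic: when the bounce buffer turns out not to be reachable by the device, the mapping is undone and 0 is returned, which the DMA API reports to drivers as a mapping error. A hedged, driver-side sketch of how the failure is consumed (dev and page are placeholders, not from this patch):

#include <linux/dma-mapping.h>

static int example_map(struct device *dev, struct page *page)
{
        dma_addr_t addr = dma_map_page(dev, page, 0, PAGE_SIZE,
                                       DMA_TO_DEVICE);

        if (dma_mapping_error(dev, addr))
                return -ENOMEM;         /* back off instead of crashing */
        /* ... program the device with addr; unmap when the I/O ends ... */
        return 0;
}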
index a803144..444345a 100644 (file)
@@ -15,7 +15,6 @@
 #include "conf_space.h"
 #include "conf_space_quirks.h"
 
-#define DRV_NAME       "xen-pciback"
 static int permissive;
 module_param(permissive, bool, 0644);
 
index da3cbdf..3daf862 100644 (file)
@@ -15,7 +15,6 @@ struct pci_bar_info {
        int which;
 };
 
-#define DRV_NAME       "xen-pciback"
 #define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
 #define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
 
@@ -25,7 +24,7 @@ static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data)
        int ret;
 
        ret = xen_pcibk_read_config_word(dev, offset, value, data);
-       if (!atomic_read(&dev->enable_cnt))
+       if (!pci_is_enabled(dev))
                return ret;
 
        for (i = 0; i < PCI_ROM_RESOURCE; i++) {
@@ -187,7 +186,7 @@ static inline void read_dev_bar(struct pci_dev *dev,
 
        bar_info->val = res[pos].start |
                        (res[pos].flags & PCI_REGION_FLAG_MASK);
-       bar_info->len_val = res[pos].end - res[pos].start + 1;
+       bar_info->len_val = resource_size(&res[pos]);
 }
 
 static void *bar_init(struct pci_dev *dev, int offset)
index 921a889..7476791 100644 (file)
@@ -12,7 +12,6 @@
 #include "conf_space_quirks.h"
 
 LIST_HEAD(xen_pcibk_quirks);
-#define        DRV_NAME        "xen-pciback"
 static inline const struct pci_device_id *
 match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
 {
@@ -36,7 +35,7 @@ static struct xen_pcibk_config_quirk *xen_pcibk_find_quirk(struct pci_dev *dev)
                        goto out;
        tmp_quirk = NULL;
        printk(KERN_DEBUG DRV_NAME
-              ":quirk didn't match any device xen_pciback knows about\n");
+              ": quirk didn't match any device known\n");
 out:
        return tmp_quirk;
 }
index 1d32a9a..828dddc 100644 (file)
@@ -7,13 +7,13 @@
 
 #include <linux/list.h>
 #include <linux/pci.h>
-#include <linux/spinlock.h>
+#include <linux/mutex.h>
 #include "pciback.h"
 
 struct passthrough_dev_data {
        /* Access to dev_list must be protected by lock */
        struct list_head dev_list;
-       spinlock_t lock;
+       struct mutex lock;
 };
 
 static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev,
@@ -24,9 +24,8 @@ static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev,
        struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
        struct pci_dev_entry *dev_entry;
        struct pci_dev *dev = NULL;
-       unsigned long flags;
 
-       spin_lock_irqsave(&dev_data->lock, flags);
+       mutex_lock(&dev_data->lock);
 
        list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
                if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus)
@@ -37,7 +36,7 @@ static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev,
                }
        }
 
-       spin_unlock_irqrestore(&dev_data->lock, flags);
+       mutex_unlock(&dev_data->lock);
 
        return dev;
 }
@@ -48,7 +47,6 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
 {
        struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
        struct pci_dev_entry *dev_entry;
-       unsigned long flags;
        unsigned int domain, bus, devfn;
        int err;
 
@@ -57,9 +55,9 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
                return -ENOMEM;
        dev_entry->dev = dev;
 
-       spin_lock_irqsave(&dev_data->lock, flags);
+       mutex_lock(&dev_data->lock);
        list_add_tail(&dev_entry->list, &dev_data->dev_list);
-       spin_unlock_irqrestore(&dev_data->lock, flags);
+       mutex_unlock(&dev_data->lock);
 
        /* Publish this device. */
        domain = (unsigned int)pci_domain_nr(dev->bus);
@@ -76,9 +74,8 @@ static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev,
        struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
        struct pci_dev_entry *dev_entry, *t;
        struct pci_dev *found_dev = NULL;
-       unsigned long flags;
 
-       spin_lock_irqsave(&dev_data->lock, flags);
+       mutex_lock(&dev_data->lock);
 
        list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
                if (dev_entry->dev == dev) {
@@ -88,7 +85,7 @@ static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev,
                }
        }
 
-       spin_unlock_irqrestore(&dev_data->lock, flags);
+       mutex_unlock(&dev_data->lock);
 
        if (found_dev)
                pcistub_put_pci_dev(found_dev);
@@ -102,7 +99,7 @@ static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev)
        if (!dev_data)
                return -ENOMEM;
 
-       spin_lock_init(&dev_data->lock);
+       mutex_init(&dev_data->lock);
 
        INIT_LIST_HEAD(&dev_data->dev_list);
 
@@ -116,14 +113,14 @@ static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev,
 {
        int err = 0;
        struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
-       struct pci_dev_entry *dev_entry, *e, *tmp;
+       struct pci_dev_entry *dev_entry, *e;
        struct pci_dev *dev;
        int found;
        unsigned int domain, bus;
 
-       spin_lock(&dev_data->lock);
+       mutex_lock(&dev_data->lock);
 
-       list_for_each_entry_safe(dev_entry, tmp, &dev_data->dev_list, list) {
+       list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
                /* Only publish this device as a root if none of its
                 * parent bridges are exported
                 */
@@ -142,16 +139,13 @@ static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev,
                bus = (unsigned int)dev_entry->dev->bus->number;
 
                if (!found) {
-                       spin_unlock(&dev_data->lock);
                        err = publish_root_cb(pdev, domain, bus);
                        if (err)
                                break;
-                       spin_lock(&dev_data->lock);
                }
        }
 
-       if (!err)
-               spin_unlock(&dev_data->lock);
+       mutex_unlock(&dev_data->lock);
 
        return err;
 }
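Throughout passthrough.c the spinlock becomes a mutex, which is what allows the cleaned-up publish loop above to hold the lock across publish_root_cb() instead of dropping and re-taking it around every callback: a mutex, unlike a spinlock, may be held while the callee sleeps. A minimal kernel-style sketch of the pattern (names illustrative):

#include <linux/mutex.h>

static DEFINE_MUTEX(example_lock);

/* Walk a collection and invoke a callback that may sleep; this is
 * only legal because a mutex, not a spinlock, guards the walk. */
static int walk_and_publish(int (*publish_cb)(int item), int nitems)
{
        int i, err = 0;

        mutex_lock(&example_lock);
        for (i = 0; i < nitems && !err; i++)
                err = publish_cb(i);
        mutex_unlock(&example_lock);
        return err;
}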
@@ -182,7 +176,7 @@ static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev,
        return 1;
 }
 
-struct xen_pcibk_backend xen_pcibk_passthrough_backend = {
+const struct xen_pcibk_backend xen_pcibk_passthrough_backend = {
        .name           = "passthrough",
        .init           = __xen_pcibk_init_devices,
        .free           = __xen_pcibk_release_devices,
index aec214a..8f06e1e 100644 (file)
@@ -21,8 +21,6 @@
 #include "conf_space.h"
 #include "conf_space_quirks.h"
 
-#define DRV_NAME       "xen-pciback"
-
 static char *pci_devs_to_hide;
 wait_queue_head_t xen_pcibk_aer_wait_queue;
 /* Add sem to sync AER handling and xen_pcibk remove/reconfigure ops,
@@ -222,6 +220,8 @@ void pcistub_put_pci_dev(struct pci_dev *dev)
        }
 
        spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+       if (WARN_ON(!found_psdev))
+               return;
 
        /* Hold this lock to avoid breaking the link between
        * pcistub and xen_pcibk while AER is being processed
@@ -514,12 +514,9 @@ static void kill_domain_by_device(struct pcistub_device *psdev)
        int err;
        char nodename[PCI_NODENAME_MAX];
 
-       if (!psdev)
-               dev_err(&psdev->dev->dev,
-                       "device is NULL when do AER recovery/kill_domain\n");
+       BUG_ON(!psdev);
        snprintf(nodename, PCI_NODENAME_MAX, "/local/domain/0/backend/pci/%d/0",
                psdev->pdev->xdev->otherend_id);
-       nodename[strlen(nodename)] = '\0';
 
 again:
        err = xenbus_transaction_start(&xbt);
@@ -605,7 +602,7 @@ static pci_ers_result_t common_process(struct pcistub_device *psdev,
        if (test_bit(_XEN_PCIF_active,
                (unsigned long *)&psdev->pdev->sh_info->flags)) {
                dev_dbg(&psdev->dev->dev,
-                       "schedule pci_conf service in xen_pcibk\n");
+                       "schedule pci_conf service in " DRV_NAME "\n");
                xen_pcibk_test_and_schedule_op(psdev->pdev);
        }
 
@@ -995,8 +992,7 @@ out:
                err = count;
        return err;
 }
-
-DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
+static DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
 
 static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
                                   size_t count)
@@ -1015,8 +1011,7 @@ out:
                err = count;
        return err;
 }
-
-DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
+static DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
 
 static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
 {
@@ -1039,8 +1034,7 @@ static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
 
        return count;
 }
-
-DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
+static DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
 
 static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf)
 {
@@ -1069,8 +1063,7 @@ static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf)
        spin_unlock_irqrestore(&pcistub_devices_lock, flags);
        return count;
 }
-
-DRIVER_ATTR(irq_handlers, S_IRUSR, pcistub_irq_handler_show, NULL);
+static DRIVER_ATTR(irq_handlers, S_IRUSR, pcistub_irq_handler_show, NULL);
 
 static ssize_t pcistub_irq_handler_switch(struct device_driver *drv,
                                          const char *buf,
@@ -1106,7 +1099,8 @@ out:
                err = count;
        return err;
 }
-DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL, pcistub_irq_handler_switch);
+static DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL,
+                  pcistub_irq_handler_switch);
 
 static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf,
                                 size_t count)
@@ -1170,8 +1164,8 @@ out:
 
        return count;
 }
-
-DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add);
+static DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show,
+                  pcistub_quirk_add);
 
 static ssize_t permissive_add(struct device_driver *drv, const char *buf,
                              size_t count)
@@ -1236,8 +1230,8 @@ static ssize_t permissive_show(struct device_driver *drv, char *buf)
        spin_unlock_irqrestore(&pcistub_devices_lock, flags);
        return count;
 }
-
-DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add);
+static DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show,
+                  permissive_add);
 
 static void pcistub_exit(void)
 {
@@ -1374,3 +1368,4 @@ module_init(xen_pcibk_init);
 module_exit(xen_pcibk_cleanup);
 
 MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("xen-backend:pci");
index a0e131a..e9b4011 100644 (file)
@@ -15,6 +15,8 @@
 #include <linux/atomic.h>
 #include <xen/interface/io/pciif.h>
 
+#define DRV_NAME       "xen-pciback"
+
 struct pci_dev_entry {
        struct list_head list;
        struct pci_dev *dev;
@@ -27,7 +29,7 @@ struct pci_dev_entry {
 
 struct xen_pcibk_device {
        void *pci_dev_data;
-       spinlock_t dev_lock;
+       struct mutex dev_lock;
        struct xenbus_device *xdev;
        struct xenbus_watch be_watch;
        u8 be_watching;
@@ -89,7 +91,7 @@ typedef int (*publish_pci_root_cb) (struct xen_pcibk_device *pdev,
  *  passthrough - BDFs are exactly like in the host.
  */
 struct xen_pcibk_backend {
-       char *name;
+       const char *name;
        int (*init)(struct xen_pcibk_device *pdev);
        void (*free)(struct xen_pcibk_device *pdev);
        int (*find)(struct pci_dev *pcidev, struct xen_pcibk_device *pdev,
@@ -104,9 +106,9 @@ struct xen_pcibk_backend {
                               unsigned int devfn);
 };
 
-extern struct xen_pcibk_backend xen_pcibk_vpci_backend;
-extern struct xen_pcibk_backend xen_pcibk_passthrough_backend;
-extern struct xen_pcibk_backend *xen_pcibk_backend;
+extern const struct xen_pcibk_backend xen_pcibk_vpci_backend;
+extern const struct xen_pcibk_backend xen_pcibk_passthrough_backend;
+extern const struct xen_pcibk_backend *xen_pcibk_backend;
 
 static inline int xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
                                        struct pci_dev *dev,
@@ -116,13 +118,14 @@ static inline int xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
        if (xen_pcibk_backend && xen_pcibk_backend->add)
                return xen_pcibk_backend->add(pdev, dev, devid, publish_cb);
        return -1;
-};
+}
+
 static inline void xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev,
                                             struct pci_dev *dev)
 {
        if (xen_pcibk_backend && xen_pcibk_backend->free)
                return xen_pcibk_backend->release(pdev, dev);
-};
+}
 
 static inline struct pci_dev *
 xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, unsigned int domain,
@@ -131,7 +134,8 @@ xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, unsigned int domain,
        if (xen_pcibk_backend && xen_pcibk_backend->get)
                return xen_pcibk_backend->get(pdev, domain, bus, devfn);
        return NULL;
-};
+}
+
 /**
 * Add for domain0 PCIE-AER handling. Get guest domain/bus/devfn in xen_pcibk
 * before sending aer request to pcifront, so that guest could identify
@@ -148,25 +152,29 @@ static inline int xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev,
                return xen_pcibk_backend->find(pcidev, pdev, domain, bus,
                                               devfn);
        return -1;
-};
+}
+
 static inline int xen_pcibk_init_devices(struct xen_pcibk_device *pdev)
 {
        if (xen_pcibk_backend && xen_pcibk_backend->init)
                return xen_pcibk_backend->init(pdev);
        return -1;
-};
+}
+
 static inline int xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev,
                                              publish_pci_root_cb cb)
 {
        if (xen_pcibk_backend && xen_pcibk_backend->publish)
                return xen_pcibk_backend->publish(pdev, cb);
        return -1;
-};
+}
+
 static inline void xen_pcibk_release_devices(struct xen_pcibk_device *pdev)
 {
        if (xen_pcibk_backend && xen_pcibk_backend->free)
                return xen_pcibk_backend->free(pdev);
-};
+}
+
 /* Handles events from front-end */
 irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id);
 void xen_pcibk_do_op(struct work_struct *data);
index 8c95c34..63616d7 100644 (file)
@@ -10,7 +10,6 @@
 #include <linux/sched.h>
 #include "pciback.h"
 
-#define DRV_NAME       "xen-pciback"
 int verbose_request;
 module_param(verbose_request, int, 0644);
 
index 4a42cfb..46d140b 100644 (file)
@@ -8,16 +8,15 @@
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/pci.h>
-#include <linux/spinlock.h>
+#include <linux/mutex.h>
 #include "pciback.h"
 
 #define PCI_SLOT_MAX 32
-#define DRV_NAME       "xen-pciback"
 
 struct vpci_dev_data {
        /* Access to dev_list must be protected by lock */
        struct list_head dev_list[PCI_SLOT_MAX];
-       spinlock_t lock;
+       struct mutex lock;
 };
 
 static inline struct list_head *list_first(struct list_head *head)
@@ -33,13 +32,12 @@ static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev,
        struct pci_dev_entry *entry;
        struct pci_dev *dev = NULL;
        struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
-       unsigned long flags;
 
        if (domain != 0 || bus != 0)
                return NULL;
 
        if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
-               spin_lock_irqsave(&vpci_dev->lock, flags);
+               mutex_lock(&vpci_dev->lock);
 
                list_for_each_entry(entry,
                                    &vpci_dev->dev_list[PCI_SLOT(devfn)],
@@ -50,7 +48,7 @@ static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev,
                        }
                }
 
-               spin_unlock_irqrestore(&vpci_dev->lock, flags);
+               mutex_unlock(&vpci_dev->lock);
        }
        return dev;
 }
@@ -71,7 +69,6 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
        int err = 0, slot, func = -1;
        struct pci_dev_entry *t, *dev_entry;
        struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
-       unsigned long flags;
 
        if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
                err = -EFAULT;
@@ -90,7 +87,7 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
 
        dev_entry->dev = dev;
 
-       spin_lock_irqsave(&vpci_dev->lock, flags);
+       mutex_lock(&vpci_dev->lock);
 
        /* Keep multi-function devices together on the virtual PCI bus */
        for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
@@ -129,7 +126,7 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
                         "No more space on root virtual PCI bus");
 
 unlock:
-       spin_unlock_irqrestore(&vpci_dev->lock, flags);
+       mutex_unlock(&vpci_dev->lock);
 
        /* Publish this device. */
        if (!err)
@@ -145,14 +142,13 @@ static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev,
        int slot;
        struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
        struct pci_dev *found_dev = NULL;
-       unsigned long flags;
 
-       spin_lock_irqsave(&vpci_dev->lock, flags);
+       mutex_lock(&vpci_dev->lock);
 
        for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
-               struct pci_dev_entry *e, *tmp;
-               list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
-                                        list) {
+               struct pci_dev_entry *e;
+
+               list_for_each_entry(e, &vpci_dev->dev_list[slot], list) {
                        if (e->dev == dev) {
                                list_del(&e->list);
                                found_dev = e->dev;
@@ -163,7 +159,7 @@ static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev,
        }
 
 out:
-       spin_unlock_irqrestore(&vpci_dev->lock, flags);
+       mutex_unlock(&vpci_dev->lock);
 
        if (found_dev)
                pcistub_put_pci_dev(found_dev);
@@ -178,7 +174,7 @@ static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev)
        if (!vpci_dev)
                return -ENOMEM;
 
-       spin_lock_init(&vpci_dev->lock);
+       mutex_init(&vpci_dev->lock);
 
        for (slot = 0; slot < PCI_SLOT_MAX; slot++)
                INIT_LIST_HEAD(&vpci_dev->dev_list[slot]);
@@ -222,10 +218,9 @@ static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev,
        struct pci_dev_entry *entry;
        struct pci_dev *dev = NULL;
        struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
-       unsigned long flags;
        int found = 0, slot;
 
-       spin_lock_irqsave(&vpci_dev->lock, flags);
+       mutex_lock(&vpci_dev->lock);
        for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
                list_for_each_entry(entry,
                            &vpci_dev->dev_list[slot],
@@ -243,11 +238,11 @@ static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev,
                        }
                }
        }
-       spin_unlock_irqrestore(&vpci_dev->lock, flags);
+       mutex_unlock(&vpci_dev->lock);
        return found;
 }
 
-struct xen_pcibk_backend xen_pcibk_vpci_backend = {
+const struct xen_pcibk_backend xen_pcibk_vpci_backend = {
        .name           = "vpci",
        .init           = __xen_pcibk_init_devices,
        .free           = __xen_pcibk_release_devices,
index 978d2c6..474d52e 100644 (file)
@@ -13,7 +13,6 @@
 #include <asm/xen/pci.h>
 #include "pciback.h"
 
-#define        DRV_NAME        "xen-pciback"
 #define INVALID_EVTCHN_IRQ  (-1)
 struct workqueue_struct *xen_pcibk_wq;
 
@@ -44,7 +43,7 @@ static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev)
        pdev->xdev = xdev;
        dev_set_drvdata(&xdev->dev, pdev);
 
-       spin_lock_init(&pdev->dev_lock);
+       mutex_init(&pdev->dev_lock);
 
        pdev->sh_info = NULL;
        pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
@@ -62,14 +61,12 @@ out:
 
 static void xen_pcibk_disconnect(struct xen_pcibk_device *pdev)
 {
-       spin_lock(&pdev->dev_lock);
-
+       mutex_lock(&pdev->dev_lock);
        /* Ensure the guest can't trigger our handler before removing devices */
        if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) {
                unbind_from_irqhandler(pdev->evtchn_irq, pdev);
                pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
        }
-       spin_unlock(&pdev->dev_lock);
 
        /* If the driver domain started an op, make sure we complete it
         * before releasing the shared memory */
@@ -77,13 +74,11 @@ static void xen_pcibk_disconnect(struct xen_pcibk_device *pdev)
        /* Note, the workqueue does not use spinlocks at all. */
        flush_workqueue(xen_pcibk_wq);
 
-       spin_lock(&pdev->dev_lock);
        if (pdev->sh_info != NULL) {
                xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info);
                pdev->sh_info = NULL;
        }
-       spin_unlock(&pdev->dev_lock);
-
+       mutex_unlock(&pdev->dev_lock);
 }
 
 static void free_pdev(struct xen_pcibk_device *pdev)
@@ -120,9 +115,7 @@ static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref,
                goto out;
        }
 
-       spin_lock(&pdev->dev_lock);
        pdev->sh_info = vaddr;
-       spin_unlock(&pdev->dev_lock);
 
        err = bind_interdomain_evtchn_to_irqhandler(
                pdev->xdev->otherend_id, remote_evtchn, xen_pcibk_handle_event,
@@ -132,10 +125,7 @@ static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref,
                                 "Error binding event channel to IRQ");
                goto out;
        }
-
-       spin_lock(&pdev->dev_lock);
        pdev->evtchn_irq = err;
-       spin_unlock(&pdev->dev_lock);
        err = 0;
 
        dev_dbg(&pdev->xdev->dev, "Attached!\n");
@@ -150,6 +140,7 @@ static int xen_pcibk_attach(struct xen_pcibk_device *pdev)
        char *magic = NULL;
 
 
+       mutex_lock(&pdev->dev_lock);
        /* Make sure we only do this setup once */
        if (xenbus_read_driver_state(pdev->xdev->nodename) !=
            XenbusStateInitialised)
@@ -176,7 +167,7 @@ static int xen_pcibk_attach(struct xen_pcibk_device *pdev)
        if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
                xenbus_dev_fatal(pdev->xdev, -EFAULT,
                                 "version mismatch (%s/%s) with pcifront - "
-                                "halting xen_pcibk",
+                                "halting " DRV_NAME,
                                 magic, XEN_PCI_MAGIC);
                goto out;
        }
@@ -194,6 +185,7 @@ static int xen_pcibk_attach(struct xen_pcibk_device *pdev)
 
        dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
 out:
+       mutex_unlock(&pdev->dev_lock);
 
        kfree(magic);
 
@@ -369,6 +361,7 @@ static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev)
 
        dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n");
 
+       mutex_lock(&pdev->dev_lock);
        /* Make sure we only reconfigure once */
        if (xenbus_read_driver_state(pdev->xdev->nodename) !=
            XenbusStateReconfiguring)
@@ -506,6 +499,7 @@ static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev)
        }
 
 out:
+       mutex_unlock(&pdev->dev_lock);
        return 0;
 }
 
@@ -562,6 +556,7 @@ static int xen_pcibk_setup_backend(struct xen_pcibk_device *pdev)
        char dev_str[64];
        char state_str[64];
 
+       mutex_lock(&pdev->dev_lock);
        /* It's possible we could get the call to setup twice, so make sure
         * we're not already connected.
         */
@@ -642,10 +637,10 @@ static int xen_pcibk_setup_backend(struct xen_pcibk_device *pdev)
                                 "Error switching to initialised state!");
 
 out:
+       mutex_unlock(&pdev->dev_lock);
        if (!err)
                /* see if pcifront is already configured (if not, we'll wait) */
                xen_pcibk_attach(pdev);
-
        return err;
 }
 
@@ -724,7 +719,7 @@ static struct xenbus_driver xenbus_xen_pcibk_driver = {
        .otherend_changed       = xen_pcibk_frontend_changed,
 };
 
-struct xen_pcibk_backend *xen_pcibk_backend;
+const struct xen_pcibk_backend *__read_mostly xen_pcibk_backend;
 
 int __init xen_pcibk_xenbus_register(void)
 {
index 6ea852e..d93c708 100644 (file)
@@ -68,6 +68,8 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/bootmem.h>
+#include <linux/swap.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/module.h>
@@ -93,6 +95,15 @@ static unsigned int selfballoon_uphysteresis __read_mostly = 1;
 /* In HZ, controls frequency of worker invocation. */
 static unsigned int selfballoon_interval __read_mostly = 5;
 
+/*
+ * Minimum usable RAM in MB for the selfballooning target.
+ * If non-zero, it is added to totalreserve_pages and self-ballooning
+ * will not balloon below the sum.  If zero, a piecewise linear function
+ * is calculated as a minimum and added to totalreserve_pages.  Note that
+ * setting this value indiscriminately may cause OOMs and crashes.
+ */
+static unsigned int selfballoon_min_usable_mb;
+
 static void selfballoon_process(struct work_struct *work);
 static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process);
 
@@ -189,20 +200,23 @@ static int __init xen_selfballooning_setup(char *s)
 __setup("selfballooning", xen_selfballooning_setup);
 #endif /* CONFIG_FRONTSWAP */
 
+#define MB2PAGES(mb)   ((mb) << (20 - PAGE_SHIFT))
+
 /*
  * Use current balloon size, the goal (vm_committed_as), and hysteresis
  * parameters to set a new target balloon size
  */
 static void selfballoon_process(struct work_struct *work)
 {
-       unsigned long cur_pages, goal_pages, tgt_pages;
+       unsigned long cur_pages, goal_pages, tgt_pages, floor_pages;
+       unsigned long useful_pages;
        bool reset_timer = false;
 
        if (xen_selfballooning_enabled) {
-               cur_pages = balloon_stats.current_pages;
+               cur_pages = totalram_pages;
                tgt_pages = cur_pages; /* default is no change */
                goal_pages = percpu_counter_read_positive(&vm_committed_as) +
-                       balloon_stats.current_pages - totalram_pages;
+                               totalreserve_pages;
 #ifdef CONFIG_FRONTSWAP
                /* allow space for frontswap pages to be repatriated */
                if (frontswap_selfshrinking && frontswap_enabled)
@@ -217,7 +231,26 @@ static void selfballoon_process(struct work_struct *work)
                                ((goal_pages - cur_pages) /
                                  selfballoon_uphysteresis);
                /* else if cur_pages == goal_pages, no change */
-               balloon_set_new_target(tgt_pages);
+               useful_pages = max_pfn - totalreserve_pages;
+               if (selfballoon_min_usable_mb != 0)
+                       floor_pages = totalreserve_pages +
+                                       MB2PAGES(selfballoon_min_usable_mb);
+               /* piecewise linear function ending in ~3% slope */
+               else if (useful_pages < MB2PAGES(16))
+                       floor_pages = max_pfn; /* not worth ballooning */
+               else if (useful_pages < MB2PAGES(64))
+                       floor_pages = totalreserve_pages + MB2PAGES(16) +
+                                       ((useful_pages - MB2PAGES(16)) >> 1);
+               else if (useful_pages < MB2PAGES(512))
+                       floor_pages = totalreserve_pages + MB2PAGES(40) +
+                                       ((useful_pages - MB2PAGES(40)) >> 3);
+               else /* useful_pages >= MB2PAGES(512) */
+                       floor_pages = totalreserve_pages + MB2PAGES(99) +
+                                       ((useful_pages - MB2PAGES(99)) >> 5);
+               if (tgt_pages < floor_pages)
+                       tgt_pages = floor_pages;
+               balloon_set_new_target(tgt_pages +
+                       balloon_stats.current_pages - totalram_pages);
                reset_timer = true;
        }
 #ifdef CONFIG_FRONTSWAP
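The floor computation added above reads most easily as a standalone function. With 4KB pages, MB2PAGES(mb) is simply mb << 8; the sketch below mirrors the branches, and for 1GB of useful memory yields a floor of about 99MB + (1024 - 99)/32, roughly 128MB above the reserve, the final 1/32 term being the "~3% slope" the comment mentions. This is a sketch, assuming selfballoon_min_usable_mb is zero:

#define PAGE_SHIFT 12                   /* 4KB pages assumed */
#define MB2PAGES(mb) ((unsigned long)(mb) << (20 - PAGE_SHIFT))

static unsigned long selfballoon_floor(unsigned long reserve_pages,
                                       unsigned long max_pfn)
{
        unsigned long useful = max_pfn - reserve_pages;

        if (useful < MB2PAGES(16))
                return max_pfn;         /* not worth ballooning */
        if (useful < MB2PAGES(64))
                return reserve_pages + MB2PAGES(16) +
                       ((useful - MB2PAGES(16)) >> 1);
        if (useful < MB2PAGES(512))
                return reserve_pages + MB2PAGES(40) +
                       ((useful - MB2PAGES(40)) >> 3);
        return reserve_pages + MB2PAGES(99) +
               ((useful - MB2PAGES(99)) >> 5);
}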
@@ -340,6 +373,31 @@ static ssize_t store_selfballoon_uphys(struct sys_device *dev,
 static SYSDEV_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR,
                   show_selfballoon_uphys, store_selfballoon_uphys);
 
+SELFBALLOON_SHOW(selfballoon_min_usable_mb, "%d\n",
+                               selfballoon_min_usable_mb);
+
+static ssize_t store_selfballoon_min_usable_mb(struct sys_device *dev,
+                                              struct sysdev_attribute *attr,
+                                              const char *buf,
+                                              size_t count)
+{
+       unsigned long val;
+       int err;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       err = strict_strtoul(buf, 10, &val);
+       if (err || val == 0)
+               return -EINVAL;
+       selfballoon_min_usable_mb = val;
+       return count;
+}
+
+static SYSDEV_ATTR(selfballoon_min_usable_mb, S_IRUGO | S_IWUSR,
+                  show_selfballoon_min_usable_mb,
+                  store_selfballoon_min_usable_mb);
+
 #ifdef CONFIG_FRONTSWAP
 SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking);
 
@@ -421,6 +479,7 @@ static struct attribute *selfballoon_attrs[] = {
        &attr_selfballoon_interval.attr,
        &attr_selfballoon_downhysteresis.attr,
        &attr_selfballoon_uphysteresis.attr,
+       &attr_selfballoon_min_usable_mb.attr,
 #ifdef CONFIG_FRONTSWAP
        &attr_frontswap_selfshrinking.attr,
        &attr_frontswap_hysteresis.attr,
index 090c61e..2eff7a6 100644 (file)
@@ -212,7 +212,9 @@ int xb_init_comms(void)
                printk(KERN_WARNING "XENBUS response ring is not quiescent "
                       "(%08x:%08x): fixing up\n",
                       intf->rsp_cons, intf->rsp_prod);
-               intf->rsp_cons = intf->rsp_prod;
+               /* breaks kdump */
+               if (!reset_devices)
+                       intf->rsp_cons = intf->rsp_prod;
        }
 
        if (xenbus_irq) {
index bd2f90c..cef9b0b 100644 (file)
@@ -684,64 +684,74 @@ static int __init xenbus_probe_initcall(void)
 
 device_initcall(xenbus_probe_initcall);
 
-static int __init xenbus_init(void)
+/* Set up the event channel for xenstored when it runs as a local process
+ * (this is normally the case only in dom0).
+ */
+static int __init xenstored_local_init(void)
 {
        int err = 0;
        unsigned long page = 0;
+       struct evtchn_alloc_unbound alloc_unbound;
 
-       DPRINTK("");
+       /* Allocate Xenstore page */
+       page = get_zeroed_page(GFP_KERNEL);
+       if (!page)
+               goto out_err;
 
-       err = -ENODEV;
-       if (!xen_domain())
-               return err;
+       xen_store_mfn = xen_start_info->store_mfn =
+               pfn_to_mfn(virt_to_phys((void *)page) >>
+                          PAGE_SHIFT);
 
-       /*
-        * Domain0 doesn't have a store_evtchn or store_mfn yet.
-        */
-       if (xen_initial_domain()) {
-               struct evtchn_alloc_unbound alloc_unbound;
+       /* Next allocate a local port which xenstored can bind to */
+       alloc_unbound.dom        = DOMID_SELF;
+       alloc_unbound.remote_dom = DOMID_SELF;
 
-               /* Allocate Xenstore page */
-               page = get_zeroed_page(GFP_KERNEL);
-               if (!page)
-                       goto out_error;
+       err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+                                         &alloc_unbound);
+       if (err == -ENOSYS)
+               goto out_err;
 
-               xen_store_mfn = xen_start_info->store_mfn =
-                       pfn_to_mfn(virt_to_phys((void *)page) >>
-                                  PAGE_SHIFT);
+       BUG_ON(err);
+       xen_store_evtchn = xen_start_info->store_evtchn =
+               alloc_unbound.port;
 
-               /* Next allocate a local port which xenstored can bind to */
-               alloc_unbound.dom        = DOMID_SELF;
-               alloc_unbound.remote_dom = 0;
+       return 0;
 
-               err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
-                                                 &alloc_unbound);
-               if (err == -ENOSYS)
-                       goto out_error;
+ out_err:
+       if (page != 0)
+               free_page(page);
+       return err;
+}
 
-               BUG_ON(err);
-               xen_store_evtchn = xen_start_info->store_evtchn =
-                       alloc_unbound.port;
+static int __init xenbus_init(void)
+{
+       int err = 0;
 
-               xen_store_interface = mfn_to_virt(xen_store_mfn);
+       if (!xen_domain())
+               return -ENODEV;
+
+       if (xen_hvm_domain()) {
+               uint64_t v = 0;
+               err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
+               if (err)
+                       goto out_error;
+               xen_store_evtchn = (int)v;
+               err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v);
+               if (err)
+                       goto out_error;
+               xen_store_mfn = (unsigned long)v;
+               xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE);
        } else {
-               if (xen_hvm_domain()) {
-                       uint64_t v = 0;
-                       err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
-                       if (err)
-                               goto out_error;
-                       xen_store_evtchn = (int)v;
-                       err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v);
+               xen_store_evtchn = xen_start_info->store_evtchn;
+               xen_store_mfn = xen_start_info->store_mfn;
+               if (xen_store_evtchn)
+                       xenstored_ready = 1;
+               else {
+                       err = xenstored_local_init();
                        if (err)
                                goto out_error;
-                       xen_store_mfn = (unsigned long)v;
-                       xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE);
-               } else {
-                       xen_store_evtchn = xen_start_info->store_evtchn;
-                       xen_store_mfn = xen_start_info->store_mfn;
-                       xen_store_interface = mfn_to_virt(xen_store_mfn);
-                       xenstored_ready = 1;
                }
+               xen_store_interface = mfn_to_virt(xen_store_mfn);
        }
 
        /* Initialize the interface to xenstore. */
@@ -760,12 +770,7 @@ static int __init xenbus_init(void)
        proc_mkdir("xen", NULL);
 #endif
 
-       return 0;
-
-  out_error:
-       if (page != 0)
-               free_page(page);
-
+ out_error:
        return err;
 }
 
index 60adf91..32417b5 100644 (file)
@@ -104,8 +104,6 @@ static int xenbus_uevent_backend(struct device *dev,
 
        xdev = to_xenbus_device(dev);
        bus = container_of(xdev->dev.bus, struct xen_bus_type, bus);
-       if (xdev == NULL)
-               return -ENODEV;
 
        if (add_uevent_var(env, "MODALIAS=xen-backend:%s", xdev->devicetype))
                return -ENOMEM;
index ed2ba47..540587e 100644 (file)
@@ -248,10 +248,131 @@ int __xenbus_register_frontend(struct xenbus_driver *drv,
 }
 EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
 
+static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq);
+static int backend_state;
+
+static void xenbus_reset_backend_state_changed(struct xenbus_watch *w,
+                                       const char **v, unsigned int l)
+{
+       xenbus_scanf(XBT_NIL, v[XS_WATCH_PATH], "", "%i", &backend_state);
+       printk(KERN_DEBUG "XENBUS: backend %s %s\n",
+                       v[XS_WATCH_PATH], xenbus_strstate(backend_state));
+       wake_up(&backend_state_wq);
+}
+
+static void xenbus_reset_wait_for_backend(char *be, int expected)
+{
+       long timeout;
+       timeout = wait_event_interruptible_timeout(backend_state_wq,
+                       backend_state == expected, 5 * HZ);
+       if (timeout <= 0)
+               printk(KERN_INFO "XENBUS: backend %s timed out.\n", be);
+}
+
+/*
+ * Reset frontend if it is in Connected or Closed state.
+ * Wait for backend to catch up.
+ * State Connected happens during kdump, Closed after kexec.
+ */
+static void xenbus_reset_frontend(char *fe, char *be, int be_state)
+{
+       struct xenbus_watch be_watch;
+
+       printk(KERN_DEBUG "XENBUS: backend %s %s\n",
+                       be, xenbus_strstate(be_state));
+
+       memset(&be_watch, 0, sizeof(be_watch));
+       be_watch.node = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/state", be);
+       if (!be_watch.node)
+               return;
+
+       be_watch.callback = xenbus_reset_backend_state_changed;
+       backend_state = XenbusStateUnknown;
+
+       printk(KERN_INFO "XENBUS: triggering reconnect on %s\n", be);
+       register_xenbus_watch(&be_watch);
+
+       /* fall through to forward backend to state XenbusStateInitialising */
+       switch (be_state) {
+       case XenbusStateConnected:
+               xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosing);
+               xenbus_reset_wait_for_backend(be, XenbusStateClosing);
+
+       case XenbusStateClosing:
+               xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosed);
+               xenbus_reset_wait_for_backend(be, XenbusStateClosed);
+
+       case XenbusStateClosed:
+               xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateInitialising);
+               xenbus_reset_wait_for_backend(be, XenbusStateInitWait);
+       }
+
+       unregister_xenbus_watch(&be_watch);
+       printk(KERN_INFO "XENBUS: reconnect done on %s\n", be);
+       kfree(be_watch.node);
+}
+
+static void xenbus_check_frontend(char *class, char *dev)
+{
+       int be_state, fe_state, err;
+       char *backend, *frontend;
+
+       frontend = kasprintf(GFP_NOIO | __GFP_HIGH, "device/%s/%s", class, dev);
+       if (!frontend)
+               return;
+
+       err = xenbus_scanf(XBT_NIL, frontend, "state", "%i", &fe_state);
+       if (err != 1)
+               goto out;
+
+       switch (fe_state) {
+       case XenbusStateConnected:
+       case XenbusStateClosed:
+               printk(KERN_DEBUG "XENBUS: frontend %s %s\n",
+                               frontend, xenbus_strstate(fe_state));
+               backend = xenbus_read(XBT_NIL, frontend, "backend", NULL);
+               if (!backend || IS_ERR(backend))
+                       goto out;
+               err = xenbus_scanf(XBT_NIL, backend, "state", "%i", &be_state);
+               if (err == 1)
+                       xenbus_reset_frontend(frontend, backend, be_state);
+               kfree(backend);
+               break;
+       default:
+               break;
+       }
+out:
+       kfree(frontend);
+}
+
+static void xenbus_reset_state(void)
+{
+       char **devclass, **dev;
+       int devclass_n, dev_n;
+       int i, j;
+
+       devclass = xenbus_directory(XBT_NIL, "device", "", &devclass_n);
+       if (IS_ERR(devclass))
+               return;
+
+       for (i = 0; i < devclass_n; i++) {
+               dev = xenbus_directory(XBT_NIL, "device", devclass[i], &dev_n);
+               if (IS_ERR(dev))
+                       continue;
+               for (j = 0; j < dev_n; j++)
+                       xenbus_check_frontend(devclass[i], dev[j]);
+               kfree(dev);
+       }
+       kfree(devclass);
+}
+
 static int frontend_probe_and_watch(struct notifier_block *notifier,
                                   unsigned long event,
                                   void *data)
 {
+       /* reset devices in Connected or Closed state */
+       if (xen_hvm_domain())
+               xenbus_reset_state();
        /* Enumerate devices in xenstore and watch for changes. */
        xenbus_probe_devices(&xenbus_frontend);
        register_xenbus_watch(&fe_watch);
index 5534690..b3b8f2f 100644 (file)
@@ -45,6 +45,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <xen/xenbus.h>
+#include <xen/xen.h>
 #include "xenbus_comms.h"
 
 struct xs_stored_msg {
@@ -620,6 +621,15 @@ static struct xenbus_watch *find_watch(const char *token)
        return NULL;
 }
 
+static void xs_reset_watches(void)
+{
+       int err;
+
+       err = xs_error(xs_single(XBT_NIL, XS_RESET_WATCHES, "", NULL));
+       if (err && err != -EEXIST)
+               printk(KERN_WARNING "xs_reset_watches failed: %d\n", err);
+}
+
 /* Register callback to watch this node. */
 int register_xenbus_watch(struct xenbus_watch *watch)
 {
@@ -638,8 +648,7 @@ int register_xenbus_watch(struct xenbus_watch *watch)
 
        err = xs_watch(watch->node, token);
 
-       /* Ignore errors due to multiple registration. */
-       if ((err != 0) && (err != -EEXIST)) {
+       if (err) {
                spin_lock(&watches_lock);
                list_del(&watch->list);
                spin_unlock(&watches_lock);
@@ -897,5 +906,9 @@ int xs_init(void)
        if (IS_ERR(task))
                return PTR_ERR(task);
 
+       /* shut down watches left over from a kexec boot */
+       if (xen_hvm_domain())
+               xs_reset_watches();
+
        return 0;
 }
index 76f7538..d29c153 100644 (file)
@@ -25,8 +25,9 @@ extern struct balloon_stats balloon_stats;
 
 void balloon_set_new_target(unsigned long target);
 
-int alloc_xenballooned_pages(int nr_pages, struct page** pages);
-void free_xenballooned_pages(int nr_pages, struct page** pages);
+int alloc_xenballooned_pages(int nr_pages, struct page **pages,
+               bool highmem);
+void free_xenballooned_pages(int nr_pages, struct page **pages);
 
 struct sys_device;
 #ifdef CONFIG_XEN_SELFBALLOONING
index b1fab6b..6b99bfb 100644 (file)
@@ -156,6 +156,7 @@ unsigned int gnttab_max_grant_frames(void);
 #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
 
 int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
+                       struct gnttab_map_grant_ref *kmap_ops,
                    struct page **pages, unsigned int count);
 int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
                      struct page **pages, unsigned int count);
index 99fcffb..f0b6890 100644 (file)
@@ -26,7 +26,11 @@ enum xsd_sockmsg_type
     XS_SET_PERMS,
     XS_WATCH_EVENT,
     XS_ERROR,
-    XS_IS_DOMAIN_INTRODUCED
+    XS_IS_DOMAIN_INTRODUCED,
+    XS_RESUME,
+    XS_SET_TARGET,
+    XS_RESTRICT,
+    XS_RESET_WATCHES
 };
 
 #define XS_WRITE_NONE "NONE"
index 534cac8..c1080d9 100644 (file)
@@ -109,6 +109,7 @@ struct physdev_irq {
 #define MAP_PIRQ_TYPE_MSI              0x0
 #define MAP_PIRQ_TYPE_GSI              0x1
 #define MAP_PIRQ_TYPE_UNKNOWN          0x2
+#define MAP_PIRQ_TYPE_MSI_SEG          0x3
 
 #define PHYSDEVOP_map_pirq             13
 struct physdev_map_pirq {
@@ -119,7 +120,7 @@ struct physdev_map_pirq {
     int index;
     /* IN or OUT */
     int pirq;
-    /* IN */
+    /* IN - high 16 bits hold segment for MAP_PIRQ_TYPE_MSI_SEG */
     int bus;
     /* IN */
     int devfn;
@@ -198,6 +199,37 @@ struct physdev_get_free_pirq {
     uint32_t pirq;
 };
 
+#define XEN_PCI_DEV_EXTFN              0x1
+#define XEN_PCI_DEV_VIRTFN             0x2
+#define XEN_PCI_DEV_PXM                0x4
+
+#define PHYSDEVOP_pci_device_add        25
+struct physdev_pci_device_add {
+    /* IN */
+    uint16_t seg;
+    uint8_t bus;
+    uint8_t devfn;
+    uint32_t flags;
+    struct {
+        uint8_t bus;
+        uint8_t devfn;
+    } physfn;
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+    uint32_t optarr[];
+#elif defined(__GNUC__)
+    uint32_t optarr[0];
+#endif
+};
+
+#define PHYSDEVOP_pci_device_remove     26
+#define PHYSDEVOP_restore_msi_ext       27
+struct physdev_pci_device {
+    /* IN */
+    uint16_t seg;
+    uint8_t bus;
+    uint8_t devfn;
+};
+
 /*
  * Notify that some PIRQ-bound event channels have been unmasked.
  * ** This command is obsolete since interface version 0x00030202 and is **
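struct physdev_pci_device_add ends with a trailing array spelled optarr[] under C99 and optarr[0] under older GNU C: two spellings of the same variable-length-trailer idiom, where the array contributes nothing to sizeof and space for its elements is allocated past the end of the struct. A standalone sketch (names hypothetical):

#include <stdint.h>
#include <stdlib.h>

struct opt_blob {
        uint32_t flags;
        uint32_t optarr[];      /* flexible array: no size of its own */
};

/* Allocate the header plus room for n trailing option words. */
static struct opt_blob *opt_blob_alloc(size_t n)
{
        return malloc(sizeof(struct opt_blob) + n * sizeof(uint32_t));
}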
index 0be36b9..12765b6 100644 (file)
@@ -3,6 +3,16 @@
 
 #include <asm/xen/page.h>
 
-extern phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
+struct xen_memory_region {
+       phys_addr_t start;
+       phys_addr_t size;
+};
+
+#define XEN_EXTRA_MEM_MAX_REGIONS 128 /* == E820MAX */
+
+extern __initdata
+struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS];
+
+extern unsigned long xen_released_pages;
 
 #endif /* _XEN_PAGE_H */