Merge tag 'stable/for-linus-3.18-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 12 Oct 2014 00:29:01 +0000 (20:29 -0400)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 12 Oct 2014 00:29:01 +0000 (20:29 -0400)
Pull Xen updates from David Vrabel:
 "Features and fixes:

   - Add pvscsi frontend and backend drivers.
   - Remove _PAGE_IOMAP PTE flag, freeing it for alternate uses.
   - Try and keep memory contiguous during PV memory setup (reduces
     SWIOTLB usage).
   - Allow front/back drivers to use threaded irqs.
   - Support large initrds in PV guests.
   - Fix PVH guests in preparation for Xen 4.5"

* tag 'stable/for-linus-3.18-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (22 commits)
  xen: remove DEFINE_XENBUS_DRIVER() macro
  xen/xenbus: Remove BUG_ON() when error string truncated
  xen/xenbus: Correct the comments for xenbus_grant_ring()
  x86/xen: Set EFER.NX and EFER.SCE in PVH guests
  xen: eliminate scalability issues from initrd handling
  xen: sync some headers with xen tree
  xen: make pvscsi frontend dependent on xenbus frontend
  arm{,64}/xen: Remove "EXPERIMENTAL" in the description of the Xen options
  xen-scsifront: don't deadlock if the ring becomes full
  x86: remove the Xen-specific _PAGE_IOMAP PTE flag
  x86/xen: do not use _PAGE_IOMAP PTE flag for I/O mappings
  x86: skip check for spurious faults for non-present faults
  xen/efi: Directly include needed headers
  xen-scsiback: clean up a type issue in scsiback_make_tpg()
  xen-scsifront: use GFP_ATOMIC under spin_lock
  MAINTAINERS: Add xen pvscsi maintainer
  xen-scsiback: Add Xen PV SCSI backend driver
  xen-scsifront: Add Xen PV SCSI frontend driver
  xen: Add Xen pvSCSI protocol description
  xen/events: support threaded irqs for interdomain event channels
  ...

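The most mechanical change in this series is the removal of the DEFINE_XENBUS_DRIVER() macro: front and back ends now fill in struct xenbus_driver directly, and a driver that needs a non-default name sets .name explicitly (as the pcifront hunk below does). A minimal sketch of the new registration pattern follows; the "example" identifiers and callbacks are hypothetical and not taken from any driver in this pull:

#include <linux/init.h>
#include <linux/module.h>
#include <xen/xenbus.h>

static const struct xenbus_device_id example_ids[] = {
	{ "example" },
	{ "" }
};

static int example_probe(struct xenbus_device *dev,
			 const struct xenbus_device_id *id)
{
	/* Allocate per-device state, set up the shared ring, etc. */
	return 0;
}

static struct xenbus_driver example_driver = {
	.ids   = example_ids,
	.probe = example_probe,
	/* .remove, .resume, .otherend_changed as needed */
};

static int __init example_init(void)
{
	/* Without an explicit .name, registration derives the driver
	 * name from the module instead of a macro argument. */
	return xenbus_register_frontend(&example_driver);
}
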
46 files changed:
MAINTAINERS
arch/arm/Kconfig
arch/arm64/Kconfig
arch/x86/include/asm/pgtable_types.h
arch/x86/mm/fault.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/x86/pci/i386.c
arch/x86/xen/efi.c
arch/x86/xen/enlighten.c
arch/x86/xen/mmu.c
arch/x86/xen/p2m.c
arch/x86/xen/p2m.h [new file with mode: 0644]
arch/x86/xen/setup.c
arch/x86/xen/smp.c
arch/x86/xen/smp.h
arch/x86/xen/xen-head.S
drivers/block/xen-blkback/xenbus.c
drivers/block/xen-blkfront.c
drivers/char/tpm/xen-tpmfront.c
drivers/input/misc/xen-kbdfront.c
drivers/net/xen-netback/xenbus.c
drivers/net/xen-netfront.c
drivers/pci/xen-pcifront.c
drivers/scsi/Kconfig
drivers/scsi/Makefile
drivers/scsi/xen-scsifront.c [new file with mode: 0644]
drivers/tty/hvc/hvc_xen.c
drivers/video/fbdev/xen-fbfront.c
drivers/xen/Kconfig
drivers/xen/Makefile
drivers/xen/efi.c
drivers/xen/events/events_base.c
drivers/xen/grant-table.c
drivers/xen/xen-pciback/xenbus.c
drivers/xen/xen-scsiback.c [new file with mode: 0644]
drivers/xen/xenbus/xenbus_client.c
drivers/xen/xenbus/xenbus_probe.c
drivers/xen/xenbus/xenbus_probe.h
drivers/xen/xenbus/xenbus_probe_backend.c
drivers/xen/xenbus/xenbus_probe_frontend.c
include/xen/events.h
include/xen/interface/elfnote.h
include/xen/interface/io/vscsiif.h [new file with mode: 0644]
include/xen/interface/xen.h
include/xen/xenbus.h

index b28dc11..f8d882e 100644 (file)
@@ -10268,6 +10268,15 @@ S:     Supported
 F:     drivers/block/xen-blkback/*
 F:     drivers/block/xen*
 
+XEN PVSCSI DRIVERS
+M:     Juergen Gross <jgross@suse.com>
+L:     xen-devel@lists.xenproject.org (moderated for non-subscribers)
+L:     linux-scsi@vger.kernel.org
+S:     Supported
+F:     drivers/scsi/xen-scsifront.c
+F:     drivers/xen/xen-scsiback.c
+F:     include/xen/interface/io/vscsiif.h
+
 XEN SWIOTLB SUBSYSTEM
 M:     Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
 L:     xen-devel@lists.xenproject.org (moderated for non-subscribers)
index 18f392f..89c4b5c 100644 (file)
@@ -1779,7 +1779,7 @@ config XEN_DOM0
        depends on XEN
 
 config XEN
-       bool "Xen guest support on ARM (EXPERIMENTAL)"
+       bool "Xen guest support on ARM"
        depends on ARM && AEABI && OF
        depends on CPU_V7 && !CPU_V6
        depends on !GENERIC_ATOMIC64
index c49ca4c..ac9afde 100644 (file)
@@ -349,7 +349,7 @@ config XEN_DOM0
        depends on XEN
 
 config XEN
-       bool "Xen guest support on ARM64 (EXPERIMENTAL)"
+       bool "Xen guest support on ARM64"
        depends on ARM64 && OF
        select SWIOTLB_XEN
        help
index 0f9724c..0778964 100644 (file)
@@ -23,7 +23,6 @@
 #define _PAGE_BIT_SPECIAL      _PAGE_BIT_SOFTW1
 #define _PAGE_BIT_CPA_TEST     _PAGE_BIT_SOFTW1
 #define _PAGE_BIT_SPLITTING    _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
-#define _PAGE_BIT_IOMAP                _PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */
 #define _PAGE_BIT_HIDDEN       _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
 #define _PAGE_BIT_SOFT_DIRTY   _PAGE_BIT_SOFTW3 /* software dirty tracking */
 #define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
@@ -52,7 +51,7 @@
 #define _PAGE_PSE      (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
 #define _PAGE_GLOBAL   (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
 #define _PAGE_SOFTW1   (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
-#define _PAGE_IOMAP    (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
+#define _PAGE_SOFTW2   (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
 #define _PAGE_PAT      (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
 #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
 #define _PAGE_SPECIAL  (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
 #define __PAGE_KERNEL_LARGE_NOCACHE    (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
 #define __PAGE_KERNEL_LARGE_EXEC       (__PAGE_KERNEL_EXEC | _PAGE_PSE)
 
-#define __PAGE_KERNEL_IO               (__PAGE_KERNEL | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_NOCACHE       (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_UC_MINUS      (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_WC            (__PAGE_KERNEL_WC | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO               (__PAGE_KERNEL)
+#define __PAGE_KERNEL_IO_NOCACHE       (__PAGE_KERNEL_NOCACHE)
+#define __PAGE_KERNEL_IO_UC_MINUS      (__PAGE_KERNEL_UC_MINUS)
+#define __PAGE_KERNEL_IO_WC            (__PAGE_KERNEL_WC)
 
 #define PAGE_KERNEL                    __pgprot(__PAGE_KERNEL)
 #define PAGE_KERNEL_RO                 __pgprot(__PAGE_KERNEL_RO)
index a241946..83bb03b 100644 (file)
@@ -933,8 +933,17 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
  * cross-processor TLB flush, even if no stale TLB entries exist
  * on other processors.
  *
+ * Spurious faults may only occur if the TLB contains an entry with
+ * fewer permissions than the page table entry.  Non-present (P = 0)
+ * and reserved bit (R = 1) faults are never spurious.
+ *
  * There are no security implications to leaving a stale TLB when
  * increasing the permissions on a page.
+ *
+ * Returns non-zero if a spurious fault was handled, zero otherwise.
+ *
+ * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
+ * (Optional Invalidation).
  */
 static noinline int
 spurious_fault(unsigned long error_code, unsigned long address)
@@ -945,8 +954,17 @@ spurious_fault(unsigned long error_code, unsigned long address)
        pte_t *pte;
        int ret;
 
-       /* Reserved-bit violation or user access to kernel space? */
-       if (error_code & (PF_USER | PF_RSVD))
+       /*
+        * Only writes to RO or instruction fetches from NX may cause
+        * spurious faults.
+        *
+        * These could be from user or supervisor accesses but the TLB
+        * is only lazily flushed after a kernel mapping protection
+        * change, so user accesses are not expected to cause spurious
+        * faults.
+        */
+       if (error_code != (PF_WRITE | PF_PROT)
+           && error_code != (PF_INSTR | PF_PROT))
                return 0;
 
        pgd = init_mm.pgd + pgd_index(address);
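
The rewritten check above relies only on the hardware-provided page-fault error code. As a self-contained illustration (not part of the patch), the same test expressed with the PF_* bit values defined at the top of arch/x86/mm/fault.c:

/* x86 page-fault error code bits */
#define PF_PROT		(1UL << 0)	/* fault on a present page */
#define PF_WRITE	(1UL << 1)	/* write access */
#define PF_USER		(1UL << 2)	/* fault taken in user mode */
#define PF_RSVD		(1UL << 3)	/* reserved bit set in a paging entry */
#define PF_INSTR	(1UL << 4)	/* instruction fetch */

/*
 * Only a write to a present read-only page or an instruction fetch from
 * a present NX page can be cured by flushing a stale TLB entry; every
 * other error code is rejected before the page-table walk.
 */
static int fault_may_be_spurious(unsigned long error_code)
{
	return error_code == (PF_WRITE | PF_PROT) ||
	       error_code == (PF_INSTR | PF_PROT);
}
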
index 7d05565..c8140e1 100644 (file)
@@ -537,7 +537,7 @@ static void __init pagetable_init(void)
        permanent_kmaps_init(pgd_base);
 }
 
-pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
+pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
 /* user-defined highmem size */
index 5621c47..5d98476 100644 (file)
@@ -151,7 +151,7 @@ early_param("gbpages", parse_direct_gbpages_on);
  * around without checking the pgd every time.
  */
 
-pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
+pteval_t __supported_pte_mask __read_mostly = ~0;
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
 int force_personality32;
index 2ae525e..37c1435 100644 (file)
@@ -442,8 +442,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
                 */
                prot |= _PAGE_CACHE_UC_MINUS;
 
-       prot |= _PAGE_IOMAP;    /* creating a mapping for IO */
-
        vma->vm_page_prot = __pgprot(prot);
 
        if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
index a02e09e..be14cc3 100644 (file)
  * with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/bitops.h>
 #include <linux/efi.h>
 #include <linux/init.h>
 #include <linux/string.h>
 
 #include <xen/xen-ops.h>
 
+#include <asm/page.h>
 #include <asm/setup.h>
 
 void __init xen_efi_init(void)
index c0cb11f..acb0eff 100644 (file)
@@ -1463,6 +1463,7 @@ static void __ref xen_setup_gdt(int cpu)
        pv_cpu_ops.load_gdt = xen_load_gdt;
 }
 
+#ifdef CONFIG_XEN_PVH
 /*
  * A PV guest starts with default flags that are not set for PVH, set them
  * here asap.
@@ -1508,17 +1509,21 @@ static void __init xen_pvh_early_guest_init(void)
                return;
 
        xen_have_vector_callback = 1;
+
+       xen_pvh_early_cpu_init(0, false);
        xen_pvh_set_cr_flags(0);
 
 #ifdef CONFIG_X86_32
        BUG(); /* PVH: Implement proper support. */
 #endif
 }
+#endif    /* CONFIG_XEN_PVH */
 
 /* First C function to be called on Xen boot */
 asmlinkage __visible void __init xen_start_kernel(void)
 {
        struct physdev_set_iopl set_iopl;
+       unsigned long initrd_start = 0;
        int rc;
 
        if (!xen_start_info)
@@ -1527,7 +1532,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
        xen_domain_type = XEN_PV_DOMAIN;
 
        xen_setup_features();
+#ifdef CONFIG_XEN_PVH
        xen_pvh_early_guest_init();
+#endif
        xen_setup_machphys_mapping();
 
        /* Install Xen paravirt ops */
@@ -1559,8 +1566,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
 #endif
                __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
 
-       __supported_pte_mask |= _PAGE_IOMAP;
-
        /*
         * Prevent page tables from being allocated in highmem, even
         * if CONFIG_HIGHPTE is enabled.
@@ -1667,10 +1672,16 @@ asmlinkage __visible void __init xen_start_kernel(void)
        new_cpu_data.x86_capability[0] = cpuid_edx(1);
 #endif
 
+       if (xen_start_info->mod_start) {
+           if (xen_start_info->flags & SIF_MOD_START_PFN)
+               initrd_start = PFN_PHYS(xen_start_info->mod_start);
+           else
+               initrd_start = __pa(xen_start_info->mod_start);
+       }
+
        /* Poke various useful things into boot_params */
        boot_params.hdr.type_of_loader = (9 << 4) | 0;
-       boot_params.hdr.ramdisk_image = xen_start_info->mod_start
-               ? __pa(xen_start_info->mod_start) : 0;
+       boot_params.hdr.ramdisk_image = initrd_start;
        boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
        boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
 
index 16fb009..f62af76 100644 (file)
@@ -399,38 +399,14 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
                if (unlikely(mfn == INVALID_P2M_ENTRY)) {
                        mfn = 0;
                        flags = 0;
-               } else {
-                       /*
-                        * Paramount to do this test _after_ the
-                        * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
-                        * IDENTITY_FRAME_BIT resolves to true.
-                        */
-                       mfn &= ~FOREIGN_FRAME_BIT;
-                       if (mfn & IDENTITY_FRAME_BIT) {
-                               mfn &= ~IDENTITY_FRAME_BIT;
-                               flags |= _PAGE_IOMAP;
-                       }
-               }
+               } else
+                       mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
                val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
        }
 
        return val;
 }
 
-static pteval_t iomap_pte(pteval_t val)
-{
-       if (val & _PAGE_PRESENT) {
-               unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
-               pteval_t flags = val & PTE_FLAGS_MASK;
-
-               /* We assume the pte frame number is a MFN, so
-                  just use it as-is. */
-               val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
-       }
-
-       return val;
-}
-
 __visible pteval_t xen_pte_val(pte_t pte)
 {
        pteval_t pteval = pte.pte;
@@ -441,9 +417,6 @@ __visible pteval_t xen_pte_val(pte_t pte)
                pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
        }
 #endif
-       if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
-               return pteval;
-
        return pte_mfn_to_pfn(pteval);
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
@@ -481,7 +454,6 @@ void xen_set_pat(u64 pat)
 
 __visible pte_t xen_make_pte(pteval_t pte)
 {
-       phys_addr_t addr = (pte & PTE_PFN_MASK);
 #if 0
        /* If Linux is trying to set a WC pte, then map to the Xen WC.
         * If _PAGE_PAT is set, then it probably means it is really
@@ -496,19 +468,7 @@ __visible pte_t xen_make_pte(pteval_t pte)
                        pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
        }
 #endif
-       /*
-        * Unprivileged domains are allowed to do IOMAPpings for
-        * PCI passthrough, but not map ISA space.  The ISA
-        * mappings are just dummy local mappings to keep other
-        * parts of the kernel happy.
-        */
-       if (unlikely(pte & _PAGE_IOMAP) &&
-           (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
-               pte = iomap_pte(pte);
-       } else {
-               pte &= ~_PAGE_IOMAP;
-               pte = pte_pfn_to_mfn(pte);
-       }
+       pte = pte_pfn_to_mfn(pte);
 
        return native_make_pte(pte);
 }
@@ -2091,7 +2051,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 
        default:
                /* By default, set_fixmap is used for hardware mappings */
-               pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
+               pte = mfn_pte(phys, prot);
                break;
        }
 
index 3172692..9f5983b 100644 (file)
 #include <xen/balloon.h>
 #include <xen/grant_table.h>
 
+#include "p2m.h"
 #include "multicalls.h"
 #include "xen-ops.h"
 
@@ -180,12 +181,6 @@ static void __init m2p_override_init(void);
 
 unsigned long xen_max_p2m_pfn __read_mostly;
 
-#define P2M_PER_PAGE           (PAGE_SIZE / sizeof(unsigned long))
-#define P2M_MID_PER_PAGE       (PAGE_SIZE / sizeof(unsigned long *))
-#define P2M_TOP_PER_PAGE       (PAGE_SIZE / sizeof(unsigned long **))
-
-#define MAX_P2M_PFN            (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
-
 /* Placeholders for holes in the address space */
 static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
 static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
@@ -202,16 +197,12 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE);
 RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
 RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
 
-/* We might hit two boundary violations at the start and end, at max each
- * boundary violation will require three middle nodes. */
-RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3);
-
-/* When we populate back during bootup, the amount of pages can vary. The
- * max we have is seen is 395979, but that does not mean it can't be more.
- * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle
- * it can re-use Xen provided mfn_list array, so we only need to allocate at
- * most three P2M top nodes. */
-RESERVE_BRK(p2m_populated, PAGE_SIZE * 3);
+/* For each I/O range remapped we may lose up to two leaf pages for the boundary
+ * violations and three mid pages to cover up to 3GB. With
+ * early_can_reuse_p2m_middle() most of the leaf pages will be reused by the
+ * remapped region.
+ */
+RESERVE_BRK(p2m_identity_remap, PAGE_SIZE * 2 * 3 * MAX_REMAP_RANGES);
 
 static inline unsigned p2m_top_index(unsigned long pfn)
 {
diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h
new file mode 100644 (file)
index 0000000..ad8aee2
--- /dev/null
@@ -0,0 +1,15 @@
+#ifndef _XEN_P2M_H
+#define _XEN_P2M_H
+
+#define P2M_PER_PAGE        (PAGE_SIZE / sizeof(unsigned long))
+#define P2M_MID_PER_PAGE    (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE    (PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN         (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
+#define MAX_REMAP_RANGES    10
+
+extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
+                                      unsigned long pfn_e);
+
+#endif  /* _XEN_P2M_H */
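
For scale: on x86-64 (4 KiB pages, 8-byte longs and pointers) each of P2M_PER_PAGE, P2M_MID_PER_PAGE and P2M_TOP_PER_PAGE evaluates to 512, so MAX_P2M_PFN = 512 * 512 * 512 = 134217728 PFNs, i.e. the three-level p2m can cover up to 512 GiB of guest pseudo-physical space, and each leaf page describes an aligned block of 512 PFNs (2 MiB), which is the granularity the remap code in setup.c below works in.
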
index 2e55516..af72161 100644 (file)
@@ -29,6 +29,7 @@
 #include <xen/features.h>
 #include "xen-ops.h"
 #include "vdso.h"
+#include "p2m.h"
 
 /* These are code, but not functions.  Defined in entry.S */
 extern const char xen_hypervisor_callback[];
@@ -46,6 +47,9 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
 /* Number of pages released from the initial allocation. */
 unsigned long xen_released_pages;
 
+/* Buffer used to remap identity mapped pages */
+unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata;
+
 /* 
  * The maximum amount of extra memory compared to the base size.  The
  * main scaling factor is the size of struct page.  At extreme ratios
@@ -151,107 +155,325 @@ static unsigned long __init xen_do_chunk(unsigned long start,
        return len;
 }
 
-static unsigned long __init xen_release_chunk(unsigned long start,
-                                             unsigned long end)
-{
-       return xen_do_chunk(start, end, true);
-}
-
-static unsigned long __init xen_populate_chunk(
+/*
+ * Finds the next RAM pfn available in the E820 map after min_pfn.
+ * This function updates min_pfn with the pfn found and returns
+ * the size of that range or zero if not found.
+ */
+static unsigned long __init xen_find_pfn_range(
        const struct e820entry *list, size_t map_size,
-       unsigned long max_pfn, unsigned long *last_pfn,
-       unsigned long credits_left)
+       unsigned long *min_pfn)
 {
        const struct e820entry *entry;
        unsigned int i;
        unsigned long done = 0;
-       unsigned long dest_pfn;
 
        for (i = 0, entry = list; i < map_size; i++, entry++) {
                unsigned long s_pfn;
                unsigned long e_pfn;
-               unsigned long pfns;
-               long capacity;
-
-               if (credits_left <= 0)
-                       break;
 
                if (entry->type != E820_RAM)
                        continue;
 
                e_pfn = PFN_DOWN(entry->addr + entry->size);
 
-               /* We only care about E820 after the xen_start_info->nr_pages */
-               if (e_pfn <= max_pfn)
+               /* We only care about E820 after this */
+               if (e_pfn < *min_pfn)
                        continue;
 
                s_pfn = PFN_UP(entry->addr);
-               /* If the E820 falls within the nr_pages, we want to start
-                * at the nr_pages PFN.
-                * If that would mean going past the E820 entry, skip it
+
+               /* If min_pfn falls within the E820 entry, we want to start
+                * at the min_pfn PFN.
                 */
-               if (s_pfn <= max_pfn) {
-                       capacity = e_pfn - max_pfn;
-                       dest_pfn = max_pfn;
+               if (s_pfn <= *min_pfn) {
+                       done = e_pfn - *min_pfn;
                } else {
-                       capacity = e_pfn - s_pfn;
-                       dest_pfn = s_pfn;
+                       done = e_pfn - s_pfn;
+                       *min_pfn = s_pfn;
                }
+               break;
+       }
 
-               if (credits_left < capacity)
-                       capacity = credits_left;
+       return done;
+}
 
-               pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
-               done += pfns;
-               *last_pfn = (dest_pfn + pfns);
-               if (pfns < capacity)
-                       break;
-               credits_left -= pfns;
+/*
+ * This releases a chunk of memory and then does the identity map. It's used as
+ * as a fallback if the remapping fails.
+ */
+static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
+       unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity,
+       unsigned long *released)
+{
+       WARN_ON(start_pfn > end_pfn);
+
+       /* Need to release pages first */
+       *released += xen_do_chunk(start_pfn, min(end_pfn, nr_pages), true);
+       *identity += set_phys_range_identity(start_pfn, end_pfn);
+}
+
+/*
+ * Helper function to update both the p2m and m2p tables.
+ */
+static unsigned long __init xen_update_mem_tables(unsigned long pfn,
+                                                 unsigned long mfn)
+{
+       struct mmu_update update = {
+               .ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
+               .val = pfn
+       };
+
+       /* Update p2m */
+       if (!early_set_phys_to_machine(pfn, mfn)) {
+               WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
+                    pfn, mfn);
+               return false;
        }
-       return done;
+
+       /* Update m2p */
+       if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
+               WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
+                    mfn, pfn);
+               return false;
+       }
+
+       return true;
 }
 
-static void __init xen_set_identity_and_release_chunk(
-       unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
-       unsigned long *released, unsigned long *identity)
+/*
+ * This function updates the p2m and m2p tables with an identity map from
+ * start_pfn to start_pfn+size and remaps the underlying RAM of the original
+ * allocation at remap_pfn. It must do so carefully in P2M_PER_PAGE sized blocks
+ * to not exhaust the reserved brk space. Doing it in properly aligned blocks
+ * ensures we only allocate the minimum required leaf pages in the p2m table. It
+ * copies the existing mfns from the p2m table under the 1:1 map, overwrites
+ * them with the identity map and then updates the p2m and m2p tables with the
+ * remapped memory.
+ */
+static unsigned long __init xen_do_set_identity_and_remap_chunk(
+        unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
 {
-       unsigned long pfn;
+       unsigned long ident_pfn_iter, remap_pfn_iter;
+       unsigned long ident_start_pfn_align, remap_start_pfn_align;
+       unsigned long ident_end_pfn_align, remap_end_pfn_align;
+       unsigned long ident_boundary_pfn, remap_boundary_pfn;
+       unsigned long ident_cnt = 0;
+       unsigned long remap_cnt = 0;
+       unsigned long left = size;
+       unsigned long mod;
+       int i;
+
+       WARN_ON(size == 0);
+
+       BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
 
        /*
-        * If the PFNs are currently mapped, clear the mappings
-        * (except for the ISA region which must be 1:1 mapped) to
-        * release the refcounts (in Xen) on the original frames.
+        * Determine the proper alignment to remap memory in P2M_PER_PAGE sized
+        * blocks. We need to keep track of both the existing pfn mapping and
+        * the new pfn remapping.
         */
-       for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
-               pte_t pte = __pte_ma(0);
+       mod = start_pfn % P2M_PER_PAGE;
+       ident_start_pfn_align =
+               mod ? (start_pfn - mod + P2M_PER_PAGE) : start_pfn;
+       mod = remap_pfn % P2M_PER_PAGE;
+       remap_start_pfn_align =
+               mod ? (remap_pfn - mod + P2M_PER_PAGE) : remap_pfn;
+       mod = (start_pfn + size) % P2M_PER_PAGE;
+       ident_end_pfn_align = start_pfn + size - mod;
+       mod = (remap_pfn + size) % P2M_PER_PAGE;
+       remap_end_pfn_align = remap_pfn + size - mod;
+
+       /* Iterate over each p2m leaf node in each range */
+       for (ident_pfn_iter = ident_start_pfn_align, remap_pfn_iter = remap_start_pfn_align;
+            ident_pfn_iter < ident_end_pfn_align && remap_pfn_iter < remap_end_pfn_align;
+            ident_pfn_iter += P2M_PER_PAGE, remap_pfn_iter += P2M_PER_PAGE) {
+               /* Check we aren't past the end */
+               BUG_ON(ident_pfn_iter + P2M_PER_PAGE > start_pfn + size);
+               BUG_ON(remap_pfn_iter + P2M_PER_PAGE > remap_pfn + size);
+
+               /* Save p2m mappings */
+               for (i = 0; i < P2M_PER_PAGE; i++)
+                       xen_remap_buf[i] = pfn_to_mfn(ident_pfn_iter + i);
+
+               /* Set identity map which will free a p2m leaf */
+               ident_cnt += set_phys_range_identity(ident_pfn_iter,
+                       ident_pfn_iter + P2M_PER_PAGE);
+
+#ifdef DEBUG
+               /* Helps verify a p2m leaf has been freed */
+               for (i = 0; i < P2M_PER_PAGE; i++) {
+                       unsigned int pfn = ident_pfn_iter + i;
+                       BUG_ON(pfn_to_mfn(pfn) != pfn);
+               }
+#endif
+               /* Now remap memory */
+               for (i = 0; i < P2M_PER_PAGE; i++) {
+                       unsigned long mfn = xen_remap_buf[i];
+
+                       /* This will use the p2m leaf freed above */
+                       if (!xen_update_mem_tables(remap_pfn_iter + i, mfn)) {
+                               WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
+                                       remap_pfn_iter + i, mfn);
+                               return 0;
+                       }
+
+                       remap_cnt++;
+               }
 
-               if (pfn < PFN_UP(ISA_END_ADDRESS))
-                       pte = mfn_pte(pfn, PAGE_KERNEL_IO);
+               left -= P2M_PER_PAGE;
+       }
 
-               (void)HYPERVISOR_update_va_mapping(
-                       (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0);
+       /* Max boundary space possible */
+       BUG_ON(left > (P2M_PER_PAGE - 1) * 2);
+
+       /* Now handle the boundary conditions */
+       ident_boundary_pfn = start_pfn;
+       remap_boundary_pfn = remap_pfn;
+       for (i = 0; i < left; i++) {
+               unsigned long mfn;
+
+               /* These two checks move from the start to end boundaries */
+               if (ident_boundary_pfn == ident_start_pfn_align)
+                       ident_boundary_pfn = ident_pfn_iter;
+               if (remap_boundary_pfn == remap_start_pfn_align)
+                       remap_boundary_pfn = remap_pfn_iter;
+
+               /* Check we aren't past the end */
+               BUG_ON(ident_boundary_pfn >= start_pfn + size);
+               BUG_ON(remap_boundary_pfn >= remap_pfn + size);
+
+               mfn = pfn_to_mfn(ident_boundary_pfn);
+
+               if (!xen_update_mem_tables(remap_boundary_pfn, mfn)) {
+                       WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
+                               remap_pfn_iter + i, mfn);
+                       return 0;
+               }
+               remap_cnt++;
+
+               ident_boundary_pfn++;
+               remap_boundary_pfn++;
        }
 
-       if (start_pfn < nr_pages)
-               *released += xen_release_chunk(
-                       start_pfn, min(end_pfn, nr_pages));
+       /* Finish up the identity map */
+       if (ident_start_pfn_align >= ident_end_pfn_align) {
+               /*
+                 * In this case we have an identity range which does not span an
+                 * aligned block so everything needs to be identity mapped here.
+                 * If we didn't check this we might remap too many pages since
+                 * the align boundaries are not meaningful in this case.
+                */
+               ident_cnt += set_phys_range_identity(start_pfn,
+                       start_pfn + size);
+       } else {
+               /* Remapped above so check each end of the chunk */
+               if (start_pfn < ident_start_pfn_align)
+                       ident_cnt += set_phys_range_identity(start_pfn,
+                               ident_start_pfn_align);
+               if (start_pfn + size > ident_pfn_iter)
+                       ident_cnt += set_phys_range_identity(ident_pfn_iter,
+                               start_pfn + size);
+       }
 
-       *identity += set_phys_range_identity(start_pfn, end_pfn);
+       BUG_ON(ident_cnt != size);
+       BUG_ON(remap_cnt != size);
+
+       return size;
 }
 
-static unsigned long __init xen_set_identity_and_release(
-       const struct e820entry *list, size_t map_size, unsigned long nr_pages)
+/*
+ * This function takes a contiguous pfn range that needs to be identity mapped
+ * and:
+ *
+ *  1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
+ *  2) Calls the do_ function to actually do the mapping/remapping work.
+ *
+ * The goal is to not allocate additional memory but to remap the existing
+ * pages. In the case of an error the underlying memory is simply released back
+ * to Xen and not remapped.
+ */
+static unsigned long __init xen_set_identity_and_remap_chunk(
+        const struct e820entry *list, size_t map_size, unsigned long start_pfn,
+       unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
+       unsigned long *identity, unsigned long *remapped,
+       unsigned long *released)
+{
+       unsigned long pfn;
+       unsigned long i = 0;
+       unsigned long n = end_pfn - start_pfn;
+
+       while (i < n) {
+               unsigned long cur_pfn = start_pfn + i;
+               unsigned long left = n - i;
+               unsigned long size = left;
+               unsigned long remap_range_size;
+
+               /* Do not remap pages beyond the current allocation */
+               if (cur_pfn >= nr_pages) {
+                       /* Identity map remaining pages */
+                       *identity += set_phys_range_identity(cur_pfn,
+                               cur_pfn + size);
+                       break;
+               }
+               if (cur_pfn + size > nr_pages)
+                       size = nr_pages - cur_pfn;
+
+               remap_range_size = xen_find_pfn_range(list, map_size,
+                                                     &remap_pfn);
+               if (!remap_range_size) {
+                       pr_warning("Unable to find available pfn range, not remapping identity pages\n");
+                       xen_set_identity_and_release_chunk(cur_pfn,
+                               cur_pfn + left, nr_pages, identity, released);
+                       break;
+               }
+               /* Adjust size to fit in current e820 RAM region */
+               if (size > remap_range_size)
+                       size = remap_range_size;
+
+               if (!xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn)) {
+                       WARN(1, "Failed to remap 1:1 memory cur_pfn=%ld size=%ld remap_pfn=%ld\n",
+                               cur_pfn, size, remap_pfn);
+                       xen_set_identity_and_release_chunk(cur_pfn,
+                               cur_pfn + left, nr_pages, identity, released);
+                       break;
+               }
+
+               /* Update variables to reflect new mappings. */
+               i += size;
+               remap_pfn += size;
+               *identity += size;
+               *remapped += size;
+       }
+
+       /*
+        * If the PFNs are currently mapped, the VA mapping also needs
+        * to be updated to be 1:1.
+        */
+       for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
+               (void)HYPERVISOR_update_va_mapping(
+                       (unsigned long)__va(pfn << PAGE_SHIFT),
+                       mfn_pte(pfn, PAGE_KERNEL_IO), 0);
+
+       return remap_pfn;
+}
+
+static unsigned long __init xen_set_identity_and_remap(
+       const struct e820entry *list, size_t map_size, unsigned long nr_pages,
+       unsigned long *released)
 {
        phys_addr_t start = 0;
-       unsigned long released = 0;
        unsigned long identity = 0;
+       unsigned long remapped = 0;
+       unsigned long last_pfn = nr_pages;
        const struct e820entry *entry;
+       unsigned long num_released = 0;
        int i;
 
        /*
         * Combine non-RAM regions and gaps until a RAM region (or the
         * end of the map) is reached, then set the 1:1 map and
-        * release the pages (if available) in those non-RAM regions.
+        * remap the memory in those non-RAM regions.
         *
         * The combined non-RAM regions are rounded to a whole number
         * of pages so any partial pages are accessible via the 1:1
@@ -269,22 +491,24 @@ static unsigned long __init xen_set_identity_and_release(
                                end_pfn = PFN_UP(entry->addr);
 
                        if (start_pfn < end_pfn)
-                               xen_set_identity_and_release_chunk(
-                                       start_pfn, end_pfn, nr_pages,
-                                       &released, &identity);
-
+                               last_pfn = xen_set_identity_and_remap_chunk(
+                                               list, map_size, start_pfn,
+                                               end_pfn, nr_pages, last_pfn,
+                                               &identity, &remapped,
+                                               &num_released);
                        start = end;
                }
        }
 
-       if (released)
-               printk(KERN_INFO "Released %lu pages of unused memory\n", released);
-       if (identity)
-               printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
+       *released = num_released;
 
-       return released;
-}
+       pr_info("Set %ld page(s) to 1-1 mapping\n", identity);
+       pr_info("Remapped %ld page(s), last_pfn=%ld\n", remapped,
+               last_pfn);
+       pr_info("Released %ld page(s)\n", num_released);
 
+       return last_pfn;
+}
 static unsigned long __init xen_get_max_pages(void)
 {
        unsigned long max_pages = MAX_DOMAIN_PAGES;
@@ -347,7 +571,6 @@ char * __init xen_memory_setup(void)
        unsigned long max_pages;
        unsigned long last_pfn = 0;
        unsigned long extra_pages = 0;
-       unsigned long populated;
        int i;
        int op;
 
@@ -392,20 +615,11 @@ char * __init xen_memory_setup(void)
                extra_pages += max_pages - max_pfn;
 
        /*
-        * Set P2M for all non-RAM pages and E820 gaps to be identity
-        * type PFNs.  Any RAM pages that would be made inaccesible by
-        * this are first released.
+        * Set identity map on non-RAM pages and remap the underlying RAM.
         */
-       xen_released_pages = xen_set_identity_and_release(
-               map, memmap.nr_entries, max_pfn);
-
-       /*
-        * Populate back the non-RAM pages and E820 gaps that had been
-        * released. */
-       populated = xen_populate_chunk(map, memmap.nr_entries,
-                       max_pfn, &last_pfn, xen_released_pages);
+       last_pfn = xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
+                                             &xen_released_pages);
 
-       xen_released_pages -= populated;
        extra_pages += xen_released_pages;
 
        if (last_pfn > max_pfn) {
index 7005974..c670d75 100644 (file)
@@ -37,6 +37,7 @@
 #include <xen/hvc-console.h>
 #include "xen-ops.h"
 #include "mmu.h"
+#include "smp.h"
 
 cpumask_var_t xen_cpu_initialized_map;
 
@@ -99,10 +100,14 @@ static void cpu_bringup(void)
        wmb();                  /* make sure everything is out */
 }
 
-/* Note: cpu parameter is only relevant for PVH */
-static void cpu_bringup_and_idle(int cpu)
+/*
+ * Note: cpu parameter is only relevant for PVH. The reason for passing it
+ * is we can't do smp_processor_id until the percpu segments are loaded, for
+ * which we need the cpu number! So we pass it in rdi as first parameter.
+ */
+asmlinkage __visible void cpu_bringup_and_idle(int cpu)
 {
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_XEN_PVH
        if (xen_feature(XENFEAT_auto_translated_physmap) &&
            xen_feature(XENFEAT_supervisor_mode_kernel))
                xen_pvh_secondary_vcpu_init(cpu);
@@ -374,11 +379,10 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
        ctxt->user_regs.fs = __KERNEL_PERCPU;
        ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
 #endif
-       ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
-
        memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
 
        if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+               ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
                ctxt->flags = VGCF_IN_KERNEL;
                ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
                ctxt->user_regs.ds = __USER_DS;
@@ -413,15 +417,18 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
                                        (unsigned long)xen_failsafe_callback;
                ctxt->user_regs.cs = __KERNEL_CS;
                per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
-#ifdef CONFIG_X86_32
        }
-#else
-       } else
-               /* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with
-                * %rdi having the cpu number - which means are passing in
-                * as the first parameter the cpu. Subtle!
+#ifdef CONFIG_XEN_PVH
+       else {
+               /*
+                * The vcpu comes on kernel page tables which have the NX pte
+                * bit set. This means before DS/SS is touched, NX in
+                * EFER must be set. Hence the following assembly glue code.
                 */
+               ctxt->user_regs.eip = (unsigned long)xen_pvh_early_cpu_init;
                ctxt->user_regs.rdi = cpu;
+               ctxt->user_regs.rsi = true;  /* entry == true */
+       }
 #endif
        ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
        ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
index c7c2d89..963d62a 100644 (file)
@@ -8,4 +8,12 @@ extern void xen_send_IPI_allbutself(int vector);
 extern void xen_send_IPI_all(int vector);
 extern void xen_send_IPI_self(int vector);
 
+#ifdef CONFIG_XEN_PVH
+extern void xen_pvh_early_cpu_init(int cpu, bool entry);
+#else
+static inline void xen_pvh_early_cpu_init(int cpu, bool entry)
+{
+}
+#endif
+
 #endif
index 485b695..674b222 100644 (file)
@@ -47,6 +47,41 @@ ENTRY(startup_xen)
 
        __FINIT
 
+#ifdef CONFIG_XEN_PVH
+/*
+ * xen_pvh_early_cpu_init() - early PVH VCPU initialization
+ * @cpu:   this cpu number (%rdi)
+ * @entry: true if this is a secondary vcpu coming up on this entry
+ *         point, false if this is the boot CPU being initialized for
+ *         the first time (%rsi)
+ *
+ * Note: This is called as a function on the boot CPU, and is the entry point
+ *       on the secondary CPU.
+ */
+ENTRY(xen_pvh_early_cpu_init)
+       mov     %rsi, %r11
+
+       /* Gather features to see if NX implemented. */
+       mov     $0x80000001, %eax
+       cpuid
+       mov     %edx, %esi
+
+       mov     $MSR_EFER, %ecx
+       rdmsr
+       bts     $_EFER_SCE, %eax
+
+       bt      $20, %esi
+       jnc     1f              /* No NX, skip setting it */
+       bts     $_EFER_NX, %eax
+1:     wrmsr
+#ifdef CONFIG_SMP
+       cmp     $0, %r11b
+       jne     cpu_bringup_and_idle
+#endif
+       ret
+
+#endif /* CONFIG_XEN_PVH */
+
 .pushsection .text
        .balign PAGE_SIZE
 ENTRY(hypercall_page)
@@ -124,6 +159,7 @@ NEXT_HYPERCALL(arch_6)
        ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
                .quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
        ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
+       ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN,  .long 1)
        ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   _ASM_PTR __HYPERVISOR_VIRT_START)
        ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   _ASM_PTR 0)
 
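For reference on the constants in xen_pvh_early_cpu_init() above: the 'bt $20, %esi' tests the NX capability bit (bit 20 of EDX from CPUID leaf 0x80000001), while _EFER_SCE (bit 0, SYSCALL enable) and _EFER_NX (bit 11, no-execute enable) are bit positions in the EFER MSR that are set with 'bts' before the wrmsr, so EFER.NX is enabled before the vCPU touches the NX-marked kernel page tables it was started on.
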
index 3a8b810..0b13b1c 100644 (file)
@@ -907,22 +907,17 @@ static int connect_ring(struct backend_info *be)
        return 0;
 }
 
-
-/* ** Driver Registration ** */
-
-
 static const struct xenbus_device_id xen_blkbk_ids[] = {
        { "vbd" },
        { "" }
 };
 
-
-static DEFINE_XENBUS_DRIVER(xen_blkbk, ,
+static struct xenbus_driver xen_blkbk_driver = {
+       .ids  = xen_blkbk_ids,
        .probe = xen_blkbk_probe,
        .remove = xen_blkbk_remove,
        .otherend_changed = frontend_changed
-);
-
+};
 
 int xen_blkif_xenbus_init(void)
 {
index 5deb235..37af03e 100644 (file)
@@ -2055,13 +2055,14 @@ static const struct xenbus_device_id blkfront_ids[] = {
        { "" }
 };
 
-static DEFINE_XENBUS_DRIVER(blkfront, ,
+static struct xenbus_driver blkfront_driver = {
+       .ids  = blkfront_ids,
        .probe = blkfront_probe,
        .remove = blkfront_remove,
        .resume = blkfront_resume,
        .otherend_changed = blkback_changed,
        .is_ready = blkfront_is_ready,
-);
+};
 
 static int __init xlblk_init(void)
 {
index 2064b45..441b44e 100644 (file)
@@ -367,12 +367,13 @@ static const struct xenbus_device_id tpmfront_ids[] = {
 };
 MODULE_ALIAS("xen:vtpm");
 
-static DEFINE_XENBUS_DRIVER(tpmfront, ,
-               .probe = tpmfront_probe,
-               .remove = tpmfront_remove,
-               .resume = tpmfront_resume,
-               .otherend_changed = backend_changed,
-       );
+static struct xenbus_driver tpmfront_driver = {
+       .ids = tpmfront_ids,
+       .probe = tpmfront_probe,
+       .remove = tpmfront_remove,
+       .resume = tpmfront_resume,
+       .otherend_changed = backend_changed,
+};
 
 static int __init xen_tpmfront_init(void)
 {
index fbfdc10..1af28b0 100644 (file)
@@ -365,12 +365,13 @@ static const struct xenbus_device_id xenkbd_ids[] = {
        { "" }
 };
 
-static DEFINE_XENBUS_DRIVER(xenkbd, ,
+static struct xenbus_driver xenkbd_driver = {
+       .ids = xenkbd_ids,
        .probe = xenkbd_probe,
        .remove = xenkbd_remove,
        .resume = xenkbd_resume,
        .otherend_changed = xenkbd_backend_changed,
-);
+};
 
 static int __init xenkbd_init(void)
 {
index 9c47b89..8079c31 100644 (file)
@@ -937,22 +937,18 @@ static int read_xenbus_vif_flags(struct backend_info *be)
        return 0;
 }
 
-
-/* ** Driver Registration ** */
-
-
 static const struct xenbus_device_id netback_ids[] = {
        { "vif" },
        { "" }
 };
 
-
-static DEFINE_XENBUS_DRIVER(netback, ,
+static struct xenbus_driver netback_driver = {
+       .ids = netback_ids,
        .probe = netback_probe,
        .remove = netback_remove,
        .uevent = netback_uevent,
        .otherend_changed = frontend_changed,
-);
+};
 
 int xenvif_xenbus_init(void)
 {
index ca82f54..fa67144 100644 (file)
@@ -2300,12 +2300,6 @@ static void xennet_sysfs_delif(struct net_device *netdev)
 
 #endif /* CONFIG_SYSFS */
 
-static const struct xenbus_device_id netfront_ids[] = {
-       { "vif" },
-       { "" }
-};
-
-
 static int xennet_remove(struct xenbus_device *dev)
 {
        struct netfront_info *info = dev_get_drvdata(&dev->dev);
@@ -2338,12 +2332,18 @@ static int xennet_remove(struct xenbus_device *dev)
        return 0;
 }
 
-static DEFINE_XENBUS_DRIVER(netfront, ,
+static const struct xenbus_device_id netfront_ids[] = {
+       { "vif" },
+       { "" }
+};
+
+static struct xenbus_driver netfront_driver = {
+       .ids = netfront_ids,
        .probe = netfront_probe,
        .remove = xennet_remove,
        .resume = netfront_resume,
        .otherend_changed = netback_changed,
-);
+};
 
 static int __init netif_init(void)
 {
index 53df39a..116ca37 100644 (file)
@@ -1136,11 +1136,13 @@ static const struct xenbus_device_id xenpci_ids[] = {
        {""},
 };
 
-static DEFINE_XENBUS_DRIVER(xenpci, "pcifront",
+static struct xenbus_driver xenpci_driver = {
+       .name                   = "pcifront",
+       .ids                    = xenpci_ids,
        .probe                  = pcifront_xenbus_probe,
        .remove                 = pcifront_xenbus_remove,
        .otherend_changed       = pcifront_backend_changed,
-);
+};
 
 static int __init pcifront_init(void)
 {
index e85e64a..296619b 100644 (file)
@@ -587,6 +587,16 @@ config VMWARE_PVSCSI
          To compile this driver as a module, choose M here: the
          module will be called vmw_pvscsi.
 
+config XEN_SCSI_FRONTEND
+       tristate "XEN SCSI frontend driver"
+       depends on SCSI && XEN
+       select XEN_XENBUS_FRONTEND
+       help
+         The XEN SCSI frontend driver allows the kernel to access SCSI Devices
+         within another guest OS (usually Dom0).
+         Only needed if the kernel is running in a XEN guest and generic
+         SCSI access to a device is needed.
+
 config HYPERV_STORAGE
        tristate "Microsoft Hyper-V virtual storage driver"
        depends on SCSI && HYPERV
index 5f0d299..59f1ce6 100644 (file)
@@ -141,6 +141,7 @@ obj-$(CONFIG_SCSI_ESAS2R)   += esas2r/
 obj-$(CONFIG_SCSI_PMCRAID)     += pmcraid.o
 obj-$(CONFIG_SCSI_VIRTIO)      += virtio_scsi.o
 obj-$(CONFIG_VMWARE_PVSCSI)    += vmw_pvscsi.o
+obj-$(CONFIG_XEN_SCSI_FRONTEND)        += xen-scsifront.o
 obj-$(CONFIG_HYPERV_STORAGE)   += hv_storvsc.o
 
 obj-$(CONFIG_ARM)              += arm/
diff --git a/drivers/scsi/xen-scsifront.c b/drivers/scsi/xen-scsifront.c
new file mode 100644 (file)
index 0000000..34199d2
--- /dev/null
@@ -0,0 +1,1026 @@
+/*
+ * Xen SCSI frontend driver
+ *
+ * Copyright (c) 2008, FUJITSU Limited
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/wait.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/blkdev.h>
+#include <linux/pfn.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_host.h>
+
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+#include <xen/grant_table.h>
+#include <xen/events.h>
+#include <xen/page.h>
+
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/vscsiif.h>
+#include <xen/interface/io/protocols.h>
+
+#include <asm/xen/hypervisor.h>
+
+
+#define GRANT_INVALID_REF      0
+
+#define VSCSIFRONT_OP_ADD_LUN  1
+#define VSCSIFRONT_OP_DEL_LUN  2
+
+/* Tuning point. */
+#define VSCSIIF_DEFAULT_CMD_PER_LUN 10
+#define VSCSIIF_MAX_TARGET          64
+#define VSCSIIF_MAX_LUN             255
+
+#define VSCSIIF_RING_SIZE      __CONST_RING_SIZE(vscsiif, PAGE_SIZE)
+#define VSCSIIF_MAX_REQS       VSCSIIF_RING_SIZE
+
+#define vscsiif_grants_sg(_sg) (PFN_UP((_sg) *         \
+                               sizeof(struct scsiif_request_segment)))
+
+struct vscsifrnt_shadow {
+       /* command between backend and frontend */
+       unsigned char act;
+       uint16_t rqid;
+
+       unsigned int nr_grants;         /* number of grants in gref[] */
+       struct scsiif_request_segment *sg;      /* scatter/gather elements */
+
+       /* Do reset or abort function. */
+       wait_queue_head_t wq_reset;     /* reset work queue           */
+       int wait_reset;                 /* reset work queue condition */
+       int32_t rslt_reset;             /* reset response status:     */
+                                       /* SUCCESS or FAILED or:      */
+#define RSLT_RESET_WAITING     0
+#define RSLT_RESET_ERR         -1
+
+       /* Requested struct scsi_cmnd is stored from kernel. */
+       struct scsi_cmnd *sc;
+       int gref[vscsiif_grants_sg(SG_ALL) + SG_ALL];
+};
+
+struct vscsifrnt_info {
+       struct xenbus_device *dev;
+
+       struct Scsi_Host *host;
+       int host_active;
+
+       unsigned int evtchn;
+       unsigned int irq;
+
+       grant_ref_t ring_ref;
+       struct vscsiif_front_ring ring;
+       struct vscsiif_response ring_rsp;
+
+       spinlock_t shadow_lock;
+       DECLARE_BITMAP(shadow_free_bitmap, VSCSIIF_MAX_REQS);
+       struct vscsifrnt_shadow *shadow[VSCSIIF_MAX_REQS];
+
+       wait_queue_head_t wq_sync;
+       unsigned int wait_ring_available:1;
+
+       char dev_state_path[64];
+       struct task_struct *curr;
+};
+
+static DEFINE_MUTEX(scsifront_mutex);
+
+static void scsifront_wake_up(struct vscsifrnt_info *info)
+{
+       info->wait_ring_available = 0;
+       wake_up(&info->wq_sync);
+}
+
+static int scsifront_get_rqid(struct vscsifrnt_info *info)
+{
+       unsigned long flags;
+       int free;
+
+       spin_lock_irqsave(&info->shadow_lock, flags);
+
+       free = find_first_bit(info->shadow_free_bitmap, VSCSIIF_MAX_REQS);
+       __clear_bit(free, info->shadow_free_bitmap);
+
+       spin_unlock_irqrestore(&info->shadow_lock, flags);
+
+       return free;
+}
+
+static int _scsifront_put_rqid(struct vscsifrnt_info *info, uint32_t id)
+{
+       int empty = bitmap_empty(info->shadow_free_bitmap, VSCSIIF_MAX_REQS);
+
+       __set_bit(id, info->shadow_free_bitmap);
+       info->shadow[id] = NULL;
+
+       return empty || info->wait_ring_available;
+}
+
+static void scsifront_put_rqid(struct vscsifrnt_info *info, uint32_t id)
+{
+       unsigned long flags;
+       int kick;
+
+       spin_lock_irqsave(&info->shadow_lock, flags);
+       kick = _scsifront_put_rqid(info, id);
+       spin_unlock_irqrestore(&info->shadow_lock, flags);
+
+       if (kick)
+               scsifront_wake_up(info);
+}
+
+static struct vscsiif_request *scsifront_pre_req(struct vscsifrnt_info *info)
+{
+       struct vscsiif_front_ring *ring = &(info->ring);
+       struct vscsiif_request *ring_req;
+       uint32_t id;
+
+       id = scsifront_get_rqid(info);  /* use id in response */
+       if (id >= VSCSIIF_MAX_REQS)
+               return NULL;
+
+       ring_req = RING_GET_REQUEST(&(info->ring), ring->req_prod_pvt);
+
+       ring->req_prod_pvt++;
+
+       ring_req->rqid = (uint16_t)id;
+
+       return ring_req;
+}
+
+static void scsifront_do_request(struct vscsifrnt_info *info)
+{
+       struct vscsiif_front_ring *ring = &(info->ring);
+       int notify;
+
+       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(ring, notify);
+       if (notify)
+               notify_remote_via_irq(info->irq);
+}
+
+static void scsifront_gnttab_done(struct vscsifrnt_info *info, uint32_t id)
+{
+       struct vscsifrnt_shadow *s = info->shadow[id];
+       int i;
+
+       if (s->sc->sc_data_direction == DMA_NONE)
+               return;
+
+       for (i = 0; i < s->nr_grants; i++) {
+               if (unlikely(gnttab_query_foreign_access(s->gref[i]) != 0)) {
+                       shost_printk(KERN_ALERT, info->host, KBUILD_MODNAME
+                                    "grant still in use by backend\n");
+                       BUG();
+               }
+               gnttab_end_foreign_access(s->gref[i], 0, 0UL);
+       }
+
+       kfree(s->sg);
+}
+
+static void scsifront_cdb_cmd_done(struct vscsifrnt_info *info,
+                                  struct vscsiif_response *ring_rsp)
+{
+       struct scsi_cmnd *sc;
+       uint32_t id;
+       uint8_t sense_len;
+
+       id = ring_rsp->rqid;
+       sc = info->shadow[id]->sc;
+
+       BUG_ON(sc == NULL);
+
+       scsifront_gnttab_done(info, id);
+       scsifront_put_rqid(info, id);
+
+       sc->result = ring_rsp->rslt;
+       scsi_set_resid(sc, ring_rsp->residual_len);
+
+       sense_len = min_t(uint8_t, VSCSIIF_SENSE_BUFFERSIZE,
+                         ring_rsp->sense_len);
+
+       if (sense_len)
+               memcpy(sc->sense_buffer, ring_rsp->sense_buffer, sense_len);
+
+       sc->scsi_done(sc);
+}
+
+static void scsifront_sync_cmd_done(struct vscsifrnt_info *info,
+                                   struct vscsiif_response *ring_rsp)
+{
+       uint16_t id = ring_rsp->rqid;
+       unsigned long flags;
+       struct vscsifrnt_shadow *shadow = info->shadow[id];
+       int kick;
+
+       spin_lock_irqsave(&info->shadow_lock, flags);
+       shadow->wait_reset = 1;
+       switch (shadow->rslt_reset) {
+       case RSLT_RESET_WAITING:
+               shadow->rslt_reset = ring_rsp->rslt;
+               break;
+       case RSLT_RESET_ERR:
+               kick = _scsifront_put_rqid(info, id);
+               spin_unlock_irqrestore(&info->shadow_lock, flags);
+               kfree(shadow);
+               if (kick)
+                       scsifront_wake_up(info);
+               return;
+       default:
+               shost_printk(KERN_ERR, info->host, KBUILD_MODNAME
+                            "bad reset state %d, possibly leaking %u\n",
+                            shadow->rslt_reset, id);
+               break;
+       }
+       spin_unlock_irqrestore(&info->shadow_lock, flags);
+
+       wake_up(&shadow->wq_reset);
+}
+
+static int scsifront_cmd_done(struct vscsifrnt_info *info)
+{
+       struct vscsiif_response *ring_rsp;
+       RING_IDX i, rp;
+       int more_to_do = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(info->host->host_lock, flags);
+
+       rp = info->ring.sring->rsp_prod;
+       rmb();  /* ordering required respective to dom0 */
+       for (i = info->ring.rsp_cons; i != rp; i++) {
+
+               ring_rsp = RING_GET_RESPONSE(&info->ring, i);
+
+               if (WARN(ring_rsp->rqid >= VSCSIIF_MAX_REQS ||
+                        test_bit(ring_rsp->rqid, info->shadow_free_bitmap),
+                        "illegal rqid %u returned by backend!\n",
+                        ring_rsp->rqid))
+                       continue;
+
+               if (info->shadow[ring_rsp->rqid]->act == VSCSIIF_ACT_SCSI_CDB)
+                       scsifront_cdb_cmd_done(info, ring_rsp);
+               else
+                       scsifront_sync_cmd_done(info, ring_rsp);
+       }
+
+       info->ring.rsp_cons = i;
+
+       if (i != info->ring.req_prod_pvt)
+               RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
+       else
+               info->ring.sring->rsp_event = i + 1;
+
+       info->wait_ring_available = 0;
+
+       spin_unlock_irqrestore(info->host->host_lock, flags);
+
+       wake_up(&info->wq_sync);
+
+       return more_to_do;
+}
+
+static irqreturn_t scsifront_irq_fn(int irq, void *dev_id)
+{
+       struct vscsifrnt_info *info = dev_id;
+
+       while (scsifront_cmd_done(info))
+               /* Yield point for this unbounded loop. */
+               cond_resched();
+
+       return IRQ_HANDLED;
+}
+
+static int map_data_for_request(struct vscsifrnt_info *info,
+                               struct scsi_cmnd *sc,
+                               struct vscsiif_request *ring_req,
+                               struct vscsifrnt_shadow *shadow)
+{
+       grant_ref_t gref_head;
+       struct page *page;
+       int err, ref, ref_cnt = 0;
+       int grant_ro = (sc->sc_data_direction == DMA_TO_DEVICE);
+       unsigned int i, off, len, bytes;
+       unsigned int data_len = scsi_bufflen(sc);
+       unsigned int data_grants = 0, seg_grants = 0;
+       struct scatterlist *sg;
+       unsigned long mfn;
+       struct scsiif_request_segment *seg;
+
+       ring_req->nr_segments = 0;
+       if (sc->sc_data_direction == DMA_NONE || !data_len)
+               return 0;
+
+       scsi_for_each_sg(sc, sg, scsi_sg_count(sc), i)
+               data_grants += PFN_UP(sg->offset + sg->length);
+
+       if (data_grants > VSCSIIF_SG_TABLESIZE) {
+               if (data_grants > info->host->sg_tablesize) {
+                       shost_printk(KERN_ERR, info->host, KBUILD_MODNAME
+                            "Unable to map request_buffer for command!\n");
+                       return -E2BIG;
+               }
+               seg_grants = vscsiif_grants_sg(data_grants);
+               shadow->sg = kcalloc(data_grants,
+                       sizeof(struct scsiif_request_segment), GFP_ATOMIC);
+               if (!shadow->sg)
+                       return -ENOMEM;
+       }
+       seg = shadow->sg ? : ring_req->seg;
+
+       err = gnttab_alloc_grant_references(seg_grants + data_grants,
+                                           &gref_head);
+       if (err) {
+               kfree(shadow->sg);
+               shost_printk(KERN_ERR, info->host, KBUILD_MODNAME
+                            "gnttab_alloc_grant_references() error\n");
+               return -ENOMEM;
+       }
+
+       if (seg_grants) {
+               page = virt_to_page(seg);
+               off = (unsigned long)seg & ~PAGE_MASK;
+               len = sizeof(struct scsiif_request_segment) * data_grants;
+               while (len > 0) {
+                       bytes = min_t(unsigned int, len, PAGE_SIZE - off);
+
+                       ref = gnttab_claim_grant_reference(&gref_head);
+                       BUG_ON(ref == -ENOSPC);
+
+                       mfn = pfn_to_mfn(page_to_pfn(page));
+                       gnttab_grant_foreign_access_ref(ref,
+                               info->dev->otherend_id, mfn, 1);
+                       shadow->gref[ref_cnt] = ref;
+                       ring_req->seg[ref_cnt].gref   = ref;
+                       ring_req->seg[ref_cnt].offset = (uint16_t)off;
+                       ring_req->seg[ref_cnt].length = (uint16_t)bytes;
+
+                       page++;
+                       len -= bytes;
+                       off = 0;
+                       ref_cnt++;
+               }
+               BUG_ON(seg_grants < ref_cnt);
+               seg_grants = ref_cnt;
+       }
+
+       scsi_for_each_sg(sc, sg, scsi_sg_count(sc), i) {
+               page = sg_page(sg);
+               off = sg->offset;
+               len = sg->length;
+
+               while (len > 0 && data_len > 0) {
+                       /*
+                        * The SCSI layer may send a scatterlist that is
+                        * larger than the data_len it wants transferred for
+                        * certain I/O sizes.
+                        */
+                       bytes = min_t(unsigned int, len, PAGE_SIZE - off);
+                       bytes = min(bytes, data_len);
+
+                       ref = gnttab_claim_grant_reference(&gref_head);
+                       BUG_ON(ref == -ENOSPC);
+
+                       mfn = pfn_to_mfn(page_to_pfn(page));
+                       gnttab_grant_foreign_access_ref(ref,
+                               info->dev->otherend_id, mfn, grant_ro);
+
+                       shadow->gref[ref_cnt] = ref;
+                       seg->gref   = ref;
+                       seg->offset = (uint16_t)off;
+                       seg->length = (uint16_t)bytes;
+
+                       page++;
+                       seg++;
+                       len -= bytes;
+                       data_len -= bytes;
+                       off = 0;
+                       ref_cnt++;
+               }
+       }
+
+       if (seg_grants)
+               ring_req->nr_segments = VSCSIIF_SG_GRANT | seg_grants;
+       else
+               ring_req->nr_segments = (uint8_t)ref_cnt;
+       shadow->nr_grants = ref_cnt;
+
+       return 0;
+}
+
+static struct vscsiif_request *scsifront_command2ring(
+               struct vscsifrnt_info *info, struct scsi_cmnd *sc,
+               struct vscsifrnt_shadow *shadow)
+{
+       struct vscsiif_request *ring_req;
+
+       memset(shadow, 0, sizeof(*shadow));
+
+       ring_req = scsifront_pre_req(info);
+       if (!ring_req)
+               return NULL;
+
+       info->shadow[ring_req->rqid] = shadow;
+       shadow->rqid = ring_req->rqid;
+
+       ring_req->id      = sc->device->id;
+       ring_req->lun     = sc->device->lun;
+       ring_req->channel = sc->device->channel;
+       ring_req->cmd_len = sc->cmd_len;
+
+       BUG_ON(sc->cmd_len > VSCSIIF_MAX_COMMAND_SIZE);
+
+       memcpy(ring_req->cmnd, sc->cmnd, sc->cmd_len);
+
+       ring_req->sc_data_direction   = (uint8_t)sc->sc_data_direction;
+       ring_req->timeout_per_command = sc->request->timeout / HZ;
+
+       return ring_req;
+}
+
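+/*
+ * Normal I/O path: claim a free ring slot, fill the request from the
+ * scsi_cmnd, grant the data buffers and kick the backend.  If the ring is
+ * full or the buffers cannot be granted due to memory pressure, the command
+ * is pushed back to the midlayer with SCSI_MLQUEUE_HOST_BUSY.
+ */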
+static int scsifront_queuecommand(struct Scsi_Host *shost,
+                                 struct scsi_cmnd *sc)
+{
+       struct vscsifrnt_info *info = shost_priv(shost);
+       struct vscsiif_request *ring_req;
+       struct vscsifrnt_shadow *shadow = scsi_cmd_priv(sc);
+       unsigned long flags;
+       int err;
+       uint16_t rqid;
+
+       spin_lock_irqsave(shost->host_lock, flags);
+       if (RING_FULL(&info->ring))
+               goto busy;
+
+       ring_req = scsifront_command2ring(info, sc, shadow);
+       if (!ring_req)
+               goto busy;
+
+       sc->result = 0;
+
+       rqid = ring_req->rqid;
+       ring_req->act = VSCSIIF_ACT_SCSI_CDB;
+
+       shadow->sc  = sc;
+       shadow->act = VSCSIIF_ACT_SCSI_CDB;
+
+       err = map_data_for_request(info, sc, ring_req, shadow);
+       if (err < 0) {
+               pr_debug("%s: err %d\n", __func__, err);
+               scsifront_put_rqid(info, rqid);
+               spin_unlock_irqrestore(shost->host_lock, flags);
+               if (err == -ENOMEM)
+                       return SCSI_MLQUEUE_HOST_BUSY;
+               sc->result = DID_ERROR << 16;
+               sc->scsi_done(sc);
+               return 0;
+       }
+
+       scsifront_do_request(info);
+       spin_unlock_irqrestore(shost->host_lock, flags);
+
+       return 0;
+
+busy:
+       spin_unlock_irqrestore(shost->host_lock, flags);
+       pr_debug("%s: busy\n", __func__);
+       return SCSI_MLQUEUE_HOST_BUSY;
+}
+
+/*
+ * Any exception handling (reset or abort) must be forwarded to the backend.
+ * We have to wait until an answer is returned. This answer contains the
+ * result to be returned to the requestor.
+ */
+static int scsifront_action_handler(struct scsi_cmnd *sc, uint8_t act)
+{
+       struct Scsi_Host *host = sc->device->host;
+       struct vscsifrnt_info *info = shost_priv(host);
+       struct vscsifrnt_shadow *shadow, *s = scsi_cmd_priv(sc);
+       struct vscsiif_request *ring_req;
+       int err = 0;
+
+       shadow = kmalloc(sizeof(*shadow), GFP_NOIO);
+       if (!shadow)
+               return FAILED;
+
+       spin_lock_irq(host->host_lock);
+
+       for (;;) {
+               if (!RING_FULL(&info->ring)) {
+                       ring_req = scsifront_command2ring(info, sc, shadow);
+                       if (ring_req)
+                               break;
+               }
+               if (err) {
+                       spin_unlock_irq(host->host_lock);
+                       kfree(shadow);
+                       return FAILED;
+               }
+               info->wait_ring_available = 1;
+               spin_unlock_irq(host->host_lock);
+               err = wait_event_interruptible(info->wq_sync,
+                                              !info->wait_ring_available);
+               spin_lock_irq(host->host_lock);
+       }
+
+       ring_req->act = act;
+       ring_req->ref_rqid = s->rqid;
+
+       shadow->act = act;
+       shadow->rslt_reset = RSLT_RESET_WAITING;
+       init_waitqueue_head(&shadow->wq_reset);
+
+       ring_req->nr_segments = 0;
+
+       scsifront_do_request(info);
+
+       spin_unlock_irq(host->host_lock);
+       err = wait_event_interruptible(shadow->wq_reset, shadow->wait_reset);
+       spin_lock_irq(host->host_lock);
+
+       if (!err) {
+               err = shadow->rslt_reset;
+               scsifront_put_rqid(info, shadow->rqid);
+               kfree(shadow);
+       } else {
+               spin_lock(&info->shadow_lock);
+               shadow->rslt_reset = RSLT_RESET_ERR;
+               spin_unlock(&info->shadow_lock);
+               err = FAILED;
+       }
+
+       spin_unlock_irq(host->host_lock);
+       return err;
+}
+
+static int scsifront_eh_abort_handler(struct scsi_cmnd *sc)
+{
+       pr_debug("%s\n", __func__);
+       return scsifront_action_handler(sc, VSCSIIF_ACT_SCSI_ABORT);
+}
+
+static int scsifront_dev_reset_handler(struct scsi_cmnd *sc)
+{
+       pr_debug("%s\n", __func__);
+       return scsifront_action_handler(sc, VSCSIIF_ACT_SCSI_RESET);
+}
+
+static int scsifront_sdev_configure(struct scsi_device *sdev)
+{
+       struct vscsifrnt_info *info = shost_priv(sdev->host);
+
+       if (info && current == info->curr)
+               xenbus_printf(XBT_NIL, info->dev->nodename,
+                             info->dev_state_path, "%d", XenbusStateConnected);
+
+       return 0;
+}
+
+static void scsifront_sdev_destroy(struct scsi_device *sdev)
+{
+       struct vscsifrnt_info *info = shost_priv(sdev->host);
+
+       if (info && current == info->curr)
+               xenbus_printf(XBT_NIL, info->dev->nodename,
+                             info->dev_state_path, "%d", XenbusStateClosed);
+}
+
+static struct scsi_host_template scsifront_sht = {
+       .module                 = THIS_MODULE,
+       .name                   = "Xen SCSI frontend driver",
+       .queuecommand           = scsifront_queuecommand,
+       .eh_abort_handler       = scsifront_eh_abort_handler,
+       .eh_device_reset_handler = scsifront_dev_reset_handler,
+       .slave_configure        = scsifront_sdev_configure,
+       .slave_destroy          = scsifront_sdev_destroy,
+       .cmd_per_lun            = VSCSIIF_DEFAULT_CMD_PER_LUN,
+       .can_queue              = VSCSIIF_MAX_REQS,
+       .this_id                = -1,
+       .cmd_size               = sizeof(struct vscsifrnt_shadow),
+       .sg_tablesize           = VSCSIIF_SG_TABLESIZE,
+       .use_clustering         = DISABLE_CLUSTERING,
+       .proc_name              = "scsifront",
+};
+
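+/*
+ * Allocate and grant the shared ring, allocate an event channel and bind it
+ * to a threaded interrupt handler.  The grant reference and event channel
+ * are published to xenstore by scsifront_init_ring().
+ */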
+static int scsifront_alloc_ring(struct vscsifrnt_info *info)
+{
+       struct xenbus_device *dev = info->dev;
+       struct vscsiif_sring *sring;
+       int err = -ENOMEM;
+
+       /***** Frontend to Backend ring start *****/
+       sring = (struct vscsiif_sring *)__get_free_page(GFP_KERNEL);
+       if (!sring) {
+               xenbus_dev_fatal(dev, err,
+                       "fail to allocate shared ring (Front to Back)");
+               return err;
+       }
+       SHARED_RING_INIT(sring);
+       FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
+
+       err = xenbus_grant_ring(dev, virt_to_mfn(sring));
+       if (err < 0) {
+               free_page((unsigned long)sring);
+               xenbus_dev_fatal(dev, err,
+                       "fail to grant shared ring (Front to Back)");
+               return err;
+       }
+       info->ring_ref = err;
+
+       err = xenbus_alloc_evtchn(dev, &info->evtchn);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "xenbus_alloc_evtchn");
+               goto free_gnttab;
+       }
+
+       err = bind_evtchn_to_irq(info->evtchn);
+       if (err <= 0) {
+               xenbus_dev_fatal(dev, err, "bind_evtchn_to_irq");
+               goto free_gnttab;
+       }
+
+       info->irq = err;
+
+       err = request_threaded_irq(info->irq, NULL, scsifront_irq_fn,
+                                  IRQF_ONESHOT, "scsifront", info);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "request_threaded_irq");
+               goto free_irq;
+       }
+
+       return 0;
+
+/* free resource */
+free_irq:
+       unbind_from_irqhandler(info->irq, info);
+free_gnttab:
+       gnttab_end_foreign_access(info->ring_ref, 0,
+                                 (unsigned long)info->ring.sring);
+
+       return err;
+}
+
+static int scsifront_init_ring(struct vscsifrnt_info *info)
+{
+       struct xenbus_device *dev = info->dev;
+       struct xenbus_transaction xbt;
+       int err;
+
+       pr_debug("%s\n", __func__);
+
+       err = scsifront_alloc_ring(info);
+       if (err)
+               return err;
+       pr_debug("%s: %u %u\n", __func__, info->ring_ref, info->evtchn);
+
+again:
+       err = xenbus_transaction_start(&xbt);
+       if (err)
+               xenbus_dev_fatal(dev, err, "starting transaction");
+
+       err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%u",
+                           info->ring_ref);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "%s", "writing ring-ref");
+               goto fail;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
+                           info->evtchn);
+
+       if (err) {
+               xenbus_dev_fatal(dev, err, "%s", "writing event-channel");
+               goto fail;
+       }
+
+       err = xenbus_transaction_end(xbt, 0);
+       if (err) {
+               if (err == -EAGAIN)
+                       goto again;
+               xenbus_dev_fatal(dev, err, "completing transaction");
+               goto free_sring;
+       }
+
+       return 0;
+
+fail:
+       xenbus_transaction_end(xbt, 1);
+free_sring:
+       unbind_from_irqhandler(info->irq, info);
+       gnttab_end_foreign_access(info->ring_ref, 0,
+                                 (unsigned long)info->ring.sring);
+
+       return err;
+}
+
+
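+/*
+ * Probe: allocate the SCSI host, set up and publish the shared ring, then
+ * register the host with the midlayer and switch the device to
+ * XenbusStateInitialised so the backend can connect.
+ */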
+static int scsifront_probe(struct xenbus_device *dev,
+                          const struct xenbus_device_id *id)
+{
+       struct vscsifrnt_info *info;
+       struct Scsi_Host *host;
+       int err = -ENOMEM;
+       char name[TASK_COMM_LEN];
+
+       host = scsi_host_alloc(&scsifront_sht, sizeof(*info));
+       if (!host) {
+               xenbus_dev_fatal(dev, err, "fail to allocate scsi host");
+               return err;
+       }
+       info = (struct vscsifrnt_info *)host->hostdata;
+
+       dev_set_drvdata(&dev->dev, info);
+       info->dev = dev;
+
+       bitmap_fill(info->shadow_free_bitmap, VSCSIIF_MAX_REQS);
+
+       err = scsifront_init_ring(info);
+       if (err) {
+               scsi_host_put(host);
+               return err;
+       }
+
+       init_waitqueue_head(&info->wq_sync);
+       spin_lock_init(&info->shadow_lock);
+
+       snprintf(name, TASK_COMM_LEN, "vscsiif.%d", host->host_no);
+
+       host->max_id      = VSCSIIF_MAX_TARGET;
+       host->max_channel = 0;
+       host->max_lun     = VSCSIIF_MAX_LUN;
+       host->max_sectors = (host->sg_tablesize - 1) * PAGE_SIZE / 512;
+       host->max_cmd_len = VSCSIIF_MAX_COMMAND_SIZE;
+
+       err = scsi_add_host(host, &dev->dev);
+       if (err) {
+               dev_err(&dev->dev, "fail to add scsi host %d\n", err);
+               goto free_sring;
+       }
+       info->host = host;
+       info->host_active = 1;
+
+       xenbus_switch_state(dev, XenbusStateInitialised);
+
+       return 0;
+
+free_sring:
+       unbind_from_irqhandler(info->irq, info);
+       gnttab_end_foreign_access(info->ring_ref, 0,
+                                 (unsigned long)info->ring.sring);
+       scsi_host_put(host);
+       return err;
+}
+
+static int scsifront_remove(struct xenbus_device *dev)
+{
+       struct vscsifrnt_info *info = dev_get_drvdata(&dev->dev);
+
+       pr_debug("%s: %s removed\n", __func__, dev->nodename);
+
+       mutex_lock(&scsifront_mutex);
+       if (info->host_active) {
+               /* Scsi_host not yet removed */
+               scsi_remove_host(info->host);
+               info->host_active = 0;
+       }
+       mutex_unlock(&scsifront_mutex);
+
+       gnttab_end_foreign_access(info->ring_ref, 0,
+                                 (unsigned long)info->ring.sring);
+       unbind_from_irqhandler(info->irq, info);
+
+       scsi_host_put(info->host);
+
+       return 0;
+}
+
+static void scsifront_disconnect(struct vscsifrnt_info *info)
+{
+       struct xenbus_device *dev = info->dev;
+       struct Scsi_Host *host = info->host;
+
+       pr_debug("%s: %s disconnect\n", __func__, dev->nodename);
+
+       /*
+        * When this function is executed, all devices of the
+        * frontend have already been deleted.
+        * Therefore, there is no need to block I/O before remove_host.
+        */
+
+       mutex_lock(&scsifront_mutex);
+       if (info->host_active) {
+               scsi_remove_host(host);
+               info->host_active = 0;
+       }
+       mutex_unlock(&scsifront_mutex);
+
+       xenbus_frontend_closed(dev);
+}
+
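+/*
+ * Scan the backend's vscsi-devs directory in xenstore and add or remove
+ * LUNs according to their advertised state.  info->curr marks the task
+ * performing the scan so that slave_configure/slave_destroy know they are
+ * allowed to write the per-device state node.
+ */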
+static void scsifront_do_lun_hotplug(struct vscsifrnt_info *info, int op)
+{
+       struct xenbus_device *dev = info->dev;
+       int i, err = 0;
+       char str[64];
+       char **dir;
+       unsigned int dir_n = 0;
+       unsigned int device_state;
+       unsigned int hst, chn, tgt, lun;
+       struct scsi_device *sdev;
+
+       dir = xenbus_directory(XBT_NIL, dev->otherend, "vscsi-devs", &dir_n);
+       if (IS_ERR(dir))
+               return;
+
+       /* mark current task as the one allowed to modify device states */
+       BUG_ON(info->curr);
+       info->curr = current;
+
+       for (i = 0; i < dir_n; i++) {
+               /* read status */
+               snprintf(str, sizeof(str), "vscsi-devs/%s/state", dir[i]);
+               err = xenbus_scanf(XBT_NIL, dev->otherend, str, "%u",
+                                  &device_state);
+               if (XENBUS_EXIST_ERR(err))
+                       continue;
+
+               /* virtual SCSI device */
+               snprintf(str, sizeof(str), "vscsi-devs/%s/v-dev", dir[i]);
+               err = xenbus_scanf(XBT_NIL, dev->otherend, str,
+                                  "%u:%u:%u:%u", &hst, &chn, &tgt, &lun);
+               if (XENBUS_EXIST_ERR(err))
+                       continue;
+
+               /*
+                * Front device state path, used in slave_configure called
+                * on successful scsi_add_device, and in slave_destroy called
+                * on remove of a device.
+                */
+               snprintf(info->dev_state_path, sizeof(info->dev_state_path),
+                        "vscsi-devs/%s/state", dir[i]);
+
+               switch (op) {
+               case VSCSIFRONT_OP_ADD_LUN:
+                       if (device_state != XenbusStateInitialised)
+                               break;
+
+                       if (scsi_add_device(info->host, chn, tgt, lun)) {
+                               dev_err(&dev->dev, "scsi_add_device\n");
+                               xenbus_printf(XBT_NIL, dev->nodename,
+                                             info->dev_state_path,
+                                             "%d", XenbusStateClosed);
+                       }
+                       break;
+               case VSCSIFRONT_OP_DEL_LUN:
+                       if (device_state != XenbusStateClosing)
+                               break;
+
+                       sdev = scsi_device_lookup(info->host, chn, tgt, lun);
+                       if (sdev) {
+                               scsi_remove_device(sdev);
+                               scsi_device_put(sdev);
+                       }
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       info->curr = NULL;
+
+       kfree(dir);
+}
+
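+/*
+ * Pick up the negotiated "feature-sg-grant" value from the backend and
+ * scale sg_tablesize and max_sectors accordingly, bounded by SG_ALL and by
+ * the number of segment entries fitting into VSCSIIF_SG_TABLESIZE pages.
+ */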
+static void scsifront_read_backend_params(struct xenbus_device *dev,
+                                         struct vscsifrnt_info *info)
+{
+       unsigned int sg_grant;
+       int ret;
+       struct Scsi_Host *host = info->host;
+
+       ret = xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg-grant", "%u",
+                          &sg_grant);
+       if (ret == 1 && sg_grant) {
+               sg_grant = min_t(unsigned int, sg_grant, SG_ALL);
+               sg_grant = max_t(unsigned int, sg_grant, VSCSIIF_SG_TABLESIZE);
+               host->sg_tablesize = min_t(unsigned int, sg_grant,
+                       VSCSIIF_SG_TABLESIZE * PAGE_SIZE /
+                       sizeof(struct scsiif_request_segment));
+               host->max_sectors = (host->sg_tablesize - 1) * PAGE_SIZE / 512;
+       }
+       dev_info(&dev->dev, "using up to %d SG entries\n", host->sg_tablesize);
+}
+
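+/*
+ * React to backend state changes: read the backend parameters and scan for
+ * LUNs on connect, handle LUN hotplug on reconfiguration, and tear the host
+ * down when the backend closes.
+ */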
+static void scsifront_backend_changed(struct xenbus_device *dev,
+                                     enum xenbus_state backend_state)
+{
+       struct vscsifrnt_info *info = dev_get_drvdata(&dev->dev);
+
+       pr_debug("%s: %p %u %u\n", __func__, dev, dev->state, backend_state);
+
+       switch (backend_state) {
+       case XenbusStateUnknown:
+       case XenbusStateInitialising:
+       case XenbusStateInitWait:
+       case XenbusStateInitialised:
+               break;
+
+       case XenbusStateConnected:
+               scsifront_read_backend_params(dev, info);
+               if (xenbus_read_driver_state(dev->nodename) ==
+                   XenbusStateInitialised)
+                       scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_ADD_LUN);
+
+               if (dev->state != XenbusStateConnected)
+                       xenbus_switch_state(dev, XenbusStateConnected);
+               break;
+
+       case XenbusStateClosed:
+               if (dev->state == XenbusStateClosed)
+                       break;
+               /* Missed the backend's Closing state -- fallthrough */
+       case XenbusStateClosing:
+               scsifront_disconnect(info);
+               break;
+
+       case XenbusStateReconfiguring:
+               scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_DEL_LUN);
+               xenbus_switch_state(dev, XenbusStateReconfiguring);
+               break;
+
+       case XenbusStateReconfigured:
+               scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_ADD_LUN);
+               xenbus_switch_state(dev, XenbusStateConnected);
+               break;
+       }
+}
+
+static const struct xenbus_device_id scsifront_ids[] = {
+       { "vscsi" },
+       { "" }
+};
+
+static struct xenbus_driver scsifront_driver = {
+       .ids                    = scsifront_ids,
+       .probe                  = scsifront_probe,
+       .remove                 = scsifront_remove,
+       .otherend_changed       = scsifront_backend_changed,
+};
+
+static int __init scsifront_init(void)
+{
+       if (!xen_domain())
+               return -ENODEV;
+
+       return xenbus_register_frontend(&scsifront_driver);
+}
+module_init(scsifront_init);
+
+static void __exit scsifront_exit(void)
+{
+       xenbus_unregister_driver(&scsifront_driver);
+}
+module_exit(scsifront_exit);
+
+MODULE_DESCRIPTION("Xen SCSI frontend driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("xen:vscsi");
+MODULE_AUTHOR("Juergen Gross <jgross@suse.com>");
index 2967f03..f1e5742 100644 (file)
@@ -347,8 +347,6 @@ static int xen_console_remove(struct xencons_info *info)
 }
 
 #ifdef CONFIG_HVC_XEN_FRONTEND
-static struct xenbus_driver xencons_driver;
-
 static int xencons_remove(struct xenbus_device *dev)
 {
        return xen_console_remove(dev_get_drvdata(&dev->dev));
@@ -499,13 +497,14 @@ static const struct xenbus_device_id xencons_ids[] = {
        { "" }
 };
 
-
-static DEFINE_XENBUS_DRIVER(xencons, "xenconsole",
+static struct xenbus_driver xencons_driver = {
+       .name = "xenconsole",
+       .ids = xencons_ids,
        .probe = xencons_probe,
        .remove = xencons_remove,
        .resume = xencons_resume,
        .otherend_changed = xencons_backend_changed,
-);
+};
 #endif /* CONFIG_HVC_XEN_FRONTEND */
 
 static int __init xen_hvc_init(void)
index 901014b..09dc447 100644 (file)
@@ -684,12 +684,13 @@ static const struct xenbus_device_id xenfb_ids[] = {
        { "" }
 };
 
-static DEFINE_XENBUS_DRIVER(xenfb, ,
+static struct xenbus_driver xenfb_driver = {
+       .ids = xenfb_ids,
        .probe = xenfb_probe,
        .remove = xenfb_remove,
        .resume = xenfb_resume,
        .otherend_changed = xenfb_backend_changed,
-);
+};
 
 static int __init xenfb_init(void)
 {
index 8bc0183..b812462 100644 (file)
@@ -172,6 +172,15 @@ config XEN_PCIDEV_BACKEND
 
          If in doubt, say m.
 
+config XEN_SCSI_BACKEND
+       tristate "XEN SCSI backend driver"
+       depends on XEN && XEN_BACKEND && TARGET_CORE
+       help
+         The SCSI backend driver allows the kernel to export its SCSI Devices
+         to other guests via a high-performance shared-memory interface.
+         Only needed for systems running as XEN driver domains (e.g. Dom0) and
+         if guests need generic access to SCSI devices.
+
 config XEN_PRIVCMD
        tristate
        depends on XEN
index 84044b5..2140398 100644 (file)
@@ -36,6 +36,7 @@ obj-$(CONFIG_XEN_ACPI_HOTPLUG_MEMORY) += xen-acpi-memhotplug.o
 obj-$(CONFIG_XEN_ACPI_HOTPLUG_CPU)     += xen-acpi-cpuhotplug.o
 obj-$(CONFIG_XEN_ACPI_PROCESSOR)       += xen-acpi-processor.o
 obj-$(CONFIG_XEN_EFI)                  += efi.o
+obj-$(CONFIG_XEN_SCSI_BACKEND)         += xen-scsiback.o
 xen-evtchn-y                           := evtchn.o
 xen-gntdev-y                           := gntdev.o
 xen-gntalloc-y                         := gntalloc.o
index 31f618a..1f850c9 100644 (file)
@@ -27,6 +27,8 @@
 #include <xen/interface/platform.h>
 #include <xen/xen.h>
 
+#include <asm/page.h>
+
 #include <asm/xen/hypercall.h>
 
 #define INIT_EFI_OP(name) \
index 5b5c5ff..b4bca2d 100644 (file)
@@ -900,8 +900,8 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
        return irq;
 }
 
-static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
-                                         unsigned int remote_port)
+int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
+                                  unsigned int remote_port)
 {
        struct evtchn_bind_interdomain bind_interdomain;
        int err;
@@ -914,6 +914,7 @@ static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
 
        return err ? : bind_evtchn_to_irq(bind_interdomain.local_port);
 }
+EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq);
 
 static int find_virq(unsigned int virq, unsigned int cpu)
 {
index c254ae0..7786291 100644 (file)
@@ -592,7 +592,7 @@ static int grow_gnttab_list(unsigned int more_frames)
        return 0;
 
 grow_nomem:
-       for ( ; i >= nr_glist_frames; i--)
+       while (i-- > nr_glist_frames)
                free_page((unsigned long) gnttab_list[i]);
        return -ENOMEM;
 }
index c214daa..ad8d30c 100644 (file)
@@ -719,11 +719,13 @@ static const struct xenbus_device_id xen_pcibk_ids[] = {
        {""},
 };
 
-static DEFINE_XENBUS_DRIVER(xen_pcibk, DRV_NAME,
+static struct xenbus_driver xen_pcibk_driver = {
+       .name                   = DRV_NAME,
+       .ids                    = xen_pcibk_ids,
        .probe                  = xen_pcibk_xenbus_probe,
        .remove                 = xen_pcibk_xenbus_remove,
        .otherend_changed       = xen_pcibk_frontend_changed,
-);
+};
 
 const struct xen_pcibk_backend *__read_mostly xen_pcibk_backend;
 
diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c
new file mode 100644 (file)
index 0000000..3e32146
--- /dev/null
@@ -0,0 +1,2126 @@
+/*
+ * Xen SCSI backend driver
+ *
+ * Copyright (c) 2008, FUJITSU Limited
+ *
+ * Based on the blkback driver code.
+ * Adaptation to the kernel target core infrastructure taken from vhost/scsi.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdarg.h>
+
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/gfp.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/configfs.h>
+
+#include <generated/utsrelease.h>
+
+#include <scsi/scsi_dbg.h>
+#include <scsi/scsi_eh.h>
+#include <scsi/scsi_tcq.h>
+
+#include <target/target_core_base.h>
+#include <target/target_core_fabric.h>
+#include <target/target_core_configfs.h>
+#include <target/target_core_fabric_configfs.h>
+
+#include <asm/hypervisor.h>
+
+#include <xen/xen.h>
+#include <xen/balloon.h>
+#include <xen/events.h>
+#include <xen/xenbus.h>
+#include <xen/grant_table.h>
+#include <xen/page.h>
+
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/vscsiif.h>
+
+#define DPRINTK(_f, _a...)                     \
+       pr_debug("(file=%s, line=%d) " _f, __FILE__ , __LINE__ , ## _a)
+
+#define VSCSI_VERSION  "v0.1"
+#define VSCSI_NAMELEN  32
+
+struct ids_tuple {
+       unsigned int hst;               /* host    */
+       unsigned int chn;               /* channel */
+       unsigned int tgt;               /* target  */
+       unsigned int lun;               /* LUN     */
+};
+
+struct v2p_entry {
+       struct ids_tuple v;             /* translate from */
+       struct scsiback_tpg *tpg;       /* translate to   */
+       unsigned int lun;
+       struct kref kref;
+       struct list_head l;
+};
+
+struct vscsibk_info {
+       struct xenbus_device *dev;
+
+       domid_t domid;
+       unsigned int irq;
+
+       struct vscsiif_back_ring ring;
+       int ring_error;
+
+       spinlock_t ring_lock;
+       atomic_t nr_unreplied_reqs;
+
+       spinlock_t v2p_lock;
+       struct list_head v2p_entry_lists;
+
+       wait_queue_head_t waiting_to_free;
+};
+
+/* theoretical maximum of grants for one request */
+#define VSCSI_MAX_GRANTS       (SG_ALL + VSCSIIF_SG_TABLESIZE)
+
+/*
+ * VSCSI_GRANT_BATCH is the maximum number of grants to be processed in one
+ * call to map/unmap grants. Don't choose it too large, as there are arrays
+ * with VSCSI_GRANT_BATCH elements allocated on the stack.
+ */
+#define VSCSI_GRANT_BATCH      16
+
+struct vscsibk_pend {
+       uint16_t rqid;
+
+       uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE];
+       uint8_t cmd_len;
+
+       uint8_t sc_data_direction;
+       uint16_t n_sg;          /* real length of SG list */
+       uint16_t n_grants;      /* SG pages and potentially SG list */
+       uint32_t data_len;
+       uint32_t result;
+
+       struct vscsibk_info *info;
+       struct v2p_entry *v2p;
+       struct scatterlist *sgl;
+
+       uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE];
+
+       grant_handle_t grant_handles[VSCSI_MAX_GRANTS];
+       struct page *pages[VSCSI_MAX_GRANTS];
+
+       struct se_cmd se_cmd;
+};
+
+struct scsiback_tmr {
+       atomic_t tmr_complete;
+       wait_queue_head_t tmr_wait;
+};
+
+struct scsiback_nexus {
+       /* Pointer to TCM session for I_T Nexus */
+       struct se_session *tvn_se_sess;
+};
+
+struct scsiback_tport {
+       /* SCSI protocol the tport is providing */
+       u8 tport_proto_id;
+       /* Binary World Wide unique Port Name for pvscsi Target port */
+       u64 tport_wwpn;
+       /* ASCII formatted WWPN for pvscsi Target port */
+       char tport_name[VSCSI_NAMELEN];
+       /* Returned by scsiback_make_tport() */
+       struct se_wwn tport_wwn;
+};
+
+struct scsiback_tpg {
+       /* scsiback port target portal group tag for TCM */
+       u16 tport_tpgt;
+       /* track number of TPG Port/Lun Links wrt explicit I_T Nexus shutdown */
+       int tv_tpg_port_count;
+       /* xen-pvscsi references to tpg_nexus, protected by tv_tpg_mutex */
+       int tv_tpg_fe_count;
+       /* list for scsiback_list */
+       struct list_head tv_tpg_list;
+       /* Used to protect access for tpg_nexus */
+       struct mutex tv_tpg_mutex;
+       /* Pointer to the TCM pvscsi I_T Nexus for this TPG endpoint */
+       struct scsiback_nexus *tpg_nexus;
+       /* Pointer back to scsiback_tport */
+       struct scsiback_tport *tport;
+       /* Returned by scsiback_make_tpg() */
+       struct se_portal_group se_tpg;
+       /* alias used in xenstore */
+       char param_alias[VSCSI_NAMELEN];
+       /* list of info structures related to this target portal group */
+       struct list_head info_list;
+};
+
+#define SCSIBACK_INVALID_HANDLE (~0)
+
+static bool log_print_stat;
+module_param(log_print_stat, bool, 0644);
+
+static int scsiback_max_buffer_pages = 1024;
+module_param_named(max_buffer_pages, scsiback_max_buffer_pages, int, 0644);
+MODULE_PARM_DESC(max_buffer_pages,
+"Maximum number of free pages to keep in backend buffer");
+
+static struct kmem_cache *scsiback_cachep;
+static DEFINE_SPINLOCK(free_pages_lock);
+static int free_pages_num;
+static LIST_HEAD(scsiback_free_pages);
+
+/* Global spinlock to protect scsiback TPG list */
+static DEFINE_MUTEX(scsiback_mutex);
+static LIST_HEAD(scsiback_list);
+
+/* Local pointer to allocated TCM configfs fabric module */
+static struct target_fabric_configfs *scsiback_fabric_configfs;
+
+static void scsiback_get(struct vscsibk_info *info)
+{
+       atomic_inc(&info->nr_unreplied_reqs);
+}
+
+static void scsiback_put(struct vscsibk_info *info)
+{
+       if (atomic_dec_and_test(&info->nr_unreplied_reqs))
+               wake_up(&info->waiting_to_free);
+}
+
+static void put_free_pages(struct page **page, int num)
+{
+       unsigned long flags;
+       int i = free_pages_num + num, n = num;
+
+       if (num == 0)
+               return;
+       if (i > scsiback_max_buffer_pages) {
+               n = min(num, i - scsiback_max_buffer_pages);
+               free_xenballooned_pages(n, page + num - n);
+               n = num - n;
+       }
+       spin_lock_irqsave(&free_pages_lock, flags);
+       for (i = 0; i < n; i++)
+               list_add(&page[i]->lru, &scsiback_free_pages);
+       free_pages_num += n;
+       spin_unlock_irqrestore(&free_pages_lock, flags);
+}
+
+static int get_free_page(struct page **page)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&free_pages_lock, flags);
+       if (list_empty(&scsiback_free_pages)) {
+               spin_unlock_irqrestore(&free_pages_lock, flags);
+               return alloc_xenballooned_pages(1, page, false);
+       }
+       page[0] = list_first_entry(&scsiback_free_pages, struct page, lru);
+       list_del(&page[0]->lru);
+       free_pages_num--;
+       spin_unlock_irqrestore(&free_pages_lock, flags);
+       return 0;
+}
+
+static unsigned long vaddr_page(struct page *page)
+{
+       unsigned long pfn = page_to_pfn(page);
+
+       return (unsigned long)pfn_to_kaddr(pfn);
+}
+
+static unsigned long vaddr(struct vscsibk_pend *req, int seg)
+{
+       return vaddr_page(req->pages[seg]);
+}
+
+static void scsiback_print_status(char *sense_buffer, int errors,
+                                       struct vscsibk_pend *pending_req)
+{
+       struct scsiback_tpg *tpg = pending_req->v2p->tpg;
+
+       pr_err("xen-pvscsi[%s:%d] cmnd[0]=%02x -> st=%02x msg=%02x host=%02x drv=%02x\n",
+              tpg->tport->tport_name, pending_req->v2p->lun,
+              pending_req->cmnd[0], status_byte(errors), msg_byte(errors),
+              host_byte(errors), driver_byte(errors));
+
+       if (CHECK_CONDITION & status_byte(errors))
+               __scsi_print_sense("xen-pvscsi", sense_buffer,
+                                  SCSI_SENSE_BUFFERSIZE);
+}
+
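+/*
+ * Unmap all grants of a completed request in batches of at most
+ * VSCSI_GRANT_BATCH and return the ballooned pages to the free pool.
+ */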
+static void scsiback_fast_flush_area(struct vscsibk_pend *req)
+{
+       struct gnttab_unmap_grant_ref unmap[VSCSI_GRANT_BATCH];
+       struct page *pages[VSCSI_GRANT_BATCH];
+       unsigned int i, invcount = 0;
+       grant_handle_t handle;
+       int err;
+
+       kfree(req->sgl);
+       req->sgl = NULL;
+       req->n_sg = 0;
+
+       if (!req->n_grants)
+               return;
+
+       for (i = 0; i < req->n_grants; i++) {
+               handle = req->grant_handles[i];
+               if (handle == SCSIBACK_INVALID_HANDLE)
+                       continue;
+               gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
+                                   GNTMAP_host_map, handle);
+               req->grant_handles[i] = SCSIBACK_INVALID_HANDLE;
+               pages[invcount] = req->pages[i];
+               put_page(pages[invcount]);
+               invcount++;
+               if (invcount < VSCSI_GRANT_BATCH)
+                       continue;
+               err = gnttab_unmap_refs(unmap, NULL, pages, invcount);
+               BUG_ON(err);
+               invcount = 0;
+       }
+
+       if (invcount) {
+               err = gnttab_unmap_refs(unmap, NULL, pages, invcount);
+               BUG_ON(err);
+       }
+
+       put_free_pages(req->pages, req->n_grants);
+       req->n_grants = 0;
+}
+
+static void scsiback_free_translation_entry(struct kref *kref)
+{
+       struct v2p_entry *entry = container_of(kref, struct v2p_entry, kref);
+       struct scsiback_tpg *tpg = entry->tpg;
+
+       mutex_lock(&tpg->tv_tpg_mutex);
+       tpg->tv_tpg_fe_count--;
+       mutex_unlock(&tpg->tv_tpg_mutex);
+
+       kfree(entry);
+}
+
+static void scsiback_do_resp_with_sense(char *sense_buffer, int32_t result,
+                       uint32_t resid, struct vscsibk_pend *pending_req)
+{
+       struct vscsiif_response *ring_res;
+       struct vscsibk_info *info = pending_req->info;
+       int notify;
+       struct scsi_sense_hdr sshdr;
+       unsigned long flags;
+       unsigned len;
+
+       spin_lock_irqsave(&info->ring_lock, flags);
+
+       ring_res = RING_GET_RESPONSE(&info->ring, info->ring.rsp_prod_pvt);
+       info->ring.rsp_prod_pvt++;
+
+       ring_res->rslt   = result;
+       ring_res->rqid   = pending_req->rqid;
+
+       if (sense_buffer != NULL &&
+           scsi_normalize_sense(sense_buffer, VSCSIIF_SENSE_BUFFERSIZE,
+                                &sshdr)) {
+               len = min_t(unsigned, 8 + sense_buffer[7],
+                           VSCSIIF_SENSE_BUFFERSIZE);
+               memcpy(ring_res->sense_buffer, sense_buffer, len);
+               ring_res->sense_len = len;
+       } else {
+               ring_res->sense_len = 0;
+       }
+
+       ring_res->residual_len = resid;
+
+       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&info->ring, notify);
+       spin_unlock_irqrestore(&info->ring_lock, flags);
+
+       if (notify)
+               notify_remote_via_irq(info->irq);
+
+       if (pending_req->v2p)
+               kref_put(&pending_req->v2p->kref,
+                        scsiback_free_translation_entry);
+}
+
+static void scsiback_cmd_done(struct vscsibk_pend *pending_req)
+{
+       struct vscsibk_info *info = pending_req->info;
+       unsigned char *sense_buffer;
+       unsigned int resid;
+       int errors;
+
+       sense_buffer = pending_req->sense_buffer;
+       resid        = pending_req->se_cmd.residual_count;
+       errors       = pending_req->result;
+
+       if (errors && log_print_stat)
+               scsiback_print_status(sense_buffer, errors, pending_req);
+
+       scsiback_fast_flush_area(pending_req);
+       scsiback_do_resp_with_sense(sense_buffer, errors, resid, pending_req);
+       scsiback_put(info);
+}
+
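+/*
+ * Hand the command over to the target core via target_submit_cmd_map_sgls(),
+ * passing the already mapped scatterlist.  On submission failure a CHECK
+ * CONDITION is reported back immediately.
+ */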
+static void scsiback_cmd_exec(struct vscsibk_pend *pending_req)
+{
+       struct se_cmd *se_cmd = &pending_req->se_cmd;
+       struct se_session *sess = pending_req->v2p->tpg->tpg_nexus->tvn_se_sess;
+       int rc;
+
+       memset(pending_req->sense_buffer, 0, VSCSIIF_SENSE_BUFFERSIZE);
+
+       memset(se_cmd, 0, sizeof(*se_cmd));
+
+       scsiback_get(pending_req->info);
+       rc = target_submit_cmd_map_sgls(se_cmd, sess, pending_req->cmnd,
+                       pending_req->sense_buffer, pending_req->v2p->lun,
+                       pending_req->data_len, 0,
+                       pending_req->sc_data_direction, 0,
+                       pending_req->sgl, pending_req->n_sg,
+                       NULL, 0, NULL, 0);
+       if (rc < 0) {
+               transport_send_check_condition_and_sense(se_cmd,
+                               TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE, 0);
+               transport_generic_free_cmd(se_cmd, 0);
+       }
+}
+
+static int scsiback_gnttab_data_map_batch(struct gnttab_map_grant_ref *map,
+       struct page **pg, grant_handle_t *grant, int cnt)
+{
+       int err, i;
+
+       if (!cnt)
+               return 0;
+
+       err = gnttab_map_refs(map, NULL, pg, cnt);
+       BUG_ON(err);
+       for (i = 0; i < cnt; i++) {
+               if (unlikely(map[i].status != GNTST_okay)) {
+                       pr_err("xen-pvscsi: invalid buffer -- could not remap it\n");
+                       map[i].handle = SCSIBACK_INVALID_HANDLE;
+                       err = -ENOMEM;
+               } else {
+                       get_page(pg[i]);
+               }
+               grant[i] = map[i].handle;
+       }
+       return err;
+}
+
+static int scsiback_gnttab_data_map_list(struct vscsibk_pend *pending_req,
+                       struct scsiif_request_segment *seg, struct page **pg,
+                       grant_handle_t *grant, int cnt, u32 flags)
+{
+       int mapcount = 0, i, err = 0;
+       struct gnttab_map_grant_ref map[VSCSI_GRANT_BATCH];
+       struct vscsibk_info *info = pending_req->info;
+
+       for (i = 0; i < cnt; i++) {
+               if (get_free_page(pg + mapcount)) {
+                       put_free_pages(pg, mapcount);
+                       pr_err("xen-pvscsi: no grant page\n");
+                       return -ENOMEM;
+               }
+               gnttab_set_map_op(&map[mapcount], vaddr_page(pg[mapcount]),
+                                 flags, seg[i].gref, info->domid);
+               mapcount++;
+               if (mapcount < VSCSI_GRANT_BATCH)
+                       continue;
+               err = scsiback_gnttab_data_map_batch(map, pg, grant, mapcount);
+               pg += mapcount;
+               grant += mapcount;
+               pending_req->n_grants += mapcount;
+               if (err)
+                       return err;
+               mapcount = 0;
+       }
+       err = scsiback_gnttab_data_map_batch(map, pg, grant, mapcount);
+       pending_req->n_grants += mapcount;
+       return err;
+}
+
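+/*
+ * Map the frontend's data grants into backend pages and build the
+ * scatterlist for the target core.  With VSCSIIF_SG_GRANT set, the segments
+ * in the ring request point to granted pages which in turn hold the real
+ * segment list, allowing up to SG_ALL entries.
+ */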
+static int scsiback_gnttab_data_map(struct vscsiif_request *ring_req,
+                                       struct vscsibk_pend *pending_req)
+{
+       u32 flags;
+       int i, err, n_segs, i_seg = 0;
+       struct page **pg;
+       struct scsiif_request_segment *seg;
+       unsigned long end_seg = 0;
+       unsigned int nr_segments = (unsigned int)ring_req->nr_segments;
+       unsigned int nr_sgl = 0;
+       struct scatterlist *sg;
+       grant_handle_t *grant;
+
+       pending_req->n_sg = 0;
+       pending_req->n_grants = 0;
+       pending_req->data_len = 0;
+
+       nr_segments &= ~VSCSIIF_SG_GRANT;
+       if (!nr_segments)
+               return 0;
+
+       if (nr_segments > VSCSIIF_SG_TABLESIZE) {
+               DPRINTK("xen-pvscsi: invalid parameter nr_seg = %d\n",
+                       ring_req->nr_segments);
+               return -EINVAL;
+       }
+
+       if (ring_req->nr_segments & VSCSIIF_SG_GRANT) {
+               err = scsiback_gnttab_data_map_list(pending_req, ring_req->seg,
+                       pending_req->pages, pending_req->grant_handles,
+                       nr_segments, GNTMAP_host_map | GNTMAP_readonly);
+               if (err)
+                       return err;
+               nr_sgl = nr_segments;
+               nr_segments = 0;
+               for (i = 0; i < nr_sgl; i++) {
+                       n_segs = ring_req->seg[i].length /
+                                sizeof(struct scsiif_request_segment);
+                       if ((unsigned)ring_req->seg[i].offset +
+                           (unsigned)ring_req->seg[i].length > PAGE_SIZE ||
+                           n_segs * sizeof(struct scsiif_request_segment) !=
+                           ring_req->seg[i].length)
+                               return -EINVAL;
+                       nr_segments += n_segs;
+               }
+               if (nr_segments > SG_ALL) {
+                       DPRINTK("xen-pvscsi: invalid nr_seg = %d\n",
+                               nr_segments);
+                       return -EINVAL;
+               }
+       }
+
+       /* pending_req->sgl is freed in scsiback_fast_flush_area() */
+       pending_req->sgl = kmalloc_array(nr_segments,
+                                       sizeof(struct scatterlist), GFP_KERNEL);
+       if (!pending_req->sgl)
+               return -ENOMEM;
+
+       sg_init_table(pending_req->sgl, nr_segments);
+       pending_req->n_sg = nr_segments;
+
+       flags = GNTMAP_host_map;
+       if (pending_req->sc_data_direction == DMA_TO_DEVICE)
+               flags |= GNTMAP_readonly;
+
+       pg = pending_req->pages + nr_sgl;
+       grant = pending_req->grant_handles + nr_sgl;
+       if (!nr_sgl) {
+               seg = ring_req->seg;
+               err = scsiback_gnttab_data_map_list(pending_req, seg,
+                       pg, grant, nr_segments, flags);
+               if (err)
+                       return err;
+       } else {
+               for (i = 0; i < nr_sgl; i++) {
+                       seg = (struct scsiif_request_segment *)(
+                             vaddr(pending_req, i) + ring_req->seg[i].offset);
+                       n_segs = ring_req->seg[i].length /
+                                sizeof(struct scsiif_request_segment);
+                       err = scsiback_gnttab_data_map_list(pending_req, seg,
+                               pg, grant, n_segs, flags);
+                       if (err)
+                               return err;
+                       pg += n_segs;
+                       grant += n_segs;
+               }
+               end_seg = vaddr(pending_req, 0) + ring_req->seg[0].offset;
+               seg = (struct scsiif_request_segment *)end_seg;
+               end_seg += ring_req->seg[0].length;
+               pg = pending_req->pages + nr_sgl;
+       }
+
+       for_each_sg(pending_req->sgl, sg, nr_segments, i) {
+               sg_set_page(sg, pg[i], seg->length, seg->offset);
+               pending_req->data_len += seg->length;
+               seg++;
+               if (nr_sgl && (unsigned long)seg >= end_seg) {
+                       i_seg++;
+                       end_seg = vaddr(pending_req, i_seg) +
+                                 ring_req->seg[i_seg].offset;
+                       seg = (struct scsiif_request_segment *)end_seg;
+                       end_seg += ring_req->seg[i_seg].length;
+               }
+               if (sg->offset >= PAGE_SIZE ||
+                   sg->length > PAGE_SIZE ||
+                   sg->offset + sg->length > PAGE_SIZE)
+                       return -EINVAL;
+       }
+
+       return 0;
+}
+
+static void scsiback_disconnect(struct vscsibk_info *info)
+{
+       wait_event(info->waiting_to_free,
+               atomic_read(&info->nr_unreplied_reqs) == 0);
+
+       unbind_from_irqhandler(info->irq, info);
+       info->irq = 0;
+       xenbus_unmap_ring_vfree(info->dev, info->ring.sring);
+}
+
+static void scsiback_device_action(struct vscsibk_pend *pending_req,
+       enum tcm_tmreq_table act, int tag)
+{
+       int rc, err = FAILED;
+       struct scsiback_tpg *tpg = pending_req->v2p->tpg;
+       struct se_cmd *se_cmd = &pending_req->se_cmd;
+       struct scsiback_tmr *tmr;
+
+       tmr = kzalloc(sizeof(struct scsiback_tmr), GFP_KERNEL);
+       if (!tmr)
+               goto out;
+
+       init_waitqueue_head(&tmr->tmr_wait);
+
+       transport_init_se_cmd(se_cmd, tpg->se_tpg.se_tpg_tfo,
+               tpg->tpg_nexus->tvn_se_sess, 0, DMA_NONE, MSG_SIMPLE_TAG,
+               &pending_req->sense_buffer[0]);
+
+       rc = core_tmr_alloc_req(se_cmd, tmr, act, GFP_KERNEL);
+       if (rc < 0)
+               goto out;
+
+       se_cmd->se_tmr_req->ref_task_tag = tag;
+
+       if (transport_lookup_tmr_lun(se_cmd, pending_req->v2p->lun) < 0)
+               goto out;
+
+       transport_generic_handle_tmr(se_cmd);
+       wait_event(tmr->tmr_wait, atomic_read(&tmr->tmr_complete));
+
+       err = (se_cmd->se_tmr_req->response == TMR_FUNCTION_COMPLETE) ?
+               SUCCESS : FAILED;
+
+out:
+       if (tmr) {
+               transport_generic_free_cmd(&pending_req->se_cmd, 1);
+               kfree(tmr);
+       }
+
+       scsiback_do_resp_with_sense(NULL, err, 0, pending_req);
+
+       kmem_cache_free(scsiback_cachep, pending_req);
+}
+
+/*
+ * Perform virtual to physical translation.
+ */
+static struct v2p_entry *scsiback_do_translation(struct vscsibk_info *info,
+                       struct ids_tuple *v)
+{
+       struct v2p_entry *entry;
+       struct list_head *head = &(info->v2p_entry_lists);
+       unsigned long flags;
+
+       spin_lock_irqsave(&info->v2p_lock, flags);
+       list_for_each_entry(entry, head, l) {
+               if ((entry->v.chn == v->chn) &&
+                   (entry->v.tgt == v->tgt) &&
+                   (entry->v.lun == v->lun)) {
+                       kref_get(&entry->kref);
+                       goto out;
+               }
+       }
+       entry = NULL;
+
+out:
+       spin_unlock_irqrestore(&info->v2p_lock, flags);
+       return entry;
+}
+
+static int prepare_pending_reqs(struct vscsibk_info *info,
+                               struct vscsiif_request *ring_req,
+                               struct vscsibk_pend *pending_req)
+{
+       struct v2p_entry *v2p;
+       struct ids_tuple vir;
+
+       pending_req->rqid       = ring_req->rqid;
+       pending_req->info       = info;
+
+       vir.chn = ring_req->channel;
+       vir.tgt = ring_req->id;
+       vir.lun = ring_req->lun;
+
+       v2p = scsiback_do_translation(info, &vir);
+       if (!v2p) {
+               pending_req->v2p = NULL;
+               DPRINTK("xen-pvscsi: doesn't exist.\n");
+               return -ENODEV;
+       }
+       pending_req->v2p = v2p;
+
+       /* request range check from frontend */
+       pending_req->sc_data_direction = ring_req->sc_data_direction;
+       if ((pending_req->sc_data_direction != DMA_BIDIRECTIONAL) &&
+               (pending_req->sc_data_direction != DMA_TO_DEVICE) &&
+               (pending_req->sc_data_direction != DMA_FROM_DEVICE) &&
+               (pending_req->sc_data_direction != DMA_NONE)) {
+               DPRINTK("xen-pvscsi: invalid parameter data_dir = %d\n",
+                       pending_req->sc_data_direction);
+               return -EINVAL;
+       }
+
+       pending_req->cmd_len = ring_req->cmd_len;
+       if (pending_req->cmd_len > VSCSIIF_MAX_COMMAND_SIZE) {
+               DPRINTK("xen-pvscsi: invalid parameter cmd_len = %d\n",
+                       pending_req->cmd_len);
+               return -EINVAL;
+       }
+       memcpy(pending_req->cmnd, ring_req->cmnd, pending_req->cmd_len);
+
+       return 0;
+}
+
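+/*
+ * Consume requests from the shared ring: translate and range-check each
+ * request, then either submit it to the target core, perform the requested
+ * abort/reset, or reply with an error.  A bogus producer index from the
+ * frontend halts ring processing via info->ring_error.
+ */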
+static int scsiback_do_cmd_fn(struct vscsibk_info *info)
+{
+       struct vscsiif_back_ring *ring = &info->ring;
+       struct vscsiif_request *ring_req;
+       struct vscsibk_pend *pending_req;
+       RING_IDX rc, rp;
+       int err, more_to_do;
+       uint32_t result;
+       uint8_t act;
+
+       rc = ring->req_cons;
+       rp = ring->sring->req_prod;
+       rmb();  /* guest system is accessing ring, too */
+
+       if (RING_REQUEST_PROD_OVERFLOW(ring, rp)) {
+               rc = ring->rsp_prod_pvt;
+               pr_warn("xen-pvscsi: Dom%d provided bogus ring requests (%#x - %#x = %u). Halting ring processing\n",
+                          info->domid, rp, rc, rp - rc);
+               info->ring_error = 1;
+               return 0;
+       }
+
+       while ((rc != rp)) {
+               if (RING_REQUEST_CONS_OVERFLOW(ring, rc))
+                       break;
+               pending_req = kmem_cache_alloc(scsiback_cachep, GFP_KERNEL);
+               if (!pending_req)
+                       return 1;
+
+               ring_req = RING_GET_REQUEST(ring, rc);
+               ring->req_cons = ++rc;
+
+               act = ring_req->act;
+               err = prepare_pending_reqs(info, ring_req, pending_req);
+               if (err) {
+                       switch (err) {
+                       case -ENODEV:
+                               result = DID_NO_CONNECT;
+                               break;
+                       default:
+                               result = DRIVER_ERROR;
+                               break;
+                       }
+                       scsiback_do_resp_with_sense(NULL, result << 24, 0,
+                                                   pending_req);
+                       kmem_cache_free(scsiback_cachep, pending_req);
+                       return 1;
+               }
+
+               switch (act) {
+               case VSCSIIF_ACT_SCSI_CDB:
+                       if (scsiback_gnttab_data_map(ring_req, pending_req)) {
+                               scsiback_fast_flush_area(pending_req);
+                               scsiback_do_resp_with_sense(NULL,
+                                       DRIVER_ERROR << 24, 0, pending_req);
+                               kmem_cache_free(scsiback_cachep, pending_req);
+                       } else {
+                               scsiback_cmd_exec(pending_req);
+                       }
+                       break;
+               case VSCSIIF_ACT_SCSI_ABORT:
+                       scsiback_device_action(pending_req, TMR_ABORT_TASK,
+                               ring_req->ref_rqid);
+                       break;
+               case VSCSIIF_ACT_SCSI_RESET:
+                       scsiback_device_action(pending_req, TMR_LUN_RESET, 0);
+                       break;
+               default:
+                       pr_err_ratelimited("xen-pvscsi: invalid request\n");
+                       scsiback_do_resp_with_sense(NULL, DRIVER_ERROR << 24,
+                                                   0, pending_req);
+                       kmem_cache_free(scsiback_cachep, pending_req);
+                       break;
+               }
+
+               /* Yield point for this unbounded loop. */
+               cond_resched();
+       }
+
+       RING_FINAL_CHECK_FOR_REQUESTS(&info->ring, more_to_do);
+       return more_to_do;
+}
+
+static irqreturn_t scsiback_irq_fn(int irq, void *dev_id)
+{
+       struct vscsibk_info *info = dev_id;
+
+       if (info->ring_error)
+               return IRQ_HANDLED;
+
+       while (scsiback_do_cmd_fn(info))
+               cond_resched();
+
+       return IRQ_HANDLED;
+}
+
+static int scsiback_init_sring(struct vscsibk_info *info, grant_ref_t ring_ref,
+                       evtchn_port_t evtchn)
+{
+       void *area;
+       struct vscsiif_sring *sring;
+       int err;
+
+       if (info->irq)
+               return -1;
+
+       err = xenbus_map_ring_valloc(info->dev, ring_ref, &area);
+       if (err)
+               return err;
+
+       sring = (struct vscsiif_sring *)area;
+       BACK_RING_INIT(&info->ring, sring, PAGE_SIZE);
+
+       err = bind_interdomain_evtchn_to_irq(info->domid, evtchn);
+       if (err < 0)
+               goto unmap_page;
+
+       info->irq = err;
+
+       err = request_threaded_irq(info->irq, NULL, scsiback_irq_fn,
+                                  IRQF_ONESHOT, "vscsiif-backend", info);
+       if (err)
+               goto free_irq;
+
+       return 0;
+
+free_irq:
+       unbind_from_irqhandler(info->irq, info);
+       info->irq = 0;
+unmap_page:
+       xenbus_unmap_ring_vfree(info->dev, area);
+
+       return err;
+}
+
+static int scsiback_map(struct vscsibk_info *info)
+{
+       struct xenbus_device *dev = info->dev;
+       unsigned int ring_ref, evtchn;
+       int err;
+
+       err = xenbus_gather(XBT_NIL, dev->otherend,
+                       "ring-ref", "%u", &ring_ref,
+                       "event-channel", "%u", &evtchn, NULL);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "reading %s ring", dev->otherend);
+               return err;
+       }
+
+       return scsiback_init_sring(info, ring_ref, evtchn);
+}
+
+/*
+  Add a new translation entry
+*/
+static int scsiback_add_translation_entry(struct vscsibk_info *info,
+                                         char *phy, struct ids_tuple *v)
+{
+       int err = 0;
+       struct v2p_entry *entry;
+       struct v2p_entry *new;
+       struct list_head *head = &(info->v2p_entry_lists);
+       unsigned long flags;
+       char *lunp;
+       unsigned int lun;
+       struct scsiback_tpg *tpg_entry, *tpg = NULL;
+       char *error = "doesn't exist";
+
+       lunp = strrchr(phy, ':');
+       if (!lunp) {
+               pr_err("xen-pvscsi: illegal format of physical device %s\n",
+                       phy);
+               return -EINVAL;
+       }
+       *lunp = 0;
+       lunp++;
+       if (kstrtouint(lunp, 10, &lun) || lun >= TRANSPORT_MAX_LUNS_PER_TPG) {
+               pr_err("xen-pvscsi: lun number not valid: %s\n", lunp);
+               return -EINVAL;
+       }
+
+       mutex_lock(&scsiback_mutex);
+       list_for_each_entry(tpg_entry, &scsiback_list, tv_tpg_list) {
+               if (!strcmp(phy, tpg_entry->tport->tport_name) ||
+                   !strcmp(phy, tpg_entry->param_alias)) {
+                       spin_lock(&tpg_entry->se_tpg.tpg_lun_lock);
+                       if (tpg_entry->se_tpg.tpg_lun_list[lun]->lun_status ==
+                           TRANSPORT_LUN_STATUS_ACTIVE) {
+                               if (!tpg_entry->tpg_nexus)
+                                       error = "nexus undefined";
+                               else
+                                       tpg = tpg_entry;
+                       }
+                       spin_unlock(&tpg_entry->se_tpg.tpg_lun_lock);
+                       break;
+               }
+       }
+       if (tpg) {
+               mutex_lock(&tpg->tv_tpg_mutex);
+               tpg->tv_tpg_fe_count++;
+               mutex_unlock(&tpg->tv_tpg_mutex);
+       }
+       mutex_unlock(&scsiback_mutex);
+
+       if (!tpg) {
+               pr_err("xen-pvscsi: %s:%d %s\n", phy, lun, error);
+               return -ENODEV;
+       }
+
+       new = kmalloc(sizeof(struct v2p_entry), GFP_KERNEL);
+       if (new == NULL) {
+               err = -ENOMEM;
+               goto out_free;
+       }
+
+       spin_lock_irqsave(&info->v2p_lock, flags);
+
+       /* Check double assignment to identical virtual ID */
+       list_for_each_entry(entry, head, l) {
+               if ((entry->v.chn == v->chn) &&
+                   (entry->v.tgt == v->tgt) &&
+                   (entry->v.lun == v->lun)) {
+                       pr_warn("xen-pvscsi: Virtual ID is already used. Assignment was not performed.\n");
+                       err = -EEXIST;
+                       goto out;
+               }
+
+       }
+
+       /* Create a new translation entry and add to the list */
+       kref_init(&new->kref);
+       new->v = *v;
+       new->tpg = tpg;
+       new->lun = lun;
+       list_add_tail(&new->l, head);
+
+out:
+       spin_unlock_irqrestore(&info->v2p_lock, flags);
+
+out_free:
+       mutex_lock(&tpg->tv_tpg_mutex);
+       tpg->tv_tpg_fe_count--;
+       mutex_unlock(&tpg->tv_tpg_mutex);
+
+       if (err)
+               kfree(new);
+
+       return err;
+}
+
+static void __scsiback_del_translation_entry(struct v2p_entry *entry)
+{
+       list_del(&entry->l);
+       kref_put(&entry->kref, scsiback_free_translation_entry);
+}
+
+/*
+  Delete the specified translation entry
+*/
+static int scsiback_del_translation_entry(struct vscsibk_info *info,
+                                         struct ids_tuple *v)
+{
+       struct v2p_entry *entry;
+       struct list_head *head = &(info->v2p_entry_lists);
+       unsigned long flags;
+
+       spin_lock_irqsave(&info->v2p_lock, flags);
+       /* Find the specified translation entry */
+       list_for_each_entry(entry, head, l) {
+               if ((entry->v.chn == v->chn) &&
+                   (entry->v.tgt == v->tgt) &&
+                   (entry->v.lun == v->lun)) {
+                       goto found;
+               }
+       }
+
+       spin_unlock_irqrestore(&info->v2p_lock, flags);
+       return 1;
+
+found:
+       /* Delete the specified translation entry */
+       __scsiback_del_translation_entry(entry);
+
+       spin_unlock_irqrestore(&info->v2p_lock, flags);
+       return 0;
+}
+
+static void scsiback_do_add_lun(struct vscsibk_info *info, const char *state,
+                               char *phy, struct ids_tuple *vir)
+{
+       if (!scsiback_add_translation_entry(info, phy, vir)) {
+               if (xenbus_printf(XBT_NIL, info->dev->nodename, state,
+                                 "%d", XenbusStateInitialised)) {
+                       pr_err("xen-pvscsi: xenbus_printf error %s\n", state);
+                       scsiback_del_translation_entry(info, vir);
+               }
+       } else {
+               xenbus_printf(XBT_NIL, info->dev->nodename, state,
+                             "%d", XenbusStateClosed);
+       }
+}
+
+static void scsiback_do_del_lun(struct vscsibk_info *info, const char *state,
+                               struct ids_tuple *vir)
+{
+       if (!scsiback_del_translation_entry(info, vir)) {
+               if (xenbus_printf(XBT_NIL, info->dev->nodename, state,
+                                 "%d", XenbusStateClosed))
+                       pr_err("xen-pvscsi: xenbus_printf error %s\n", state);
+       }
+}
+
+#define VSCSIBACK_OP_ADD_OR_DEL_LUN    1
+#define VSCSIBACK_OP_UPDATEDEV_STATE   2
+
+static void scsiback_do_1lun_hotplug(struct vscsibk_info *info, int op,
+                                    char *ent)
+{
+       int err;
+       struct ids_tuple vir;
+       char *val;
+       int device_state;
+       char phy[VSCSI_NAMELEN];
+       char str[64];
+       char state[64];
+       struct xenbus_device *dev = info->dev;
+
+       /* read status */
+       snprintf(state, sizeof(state), "vscsi-devs/%s/state", ent);
+       err = xenbus_scanf(XBT_NIL, dev->nodename, state, "%u", &device_state);
+       if (XENBUS_EXIST_ERR(err))
+               return;
+
+       /* physical SCSI device */
+       snprintf(str, sizeof(str), "vscsi-devs/%s/p-dev", ent);
+       val = xenbus_read(XBT_NIL, dev->nodename, str, NULL);
+       if (IS_ERR(val)) {
+               xenbus_printf(XBT_NIL, dev->nodename, state,
+                             "%d", XenbusStateClosed);
+               return;
+       }
+       strlcpy(phy, val, VSCSI_NAMELEN);
+       kfree(val);
+
+       /* virtual SCSI device */
+       snprintf(str, sizeof(str), "vscsi-devs/%s/v-dev", ent);
+       err = xenbus_scanf(XBT_NIL, dev->nodename, str, "%u:%u:%u:%u",
+                          &vir.hst, &vir.chn, &vir.tgt, &vir.lun);
+       if (XENBUS_EXIST_ERR(err)) {
+               xenbus_printf(XBT_NIL, dev->nodename, state,
+                             "%d", XenbusStateClosed);
+               return;
+       }
+
+       switch (op) {
+       case VSCSIBACK_OP_ADD_OR_DEL_LUN:
+               if (device_state == XenbusStateInitialising)
+                       scsiback_do_add_lun(info, state, phy, &vir);
+               if (device_state == XenbusStateClosing)
+                       scsiback_do_del_lun(info, state, &vir);
+               break;
+
+       case VSCSIBACK_OP_UPDATEDEV_STATE:
+               if (device_state == XenbusStateInitialised) {
+                       /* modify vscsi-devs/dev-x/state */
+                       if (xenbus_printf(XBT_NIL, dev->nodename, state,
+                                         "%d", XenbusStateConnected)) {
+                               pr_err("xen-pvscsi: xenbus_printf error %s\n",
+                                      str);
+                               scsiback_del_translation_entry(info, &vir);
+                               xenbus_printf(XBT_NIL, dev->nodename, state,
+                                             "%d", XenbusStateClosed);
+                       }
+               }
+               break;
+       /* Add further processing here when necessary. */
+       default:
+               break;
+       }
+}
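Putting the XenStore reads above together: each LUN handed to the backend is described by a small subtree under the backend device node. An illustrative layout with hypothetical values (the exact strings written by the toolstack are not part of this patch):

        vscsi-devs/dev-0/p-dev = "naa.60014054ac780582:0"   (target name plus ":LUN", parsed by scsiback_add_translation_entry)
        vscsi-devs/dev-0/v-dev = "0:0:0:0"                   (host:channel:target:lun as seen by the guest)
        vscsi-devs/dev-0/state = "1"                         (XenbusStateInitialising, triggers scsiback_do_add_lun)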
+
+static void scsiback_do_lun_hotplug(struct vscsibk_info *info, int op)
+{
+       int i;
+       char **dir;
+       unsigned int ndir = 0;
+
+       dir = xenbus_directory(XBT_NIL, info->dev->nodename, "vscsi-devs",
+                              &ndir);
+       if (IS_ERR(dir))
+               return;
+
+       for (i = 0; i < ndir; i++)
+               scsiback_do_1lun_hotplug(info, op, dir[i]);
+
+       kfree(dir);
+}
+
+static void scsiback_frontend_changed(struct xenbus_device *dev,
+                                       enum xenbus_state frontend_state)
+{
+       struct vscsibk_info *info = dev_get_drvdata(&dev->dev);
+
+       switch (frontend_state) {
+       case XenbusStateInitialising:
+               break;
+
+       case XenbusStateInitialised:
+               if (scsiback_map(info))
+                       break;
+
+               scsiback_do_lun_hotplug(info, VSCSIBACK_OP_ADD_OR_DEL_LUN);
+               xenbus_switch_state(dev, XenbusStateConnected);
+               break;
+
+       case XenbusStateConnected:
+               scsiback_do_lun_hotplug(info, VSCSIBACK_OP_UPDATEDEV_STATE);
+
+               if (dev->state == XenbusStateConnected)
+                       break;
+
+               xenbus_switch_state(dev, XenbusStateConnected);
+               break;
+
+       case XenbusStateClosing:
+               if (info->irq)
+                       scsiback_disconnect(info);
+
+               xenbus_switch_state(dev, XenbusStateClosing);
+               break;
+
+       case XenbusStateClosed:
+               xenbus_switch_state(dev, XenbusStateClosed);
+               if (xenbus_dev_is_online(dev))
+                       break;
+               /* fall through if not online */
+       case XenbusStateUnknown:
+               device_unregister(&dev->dev);
+               break;
+
+       case XenbusStateReconfiguring:
+               scsiback_do_lun_hotplug(info, VSCSIBACK_OP_ADD_OR_DEL_LUN);
+               xenbus_switch_state(dev, XenbusStateReconfigured);
+
+               break;
+
+       default:
+               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+                                       frontend_state);
+               break;
+       }
+}
+
+/*
+  Release all translation entries
+*/
+static void scsiback_release_translation_entry(struct vscsibk_info *info)
+{
+       struct v2p_entry *entry, *tmp;
+       struct list_head *head = &(info->v2p_entry_lists);
+       unsigned long flags;
+
+       spin_lock_irqsave(&info->v2p_lock, flags);
+
+       list_for_each_entry_safe(entry, tmp, head, l)
+               __scsiback_del_translation_entry(entry);
+
+       spin_unlock_irqrestore(&info->v2p_lock, flags);
+}
+
+static int scsiback_remove(struct xenbus_device *dev)
+{
+       struct vscsibk_info *info = dev_get_drvdata(&dev->dev);
+
+       if (info->irq)
+               scsiback_disconnect(info);
+
+       scsiback_release_translation_entry(info);
+
+       dev_set_drvdata(&dev->dev, NULL);
+
+       return 0;
+}
+
+static int scsiback_probe(struct xenbus_device *dev,
+                          const struct xenbus_device_id *id)
+{
+       int err;
+
+       struct vscsibk_info *info = kzalloc(sizeof(struct vscsibk_info),
+                                           GFP_KERNEL);
+
+       DPRINTK("%p %d\n", dev, dev->otherend_id);
+
+       if (!info) {
+               xenbus_dev_fatal(dev, -ENOMEM, "allocating backend structure");
+               return -ENOMEM;
+       }
+       info->dev = dev;
+       dev_set_drvdata(&dev->dev, info);
+
+       info->domid = dev->otherend_id;
+       spin_lock_init(&info->ring_lock);
+       info->ring_error = 0;
+       atomic_set(&info->nr_unreplied_reqs, 0);
+       init_waitqueue_head(&info->waiting_to_free);
+       info->irq = 0;
+       INIT_LIST_HEAD(&info->v2p_entry_lists);
+       spin_lock_init(&info->v2p_lock);
+
+       err = xenbus_printf(XBT_NIL, dev->nodename, "feature-sg-grant", "%u",
+                           SG_ALL);
+       if (err)
+               xenbus_dev_error(dev, err, "writing feature-sg-grant");
+
+       err = xenbus_switch_state(dev, XenbusStateInitWait);
+       if (err)
+               goto fail;
+
+       return 0;
+
+fail:
+       pr_warn("xen-pvscsi: %s failed\n", __func__);
+       scsiback_remove(dev);
+
+       return err;
+}
+
+static char *scsiback_dump_proto_id(struct scsiback_tport *tport)
+{
+       switch (tport->tport_proto_id) {
+       case SCSI_PROTOCOL_SAS:
+               return "SAS";
+       case SCSI_PROTOCOL_FCP:
+               return "FCP";
+       case SCSI_PROTOCOL_ISCSI:
+               return "iSCSI";
+       default:
+               break;
+       }
+
+       return "Unknown";
+}
+
+static u8 scsiback_get_fabric_proto_ident(struct se_portal_group *se_tpg)
+{
+       struct scsiback_tpg *tpg = container_of(se_tpg,
+                               struct scsiback_tpg, se_tpg);
+       struct scsiback_tport *tport = tpg->tport;
+
+       switch (tport->tport_proto_id) {
+       case SCSI_PROTOCOL_SAS:
+               return sas_get_fabric_proto_ident(se_tpg);
+       case SCSI_PROTOCOL_FCP:
+               return fc_get_fabric_proto_ident(se_tpg);
+       case SCSI_PROTOCOL_ISCSI:
+               return iscsi_get_fabric_proto_ident(se_tpg);
+       default:
+               pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n",
+                       tport->tport_proto_id);
+               break;
+       }
+
+       return sas_get_fabric_proto_ident(se_tpg);
+}
+
+static char *scsiback_get_fabric_wwn(struct se_portal_group *se_tpg)
+{
+       struct scsiback_tpg *tpg = container_of(se_tpg,
+                               struct scsiback_tpg, se_tpg);
+       struct scsiback_tport *tport = tpg->tport;
+
+       return &tport->tport_name[0];
+}
+
+static u16 scsiback_get_tag(struct se_portal_group *se_tpg)
+{
+       struct scsiback_tpg *tpg = container_of(se_tpg,
+                               struct scsiback_tpg, se_tpg);
+       return tpg->tport_tpgt;
+}
+
+static u32 scsiback_get_default_depth(struct se_portal_group *se_tpg)
+{
+       return 1;
+}
+
+static u32
+scsiback_get_pr_transport_id(struct se_portal_group *se_tpg,
+                             struct se_node_acl *se_nacl,
+                             struct t10_pr_registration *pr_reg,
+                             int *format_code,
+                             unsigned char *buf)
+{
+       struct scsiback_tpg *tpg = container_of(se_tpg,
+                               struct scsiback_tpg, se_tpg);
+       struct scsiback_tport *tport = tpg->tport;
+
+       switch (tport->tport_proto_id) {
+       case SCSI_PROTOCOL_SAS:
+               return sas_get_pr_transport_id(se_tpg, se_nacl, pr_reg,
+                                       format_code, buf);
+       case SCSI_PROTOCOL_FCP:
+               return fc_get_pr_transport_id(se_tpg, se_nacl, pr_reg,
+                                       format_code, buf);
+       case SCSI_PROTOCOL_ISCSI:
+               return iscsi_get_pr_transport_id(se_tpg, se_nacl, pr_reg,
+                                       format_code, buf);
+       default:
+               pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n",
+                       tport->tport_proto_id);
+               break;
+       }
+
+       return sas_get_pr_transport_id(se_tpg, se_nacl, pr_reg,
+                       format_code, buf);
+}
+
+static u32
+scsiback_get_pr_transport_id_len(struct se_portal_group *se_tpg,
+                                 struct se_node_acl *se_nacl,
+                                 struct t10_pr_registration *pr_reg,
+                                 int *format_code)
+{
+       struct scsiback_tpg *tpg = container_of(se_tpg,
+                               struct scsiback_tpg, se_tpg);
+       struct scsiback_tport *tport = tpg->tport;
+
+       switch (tport->tport_proto_id) {
+       case SCSI_PROTOCOL_SAS:
+               return sas_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg,
+                                       format_code);
+       case SCSI_PROTOCOL_FCP:
+               return fc_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg,
+                                       format_code);
+       case SCSI_PROTOCOL_ISCSI:
+               return iscsi_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg,
+                                       format_code);
+       default:
+               pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n",
+                       tport->tport_proto_id);
+               break;
+       }
+
+       return sas_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg,
+                       format_code);
+}
+
+static char *
+scsiback_parse_pr_out_transport_id(struct se_portal_group *se_tpg,
+                                   const char *buf,
+                                   u32 *out_tid_len,
+                                   char **port_nexus_ptr)
+{
+       struct scsiback_tpg *tpg = container_of(se_tpg,
+                               struct scsiback_tpg, se_tpg);
+       struct scsiback_tport *tport = tpg->tport;
+
+       switch (tport->tport_proto_id) {
+       case SCSI_PROTOCOL_SAS:
+               return sas_parse_pr_out_transport_id(se_tpg, buf, out_tid_len,
+                                       port_nexus_ptr);
+       case SCSI_PROTOCOL_FCP:
+               return fc_parse_pr_out_transport_id(se_tpg, buf, out_tid_len,
+                                       port_nexus_ptr);
+       case SCSI_PROTOCOL_ISCSI:
+               return iscsi_parse_pr_out_transport_id(se_tpg, buf, out_tid_len,
+                                       port_nexus_ptr);
+       default:
+               pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n",
+                       tport->tport_proto_id);
+               break;
+       }
+
+       return sas_parse_pr_out_transport_id(se_tpg, buf, out_tid_len,
+                       port_nexus_ptr);
+}
+
+static struct se_wwn *
+scsiback_make_tport(struct target_fabric_configfs *tf,
+                    struct config_group *group,
+                    const char *name)
+{
+       struct scsiback_tport *tport;
+       char *ptr;
+       u64 wwpn = 0;
+       int off = 0;
+
+       tport = kzalloc(sizeof(struct scsiback_tport), GFP_KERNEL);
+       if (!tport)
+               return ERR_PTR(-ENOMEM);
+
+       tport->tport_wwpn = wwpn;
+       /*
+        * Determine the emulated Protocol Identifier and Target Port Name
+        * based on the incoming configfs directory name.
+        */
+       ptr = strstr(name, "naa.");
+       if (ptr) {
+               tport->tport_proto_id = SCSI_PROTOCOL_SAS;
+               goto check_len;
+       }
+       ptr = strstr(name, "fc.");
+       if (ptr) {
+               tport->tport_proto_id = SCSI_PROTOCOL_FCP;
+               off = 3; /* Skip over "fc." */
+               goto check_len;
+       }
+       ptr = strstr(name, "iqn.");
+       if (ptr) {
+               tport->tport_proto_id = SCSI_PROTOCOL_ISCSI;
+               goto check_len;
+       }
+
+       pr_err("Unable to locate prefix for emulated Target Port: %s\n", name);
+       kfree(tport);
+       return ERR_PTR(-EINVAL);
+
+check_len:
+       if (strlen(name) >= VSCSI_NAMELEN) {
+               pr_err("Emulated %s Address: %s, exceeds max: %d\n", name,
+                       scsiback_dump_proto_id(tport), VSCSI_NAMELEN);
+               kfree(tport);
+               return ERR_PTR(-EINVAL);
+       }
+       snprintf(&tport->tport_name[0], VSCSI_NAMELEN, "%s", &name[off]);
+
+       pr_debug("xen-pvscsi: Allocated emulated Target %s Address: %s\n",
+                scsiback_dump_proto_id(tport), name);
+
+       return &tport->tport_wwn;
+}
+
+static void scsiback_drop_tport(struct se_wwn *wwn)
+{
+       struct scsiback_tport *tport = container_of(wwn,
+                               struct scsiback_tport, tport_wwn);
+
+       pr_debug("xen-pvscsi: Deallocating emulated Target %s Address: %s\n",
+                scsiback_dump_proto_id(tport), tport->tport_name);
+
+       kfree(tport);
+}
+
+static struct se_node_acl *
+scsiback_alloc_fabric_acl(struct se_portal_group *se_tpg)
+{
+       return kzalloc(sizeof(struct se_node_acl), GFP_KERNEL);
+}
+
+static void
+scsiback_release_fabric_acl(struct se_portal_group *se_tpg,
+                            struct se_node_acl *se_nacl)
+{
+       kfree(se_nacl);
+}
+
+static u32 scsiback_tpg_get_inst_index(struct se_portal_group *se_tpg)
+{
+       return 1;
+}
+
+static int scsiback_check_stop_free(struct se_cmd *se_cmd)
+{
+       /*
+        * Do not release struct se_cmd's containing a valid TMR
+        * pointer.  These will be released directly in scsiback_device_action()
+        * with transport_generic_free_cmd().
+        */
+       if (se_cmd->se_cmd_flags & SCF_SCSI_TMR_CDB)
+               return 0;
+
+       transport_generic_free_cmd(se_cmd, 0);
+       return 1;
+}
+
+static void scsiback_release_cmd(struct se_cmd *se_cmd)
+{
+       struct vscsibk_pend *pending_req = container_of(se_cmd,
+                               struct vscsibk_pend, se_cmd);
+
+       kmem_cache_free(scsiback_cachep, pending_req);
+}
+
+static int scsiback_shutdown_session(struct se_session *se_sess)
+{
+       return 0;
+}
+
+static void scsiback_close_session(struct se_session *se_sess)
+{
+}
+
+static u32 scsiback_sess_get_index(struct se_session *se_sess)
+{
+       return 0;
+}
+
+static int scsiback_write_pending(struct se_cmd *se_cmd)
+{
+       /* Go ahead and process the write immediately */
+       target_execute_cmd(se_cmd);
+
+       return 0;
+}
+
+static int scsiback_write_pending_status(struct se_cmd *se_cmd)
+{
+       return 0;
+}
+
+static void scsiback_set_default_node_attrs(struct se_node_acl *nacl)
+{
+}
+
+static u32 scsiback_get_task_tag(struct se_cmd *se_cmd)
+{
+       struct vscsibk_pend *pending_req = container_of(se_cmd,
+                               struct vscsibk_pend, se_cmd);
+
+       return pending_req->rqid;
+}
+
+static int scsiback_get_cmd_state(struct se_cmd *se_cmd)
+{
+       return 0;
+}
+
+static int scsiback_queue_data_in(struct se_cmd *se_cmd)
+{
+       struct vscsibk_pend *pending_req = container_of(se_cmd,
+                               struct vscsibk_pend, se_cmd);
+
+       pending_req->result = SAM_STAT_GOOD;
+       scsiback_cmd_done(pending_req);
+       return 0;
+}
+
+static int scsiback_queue_status(struct se_cmd *se_cmd)
+{
+       struct vscsibk_pend *pending_req = container_of(se_cmd,
+                               struct vscsibk_pend, se_cmd);
+
+       if (se_cmd->sense_buffer &&
+           ((se_cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) ||
+            (se_cmd->se_cmd_flags & SCF_EMULATED_TASK_SENSE)))
+               pending_req->result = (DRIVER_SENSE << 24) |
+                                     SAM_STAT_CHECK_CONDITION;
+       else
+               pending_req->result = se_cmd->scsi_status;
+
+       scsiback_cmd_done(pending_req);
+       return 0;
+}
+
+static void scsiback_queue_tm_rsp(struct se_cmd *se_cmd)
+{
+       struct se_tmr_req *se_tmr = se_cmd->se_tmr_req;
+       struct scsiback_tmr *tmr = se_tmr->fabric_tmr_ptr;
+
+       atomic_set(&tmr->tmr_complete, 1);
+       wake_up(&tmr->tmr_wait);
+}
+
+static void scsiback_aborted_task(struct se_cmd *se_cmd)
+{
+}
+
+static ssize_t scsiback_tpg_param_show_alias(struct se_portal_group *se_tpg,
+                                            char *page)
+{
+       struct scsiback_tpg *tpg = container_of(se_tpg, struct scsiback_tpg,
+                                               se_tpg);
+       ssize_t rb;
+
+       mutex_lock(&tpg->tv_tpg_mutex);
+       rb = snprintf(page, PAGE_SIZE, "%s\n", tpg->param_alias);
+       mutex_unlock(&tpg->tv_tpg_mutex);
+
+       return rb;
+}
+
+static ssize_t scsiback_tpg_param_store_alias(struct se_portal_group *se_tpg,
+                                             const char *page, size_t count)
+{
+       struct scsiback_tpg *tpg = container_of(se_tpg, struct scsiback_tpg,
+                                               se_tpg);
+       int len;
+
+       if (strlen(page) >= VSCSI_NAMELEN) {
+               pr_err("param alias: %s, exceeds max: %d\n", page,
+                       VSCSI_NAMELEN);
+               return -EINVAL;
+       }
+
+       mutex_lock(&tpg->tv_tpg_mutex);
+       len = snprintf(tpg->param_alias, VSCSI_NAMELEN, "%s", page);
+       if (tpg->param_alias[len - 1] == '\n')
+               tpg->param_alias[len - 1] = '\0';
+       mutex_unlock(&tpg->tv_tpg_mutex);
+
+       return count;
+}
+
+TF_TPG_PARAM_ATTR(scsiback, alias, S_IRUGO | S_IWUSR);
+
+static struct configfs_attribute *scsiback_param_attrs[] = {
+       &scsiback_tpg_param_alias.attr,
+       NULL,
+};
+
+static int scsiback_make_nexus(struct scsiback_tpg *tpg,
+                               const char *name)
+{
+       struct se_portal_group *se_tpg;
+       struct se_session *se_sess;
+       struct scsiback_nexus *tv_nexus;
+
+       mutex_lock(&tpg->tv_tpg_mutex);
+       if (tpg->tpg_nexus) {
+               mutex_unlock(&tpg->tv_tpg_mutex);
+               pr_debug("tpg->tpg_nexus already exists\n");
+               return -EEXIST;
+       }
+       se_tpg = &tpg->se_tpg;
+
+       tv_nexus = kzalloc(sizeof(struct scsiback_nexus), GFP_KERNEL);
+       if (!tv_nexus) {
+               mutex_unlock(&tpg->tv_tpg_mutex);
+               return -ENOMEM;
+       }
+       /*
+        *  Initialize the struct se_session pointer
+        */
+       tv_nexus->tvn_se_sess = transport_init_session(TARGET_PROT_NORMAL);
+       if (IS_ERR(tv_nexus->tvn_se_sess)) {
+               mutex_unlock(&tpg->tv_tpg_mutex);
+               kfree(tv_nexus);
+               return -ENOMEM;
+       }
+       se_sess = tv_nexus->tvn_se_sess;
+       /*
+        * Since we are running in 'demo mode' this call will generate a
+        * struct se_node_acl for the scsiback struct se_portal_group with
+        * the SCSI Initiator port name of the passed configfs group 'name'.
+        */
+       tv_nexus->tvn_se_sess->se_node_acl = core_tpg_check_initiator_node_acl(
+                               se_tpg, (unsigned char *)name);
+       if (!tv_nexus->tvn_se_sess->se_node_acl) {
+               mutex_unlock(&tpg->tv_tpg_mutex);
+               pr_debug("core_tpg_check_initiator_node_acl() failed for %s\n",
+                        name);
+               goto out;
+       }
+       /*
+        * Now register the TCM pvscsi virtual I_T Nexus as active with the
+        * call to __transport_register_session()
+        */
+       __transport_register_session(se_tpg, tv_nexus->tvn_se_sess->se_node_acl,
+                       tv_nexus->tvn_se_sess, tv_nexus);
+       tpg->tpg_nexus = tv_nexus;
+
+       mutex_unlock(&tpg->tv_tpg_mutex);
+       return 0;
+
+out:
+       transport_free_session(se_sess);
+       kfree(tv_nexus);
+       return -ENOMEM;
+}
+
+static int scsiback_drop_nexus(struct scsiback_tpg *tpg)
+{
+       struct se_session *se_sess;
+       struct scsiback_nexus *tv_nexus;
+
+       mutex_lock(&tpg->tv_tpg_mutex);
+       tv_nexus = tpg->tpg_nexus;
+       if (!tv_nexus) {
+               mutex_unlock(&tpg->tv_tpg_mutex);
+               return -ENODEV;
+       }
+
+       se_sess = tv_nexus->tvn_se_sess;
+       if (!se_sess) {
+               mutex_unlock(&tpg->tv_tpg_mutex);
+               return -ENODEV;
+       }
+
+       if (tpg->tv_tpg_port_count != 0) {
+               mutex_unlock(&tpg->tv_tpg_mutex);
+               pr_err("Unable to remove xen-pvscsi I_T Nexus with active TPG port count: %d\n",
+                       tpg->tv_tpg_port_count);
+               return -EBUSY;
+       }
+
+       if (tpg->tv_tpg_fe_count != 0) {
+               mutex_unlock(&tpg->tv_tpg_mutex);
+               pr_err("Unable to remove xen-pvscsi I_T Nexus with active TPG frontend count: %d\n",
+                       tpg->tv_tpg_fe_count);
+               return -EBUSY;
+       }
+
+       pr_debug("xen-pvscsi: Removing I_T Nexus to emulated %s Initiator Port: %s\n",
+               scsiback_dump_proto_id(tpg->tport),
+               tv_nexus->tvn_se_sess->se_node_acl->initiatorname);
+
+       /*
+        * Release the SCSI I_T Nexus to the emulated xen-pvscsi Target Port
+        */
+       transport_deregister_session(tv_nexus->tvn_se_sess);
+       tpg->tpg_nexus = NULL;
+       mutex_unlock(&tpg->tv_tpg_mutex);
+
+       kfree(tv_nexus);
+       return 0;
+}
+
+static ssize_t scsiback_tpg_show_nexus(struct se_portal_group *se_tpg,
+                                       char *page)
+{
+       struct scsiback_tpg *tpg = container_of(se_tpg,
+                               struct scsiback_tpg, se_tpg);
+       struct scsiback_nexus *tv_nexus;
+       ssize_t ret;
+
+       mutex_lock(&tpg->tv_tpg_mutex);
+       tv_nexus = tpg->tpg_nexus;
+       if (!tv_nexus) {
+               mutex_unlock(&tpg->tv_tpg_mutex);
+               return -ENODEV;
+       }
+       ret = snprintf(page, PAGE_SIZE, "%s\n",
+                       tv_nexus->tvn_se_sess->se_node_acl->initiatorname);
+       mutex_unlock(&tpg->tv_tpg_mutex);
+
+       return ret;
+}
+
+static ssize_t scsiback_tpg_store_nexus(struct se_portal_group *se_tpg,
+                                        const char *page,
+                                        size_t count)
+{
+       struct scsiback_tpg *tpg = container_of(se_tpg,
+                               struct scsiback_tpg, se_tpg);
+       struct scsiback_tport *tport_wwn = tpg->tport;
+       unsigned char i_port[VSCSI_NAMELEN], *ptr, *port_ptr;
+       int ret;
+       /*
+        * Shut down the active I_T nexus if 'NULL' is passed.
+        */
+       if (!strncmp(page, "NULL", 4)) {
+               ret = scsiback_drop_nexus(tpg);
+               return (!ret) ? count : ret;
+       }
+       /*
+        * Otherwise make sure the passed virtual Initiator port WWN matches
+        * the fabric protocol_id set in scsiback_make_tport(), and call
+        * scsiback_make_nexus().
+        */
+       if (strlen(page) >= VSCSI_NAMELEN) {
+               pr_err("Emulated NAA Sas Address: %s, exceeds max: %d\n",
+                       page, VSCSI_NAMELEN);
+               return -EINVAL;
+       }
+       snprintf(&i_port[0], VSCSI_NAMELEN, "%s", page);
+
+       ptr = strstr(i_port, "naa.");
+       if (ptr) {
+               if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_SAS) {
+                       pr_err("Passed SAS Initiator Port %s does not match target port protoid: %s\n",
+                               i_port, scsiback_dump_proto_id(tport_wwn));
+                       return -EINVAL;
+               }
+               port_ptr = &i_port[0];
+               goto check_newline;
+       }
+       ptr = strstr(i_port, "fc.");
+       if (ptr) {
+               if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_FCP) {
+                       pr_err("Passed FCP Initiator Port %s does not match target port protoid: %s\n",
+                               i_port, scsiback_dump_proto_id(tport_wwn));
+                       return -EINVAL;
+               }
+               port_ptr = &i_port[3]; /* Skip over "fc." */
+               goto check_newline;
+       }
+       ptr = strstr(i_port, "iqn.");
+       if (ptr) {
+               if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_ISCSI) {
+                       pr_err("Passed iSCSI Initiator Port %s does not match target port protoid: %s\n",
+                               i_port, scsiback_dump_proto_id(tport_wwn));
+                       return -EINVAL;
+               }
+               port_ptr = &i_port[0];
+               goto check_newline;
+       }
+       pr_err("Unable to locate prefix for emulated Initiator Port: %s\n",
+               i_port);
+       return -EINVAL;
+       /*
+        * Clear any trailing newline for the NAA WWN
+        */
+check_newline:
+       if (i_port[strlen(i_port) - 1] == '\n')
+               i_port[strlen(i_port) - 1] = '\0';
+
+       ret = scsiback_make_nexus(tpg, port_ptr);
+       if (ret < 0)
+               return ret;
+
+       return count;
+}
+
+TF_TPG_BASE_ATTR(scsiback, nexus, S_IRUGO | S_IWUSR);
+
+static struct configfs_attribute *scsiback_tpg_attrs[] = {
+       &scsiback_tpg_nexus.attr,
+       NULL,
+};
+
+static ssize_t
+scsiback_wwn_show_attr_version(struct target_fabric_configfs *tf,
+                               char *page)
+{
+       return sprintf(page, "xen-pvscsi fabric module %s on %s/%s on "
+               UTS_RELEASE"\n",
+               VSCSI_VERSION, utsname()->sysname, utsname()->machine);
+}
+
+TF_WWN_ATTR_RO(scsiback, version);
+
+static struct configfs_attribute *scsiback_wwn_attrs[] = {
+       &scsiback_wwn_version.attr,
+       NULL,
+};
+
+static char *scsiback_get_fabric_name(void)
+{
+       return "xen-pvscsi";
+}
+
+static int scsiback_port_link(struct se_portal_group *se_tpg,
+                              struct se_lun *lun)
+{
+       struct scsiback_tpg *tpg = container_of(se_tpg,
+                               struct scsiback_tpg, se_tpg);
+
+       mutex_lock(&tpg->tv_tpg_mutex);
+       tpg->tv_tpg_port_count++;
+       mutex_unlock(&tpg->tv_tpg_mutex);
+
+       return 0;
+}
+
+static void scsiback_port_unlink(struct se_portal_group *se_tpg,
+                                 struct se_lun *lun)
+{
+       struct scsiback_tpg *tpg = container_of(se_tpg,
+                               struct scsiback_tpg, se_tpg);
+
+       mutex_lock(&tpg->tv_tpg_mutex);
+       tpg->tv_tpg_port_count--;
+       mutex_unlock(&tpg->tv_tpg_mutex);
+}
+
+static struct se_portal_group *
+scsiback_make_tpg(struct se_wwn *wwn,
+                  struct config_group *group,
+                  const char *name)
+{
+       struct scsiback_tport *tport = container_of(wwn,
+                       struct scsiback_tport, tport_wwn);
+
+       struct scsiback_tpg *tpg;
+       u16 tpgt;
+       int ret;
+
+       if (strstr(name, "tpgt_") != name)
+               return ERR_PTR(-EINVAL);
+       ret = kstrtou16(name + 5, 10, &tpgt);
+       if (ret)
+               return ERR_PTR(ret);
+
+       tpg = kzalloc(sizeof(struct scsiback_tpg), GFP_KERNEL);
+       if (!tpg)
+               return ERR_PTR(-ENOMEM);
+
+       mutex_init(&tpg->tv_tpg_mutex);
+       INIT_LIST_HEAD(&tpg->tv_tpg_list);
+       INIT_LIST_HEAD(&tpg->info_list);
+       tpg->tport = tport;
+       tpg->tport_tpgt = tpgt;
+
+       ret = core_tpg_register(&scsiback_fabric_configfs->tf_ops, wwn,
+                               &tpg->se_tpg, tpg, TRANSPORT_TPG_TYPE_NORMAL);
+       if (ret < 0) {
+               kfree(tpg);
+               return NULL;
+       }
+       mutex_lock(&scsiback_mutex);
+       list_add_tail(&tpg->tv_tpg_list, &scsiback_list);
+       mutex_unlock(&scsiback_mutex);
+
+       return &tpg->se_tpg;
+}
+
+static void scsiback_drop_tpg(struct se_portal_group *se_tpg)
+{
+       struct scsiback_tpg *tpg = container_of(se_tpg,
+                               struct scsiback_tpg, se_tpg);
+
+       mutex_lock(&scsiback_mutex);
+       list_del(&tpg->tv_tpg_list);
+       mutex_unlock(&scsiback_mutex);
+       /*
+        * Release the virtual I_T Nexus for this xen-pvscsi TPG
+        */
+       scsiback_drop_nexus(tpg);
+       /*
+        * Deregister the se_tpg from TCM.
+        */
+       core_tpg_deregister(se_tpg);
+       kfree(tpg);
+}
+
+static int scsiback_check_true(struct se_portal_group *se_tpg)
+{
+       return 1;
+}
+
+static int scsiback_check_false(struct se_portal_group *se_tpg)
+{
+       return 0;
+}
+
+static struct target_core_fabric_ops scsiback_ops = {
+       .get_fabric_name                = scsiback_get_fabric_name,
+       .get_fabric_proto_ident         = scsiback_get_fabric_proto_ident,
+       .tpg_get_wwn                    = scsiback_get_fabric_wwn,
+       .tpg_get_tag                    = scsiback_get_tag,
+       .tpg_get_default_depth          = scsiback_get_default_depth,
+       .tpg_get_pr_transport_id        = scsiback_get_pr_transport_id,
+       .tpg_get_pr_transport_id_len    = scsiback_get_pr_transport_id_len,
+       .tpg_parse_pr_out_transport_id  = scsiback_parse_pr_out_transport_id,
+       .tpg_check_demo_mode            = scsiback_check_true,
+       .tpg_check_demo_mode_cache      = scsiback_check_true,
+       .tpg_check_demo_mode_write_protect = scsiback_check_false,
+       .tpg_check_prod_mode_write_protect = scsiback_check_false,
+       .tpg_alloc_fabric_acl           = scsiback_alloc_fabric_acl,
+       .tpg_release_fabric_acl         = scsiback_release_fabric_acl,
+       .tpg_get_inst_index             = scsiback_tpg_get_inst_index,
+       .check_stop_free                = scsiback_check_stop_free,
+       .release_cmd                    = scsiback_release_cmd,
+       .put_session                    = NULL,
+       .shutdown_session               = scsiback_shutdown_session,
+       .close_session                  = scsiback_close_session,
+       .sess_get_index                 = scsiback_sess_get_index,
+       .sess_get_initiator_sid         = NULL,
+       .write_pending                  = scsiback_write_pending,
+       .write_pending_status           = scsiback_write_pending_status,
+       .set_default_node_attributes    = scsiback_set_default_node_attrs,
+       .get_task_tag                   = scsiback_get_task_tag,
+       .get_cmd_state                  = scsiback_get_cmd_state,
+       .queue_data_in                  = scsiback_queue_data_in,
+       .queue_status                   = scsiback_queue_status,
+       .queue_tm_rsp                   = scsiback_queue_tm_rsp,
+       .aborted_task                   = scsiback_aborted_task,
+       /*
+        * Setup callers for generic logic in target_core_fabric_configfs.c
+        */
+       .fabric_make_wwn                = scsiback_make_tport,
+       .fabric_drop_wwn                = scsiback_drop_tport,
+       .fabric_make_tpg                = scsiback_make_tpg,
+       .fabric_drop_tpg                = scsiback_drop_tpg,
+       .fabric_post_link               = scsiback_port_link,
+       .fabric_pre_unlink              = scsiback_port_unlink,
+       .fabric_make_np                 = NULL,
+       .fabric_drop_np                 = NULL,
+#if 0
+       .fabric_make_nodeacl            = scsiback_make_nodeacl,
+       .fabric_drop_nodeacl            = scsiback_drop_nodeacl,
+#endif
+};
+
+static int scsiback_register_configfs(void)
+{
+       struct target_fabric_configfs *fabric;
+       int ret;
+
+       pr_debug("xen-pvscsi: fabric module %s on %s/%s on "UTS_RELEASE"\n",
+                VSCSI_VERSION, utsname()->sysname, utsname()->machine);
+       /*
+        * Register the top level struct config_item_type with TCM core
+        */
+       fabric = target_fabric_configfs_init(THIS_MODULE, "xen-pvscsi");
+       if (IS_ERR(fabric))
+               return PTR_ERR(fabric);
+
+       /*
+        * Setup fabric->tf_ops from our local scsiback_ops
+        */
+       fabric->tf_ops = scsiback_ops;
+       /*
+        * Setup default attribute lists for various fabric->tf_cit_tmpl
+        */
+       fabric->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = scsiback_wwn_attrs;
+       fabric->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = scsiback_tpg_attrs;
+       fabric->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = NULL;
+       fabric->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = scsiback_param_attrs;
+       fabric->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL;
+       fabric->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = NULL;
+       fabric->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = NULL;
+       fabric->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = NULL;
+       fabric->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = NULL;
+       /*
+        * Register the fabric for use within TCM
+        */
+       ret = target_fabric_configfs_register(fabric);
+       if (ret < 0) {
+               target_fabric_configfs_free(fabric);
+               return ret;
+       }
+       /*
+        * Setup our local pointer to *fabric
+        */
+       scsiback_fabric_configfs = fabric;
+       pr_debug("xen-pvscsi: Set fabric -> scsiback_fabric_configfs\n");
+       return 0;
+};
+
+static void scsiback_deregister_configfs(void)
+{
+       if (!scsiback_fabric_configfs)
+               return;
+
+       target_fabric_configfs_deregister(scsiback_fabric_configfs);
+       scsiback_fabric_configfs = NULL;
+       pr_debug("xen-pvscsi: Cleared scsiback_fabric_configfs\n");
+};
+
+static const struct xenbus_device_id scsiback_ids[] = {
+       { "vscsi" },
+       { "" }
+};
+
+static struct xenbus_driver scsiback_driver = {
+       .ids                    = scsiback_ids,
+       .probe                  = scsiback_probe,
+       .remove                 = scsiback_remove,
+       .otherend_changed       = scsiback_frontend_changed
+};
+
+static void scsiback_init_pend(void *p)
+{
+       struct vscsibk_pend *pend = p;
+       int i;
+
+       memset(pend, 0, sizeof(*pend));
+       for (i = 0; i < VSCSI_MAX_GRANTS; i++)
+               pend->grant_handles[i] = SCSIBACK_INVALID_HANDLE;
+}
+
+static int __init scsiback_init(void)
+{
+       int ret;
+
+       if (!xen_domain())
+               return -ENODEV;
+
+       scsiback_cachep = kmem_cache_create("vscsiif_cache",
+               sizeof(struct vscsibk_pend), 0, 0, scsiback_init_pend);
+       if (!scsiback_cachep)
+               return -ENOMEM;
+
+       ret = xenbus_register_backend(&scsiback_driver);
+       if (ret)
+               goto out_cache_destroy;
+
+       ret = scsiback_register_configfs();
+       if (ret)
+               goto out_unregister_xenbus;
+
+       return 0;
+
+out_unregister_xenbus:
+       xenbus_unregister_driver(&scsiback_driver);
+out_cache_destroy:
+       kmem_cache_destroy(scsiback_cachep);
+       pr_err("xen-pvscsi: %s: error %d\n", __func__, ret);
+       return ret;
+}
+
+static void __exit scsiback_exit(void)
+{
+       struct page *page;
+
+       while (free_pages_num) {
+               if (get_free_page(&page))
+                       BUG();
+               free_xenballooned_pages(1, &page);
+       }
+       scsiback_deregister_configfs();
+       xenbus_unregister_driver(&scsiback_driver);
+       kmem_cache_destroy(scsiback_cachep);
+}
+
+module_init(scsiback_init);
+module_exit(scsiback_exit);
+
+MODULE_DESCRIPTION("Xen SCSI backend driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("xen-backend:vscsi");
+MODULE_AUTHOR("Juergen Gross <jgross@suse.com>");
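Because the backend registers as a TCM fabric module, the physical side of the translation table is wired up through configfs before the XenStore entries above can resolve. A hypothetical layout (directory and attribute names follow from the handlers above; the WWN and the backstore are invented for illustration):

        /sys/kernel/config/target/xen-pvscsi/naa.60014054ac780582/           (scsiback_make_tport)
        /sys/kernel/config/target/xen-pvscsi/naa.60014054ac780582/tpgt_1/    (scsiback_make_tpg)
            nexus        <- write "naa.60014054ac780582"                     (scsiback_tpg_store_nexus)
            param/alias  <- optional alternate name matched by scsiback_add_translation_entry
            lun/lun_0/   <- symlink to a TCM backstore device                (scsiback_port_link)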
index 439c9dc..ca74410 100644 (file)
@@ -259,7 +259,6 @@ static char *error_path(struct xenbus_device *dev)
 static void xenbus_va_dev_error(struct xenbus_device *dev, int err,
                                const char *fmt, va_list ap)
 {
-       int ret;
        unsigned int len;
        char *printf_buffer = NULL;
        char *path_buffer = NULL;
@@ -270,9 +269,7 @@ static void xenbus_va_dev_error(struct xenbus_device *dev, int err,
                goto fail;
 
        len = sprintf(printf_buffer, "%i ", -err);
-       ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
-
-       BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1);
+       vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
 
        dev_err(&dev->dev, "%s\n", printf_buffer);
 
@@ -361,8 +358,8 @@ static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err,
  * @ring_mfn: mfn of ring to grant
 
  * Grant access to the given @ring_mfn to the peer of the given device.  Return
- * 0 on success, or -errno on error.  On error, the device will switch to
- * XenbusStateClosing, and the error will be saved in the store.
+ * a grant reference on success, or -errno on error. On error, the device will
+ * switch to XenbusStateClosing, and the error will be saved in the store.
  */
 int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
 {
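With the corrected comment, a caller is expected to treat a non-negative return value as the grant reference itself rather than a success code. A minimal, hypothetical frontend-side sketch (allocation style borrowed from other Xen frontends; the names are illustrative):

        struct vscsiif_sring *sring;
        int err;

        sring = (struct vscsiif_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH);
        if (!sring)
                return -ENOMEM;
        SHARED_RING_INIT(sring);

        err = xenbus_grant_ring(dev, virt_to_mfn(sring));
        if (err < 0) {
                free_page((unsigned long)sring);
                return err;     /* error already recorded in the store, per the comment above */
        }
        ring_ref = err;         /* this grant reference is what gets written to "ring-ref" */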
index 3c0a74b..564b315 100644 (file)
@@ -297,9 +297,13 @@ void xenbus_dev_shutdown(struct device *_dev)
 EXPORT_SYMBOL_GPL(xenbus_dev_shutdown);
 
 int xenbus_register_driver_common(struct xenbus_driver *drv,
-                                 struct xen_bus_type *bus)
+                                 struct xen_bus_type *bus,
+                                 struct module *owner, const char *mod_name)
 {
+       drv->driver.name = drv->name ? drv->name : drv->ids[0].devicetype;
        drv->driver.bus = &bus->bus;
+       drv->driver.owner = owner;
+       drv->driver.mod_name = mod_name;
 
        return driver_register(&drv->driver);
 }
index 1085ec2..c9ec7ca 100644 (file)
@@ -60,7 +60,9 @@ extern int xenbus_match(struct device *_dev, struct device_driver *_drv);
 extern int xenbus_dev_probe(struct device *_dev);
 extern int xenbus_dev_remove(struct device *_dev);
 extern int xenbus_register_driver_common(struct xenbus_driver *drv,
-                                        struct xen_bus_type *bus);
+                                        struct xen_bus_type *bus,
+                                        struct module *owner,
+                                        const char *mod_name);
 extern int xenbus_probe_node(struct xen_bus_type *bus,
                             const char *type,
                             const char *nodename);
index 5125dce..04f7f85 100644 (file)
@@ -234,13 +234,15 @@ int xenbus_dev_is_online(struct xenbus_device *dev)
 }
 EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
 
-int xenbus_register_backend(struct xenbus_driver *drv)
+int __xenbus_register_backend(struct xenbus_driver *drv, struct module *owner,
+                             const char *mod_name)
 {
        drv->read_otherend_details = read_frontend_details;
 
-       return xenbus_register_driver_common(drv, &xenbus_backend);
+       return xenbus_register_driver_common(drv, &xenbus_backend,
+                                            owner, mod_name);
 }
-EXPORT_SYMBOL_GPL(xenbus_register_backend);
+EXPORT_SYMBOL_GPL(__xenbus_register_backend);
 
 static int backend_probe_and_watch(struct notifier_block *notifier,
                                   unsigned long event,
index cb385c1..bcb53bd 100644 (file)
@@ -317,13 +317,15 @@ static void wait_for_devices(struct xenbus_driver *xendrv)
                         print_device_status);
 }
 
-int xenbus_register_frontend(struct xenbus_driver *drv)
+int __xenbus_register_frontend(struct xenbus_driver *drv, struct module *owner,
+                              const char *mod_name)
 {
        int ret;
 
        drv->read_otherend_details = read_backend_details;
 
-       ret = xenbus_register_driver_common(drv, &xenbus_frontend);
+       ret = xenbus_register_driver_common(drv, &xenbus_frontend,
+                                           owner, mod_name);
        if (ret)
                return ret;
 
@@ -332,7 +334,7 @@ int xenbus_register_frontend(struct xenbus_driver *drv)
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(xenbus_register_frontend);
+EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
 
 static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq);
 static int backend_state;
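The new owner/mod_name parameters suggest the old entry points become thin wrappers. A sketch of the presumed companion change in include/xen/xenbus.h (not shown in this excerpt), which keeps existing callers such as xenbus_register_backend(&scsiback_driver) compiling unchanged:

        #define xenbus_register_frontend(drv) \
                __xenbus_register_frontend(drv, THIS_MODULE, KBUILD_MODNAME)
        #define xenbus_register_backend(drv) \
                __xenbus_register_backend(drv, THIS_MODULE, KBUILD_MODNAME)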
index 8bee7a7..5321cd9 100644 (file)
@@ -28,6 +28,8 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
                           unsigned long irqflags,
                           const char *devname,
                           void *dev_id);
+int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
+                                  unsigned int remote_port);
 int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
                                          unsigned int remote_port,
                                          irq_handler_t handler,
index 6f4eae3..f90b034 100644 (file)
@@ -3,6 +3,24 @@
  *
  * Definitions used for the Xen ELF notes.
  *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
  * Copyright (c) 2006, Ian Campbell, XenSource Ltd.
  */
 
  *
  * LEGACY indicated the fields in the legacy __xen_guest string which
  * this a note type replaces.
+ *
+ * String values (for non-legacy) are NULL terminated ASCII, also known
+ * as ASCIZ type.
  */
 
 /*
  * NAME=VALUE pair (string).
- *
- * LEGACY: FEATURES and PAE
  */
 #define XEN_ELFNOTE_INFO           0
 
 
 /*
  * Whether or not the guest supports cooperative suspend cancellation.
+ * This is a numeric value.
+ *
+ * Default is 0
  */
 #define XEN_ELFNOTE_SUSPEND_CANCEL 14
 
+/*
+ * The (non-default) location the initial phys-to-machine map should be
+ * placed at by the hypervisor (Dom0) or the tools (DomU).
+ * The kernel must be prepared for this mapping to be established using
+ * large pages, despite such otherwise not being available to guests.
+ * The kernel must also be able to handle the page table pages used for
+ * this mapping not being accessible through the initial mapping.
+ * (Only x86-64 supports this at present.)
+ */
+#define XEN_ELFNOTE_INIT_P2M      15
+
+/*
+ * Whether or not the guest can deal with being passed an initrd not
+ * mapped through its initial page tables.
+ */
+#define XEN_ELFNOTE_MOD_START_PFN 16
+
 /*
  * The features supported by this kernel (numeric).
  *
  */
 #define XEN_ELFNOTE_SUPPORTED_FEATURES 17
 
+/*
+ * The number of the highest elfnote defined.
+ */
+#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUPPORTED_FEATURES
+
 #endif /* __XEN_PUBLIC_ELFNOTE_H__ */
 
 /*
diff --git a/include/xen/interface/io/vscsiif.h b/include/xen/interface/io/vscsiif.h
new file mode 100644 (file)
index 0000000..d07d7ac
--- /dev/null
@@ -0,0 +1,229 @@
+/******************************************************************************
+ * vscsiif.h
+ *
+ * Based on the blkif.h code.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright(c) FUJITSU Limited 2008.
+ */
+
+#ifndef __XEN__PUBLIC_IO_SCSI_H__
+#define __XEN__PUBLIC_IO_SCSI_H__
+
+#include "ring.h"
+#include "../grant_table.h"
+
+/*
+ * Feature and Parameter Negotiation
+ * =================================
+ * The two halves of a Xen pvSCSI driver utilize nodes within the XenStore to
+ * communicate capabilities and to negotiate operating parameters.  This
+ * section enumerates these nodes which reside in the respective front and
+ * backend portions of the XenStore, following the XenBus convention.
+ *
+ * Any specified default value is in effect if the corresponding XenBus node
+ * is not present in the XenStore.
+ *
+ * XenStore nodes in sections marked "PRIVATE" are solely for use by the
+ * driver side whose XenBus tree contains them.
+ *
+ *****************************************************************************
+ *                            Backend XenBus Nodes
+ *****************************************************************************
+ *
+ *------------------ Backend Device Identification (PRIVATE) ------------------
+ *
+ * p-devname
+ *      Values:         string
+ *
+ *      A free-form string used to identify the physical device (e.g. a disk name).
+ *
+ * p-dev
+ *      Values:         string
+ *
+ *      A string specifying the backend device: either a 4-tuple "h:c:t:l"
+ *      (host, controller, target, lun, all integers), or a WWN (e.g.
+ *      "naa.60014054ac780582").
+ *
+ * v-dev
+ *      Values:         string
+ *
+ *      A string specifying the frontend device in form of a 4-tuple "h:c:t:l"
+ *      (host, controller, target, lun, all integers).
+ *
+ *--------------------------------- Features ---------------------------------
+ *
+ * feature-sg-grant
+ *      Values:         unsigned [VSCSIIF_SG_TABLESIZE...65535]
+ *      Default Value:  0
+ *
+ *      Specifies the maximum number of scatter/gather elements in grant pages
+ *      supported. If not set, the backend supports up to VSCSIIF_SG_TABLESIZE
+ *      SG elements specified directly in the request.
+ *
+ *****************************************************************************
+ *                            Frontend XenBus Nodes
+ *****************************************************************************
+ *
+ *----------------------- Request Transport Parameters -----------------------
+ *
+ * event-channel
+ *      Values:         unsigned
+ *
+ *      The identifier of the Xen event channel used to signal activity
+ *      in the ring buffer.
+ *
+ * ring-ref
+ *      Values:         unsigned
+ *
+ *      The Xen grant reference granting permission for the backend to map
+ *      the sole page in a single page sized ring buffer.
+ *
+ * protocol
+ *      Values:         string (XEN_IO_PROTO_ABI_*)
+ *      Default Value:  XEN_IO_PROTO_ABI_NATIVE
+ *
+ *      The machine ABI rules governing the format of all ring request and
+ *      response structures.
+ */
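As a concrete picture of the negotiation described above, a connected device might end up with nodes along these lines (values are purely illustrative and the paths are abbreviated):

        frontend:  .../ring-ref          = "8"
                   .../event-channel     = "23"
                   .../protocol          = "x86_64-abi"     (XEN_IO_PROTO_ABI_X86_64)
        backend:   .../feature-sg-grant  = "128"            (hypothetical; the backend above writes SG_ALL)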
+
+/* Requests from the frontend to the backend */
+
+/*
+ * Request a SCSI operation specified via a CDB in vscsiif_request.cmnd.
+ * The target is specified via channel, id and lun.
+ *
+ * The operation to be performed is specified via a CDB in cmnd[], the length
+ * of the CDB is in cmd_len. sc_data_direction specifies the direction of data
+ * (to the device, from the device, or none at all).
+ *
+ * If data is to be transferred to or from the device, the buffer(s) in guest
+ * memory are specified via one or more scsiif_request_segment descriptors,
+ * each specifying a memory page via a grant_ref_t, an offset into the page
+ * and the length of the area in that page. All scsiif_request_segment areas
+ * concatenated form the resulting data buffer used by the operation.
+ * If the number of scsiif_request_segment areas is not larger than
+ * VSCSIIF_SG_TABLESIZE, the areas can be specified directly in the seg[]
+ * array and the number of valid scsiif_request_segment elements is set in
+ * nr_segments.
+ *
+ * If "feature-sg-grant" in the Xenstore is set it is possible to specify more
+ * than VSCSIIF_SG_TABLESIZE scsiif_request_segment elements via indirection.
+ * The maximum number of allowed scsiif_request_segment elements is the value
+ * of the "feature-sg-grant" entry from Xenstore. When using indirection the
+ * seg[] array doesn't contain specifications of the data buffers, but
+ * references to scsiif_request_segment arrays, which in turn reference the
+ * data buffers. While nr_segments holds the number of populated seg[] entries
+ * (plus the set VSCSIIF_SG_GRANT bit), the number of scsiif_request_segment
+ * elements referencing the target data buffers is calculated from the lengths
+ * of the seg[] elements (the sum of all valid seg[].length divided by the
+ * size of one scsiif_request_segment structure).
+ */
+#define VSCSIIF_ACT_SCSI_CDB           1
+
+/*
+ * Request abort of a running operation for the specified target given by
+ * channel, id, lun and the operation's rqid in ref_rqid.
+ */
+#define VSCSIIF_ACT_SCSI_ABORT         2
+
+/*
+ * Request a device reset of the specified target (channel and id).
+ */
+#define VSCSIIF_ACT_SCSI_RESET         3
+
+/*
+ * Preset scatter/gather elements for a following request. Deprecated.
+ * Keeping the define only to avoid usage of the value "4" for other actions.
+ */
+#define VSCSIIF_ACT_SCSI_SG_PRESET     4
+
+/*
+ * Maximum scatter/gather segments per request.
+ *
+ * Considering the balance between allocating at least 16 "vscsiif_request"
+ * structures on one page (4096 bytes) and the number of scatter/gather
+ * elements needed, we decided to use 26 as a magic number.
+ *
+ * If "feature-sg-grant" is set, more scatter/gather elements can be specified
+ * by placing them in one or more (up to VSCSIIF_SG_TABLESIZE) granted pages.
+ * In this case the vscsiif_request seg elements don't contain references to
+ * the user data, but to the SG elements referencing the user data.
+ */
+#define VSCSIIF_SG_TABLESIZE           26
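
For reference, a back-of-the-envelope check (assuming a 4-byte grant_ref_t and no
implicit padding in the request structure shown below): the fixed header fields add up
to 32 bytes, seg[26] adds 26 * 8 = 208 bytes and reserved[3] adds 12 bytes, giving the
252-byte request noted below, so 16 requests (4032 bytes) fit in one 4096-byte page.
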
+
+/*
+ * Based on Linux kernel 2.6.18; still valid.
+ * Changing these values requires support of multiple protocols via the rings
+ * as "old clients" will blindly use these values and the resulting structure
+ * sizes.
+ */
+#define VSCSIIF_MAX_COMMAND_SIZE       16
+#define VSCSIIF_SENSE_BUFFERSIZE       96
+
+struct scsiif_request_segment {
+       grant_ref_t gref;
+       uint16_t offset;
+       uint16_t length;
+};
+
+#define VSCSIIF_SG_PER_PAGE (PAGE_SIZE / sizeof(struct scsiif_request_segment))
+
+/* Size of one request is 252 bytes */
+struct vscsiif_request {
+       uint16_t rqid;          /* private guest value, echoed in resp  */
+       uint8_t act;            /* command between backend and frontend */
+       uint8_t cmd_len;        /* valid CDB bytes */
+
+       uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE]; /* the CDB */
+       uint16_t timeout_per_command;   /* deprecated */
+       uint16_t channel, id, lun;      /* (virtual) device specification */
+       uint16_t ref_rqid;              /* command abort reference */
+       uint8_t sc_data_direction;      /* for DMA_TO_DEVICE(1)
+                                          DMA_FROM_DEVICE(2)
+                                          DMA_NONE(3) requests */
+       uint8_t nr_segments;            /* Number of pieces of scatter-gather */
+/*
+ * flag in nr_segments: SG elements via grant page
+ *
+ * If VSCSIIF_SG_GRANT is set, the low 7 bits of nr_segments specify the number
+ * of grant pages containing SG elements. Usable only if "feature-sg-grant" is set.
+ */
+#define VSCSIIF_SG_GRANT       0x80
+
+       struct scsiif_request_segment seg[VSCSIIF_SG_TABLESIZE];
+       uint32_t reserved[3];
+};
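
To make the direct/indirect distinction concrete, here is a hedged backend-side sketch
(example_count_segments() is made up; only the structures and the VSCSIIF_SG_GRANT rule
described above are taken from this header):

/* Sketch only: compute how many scsiif_request_segment entries a request
 * carries. For an indirect request ("feature-sg-grant" negotiated and
 * VSCSIIF_SG_GRANT set) the seg[] entries describe granted pages full of
 * segment descriptors, so the count is derived from their lengths. */
static unsigned int example_count_segments(const struct vscsiif_request *req)
{
	unsigned int nr = req->nr_segments & ~VSCSIIF_SG_GRANT;
	unsigned int i, bytes = 0;

	if (!(req->nr_segments & VSCSIIF_SG_GRANT))
		return nr;		/* direct: seg[] holds the data segments */

	for (i = 0; i < nr; i++)	/* indirect: seg[] points at SG pages */
		bytes += req->seg[i].length;

	return bytes / sizeof(struct scsiif_request_segment);
}
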
+
+/* Size of one response is 252 bytes */
+struct vscsiif_response {
+       uint16_t rqid;          /* identifies request */
+       uint8_t padding;
+       uint8_t sense_len;
+       uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE];
+       int32_t rslt;
+       uint32_t residual_len;  /* request bufflen -
+                                  return the value from physical device */
+       uint32_t reserved[36];
+};
+
+DEFINE_RING_TYPES(vscsiif, struct vscsiif_request, struct vscsiif_response);
+
+#endif /*__XEN__PUBLIC_IO_SCSI_H__*/
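
DEFINE_RING_TYPES() generates struct vscsiif_sring plus the vscsiif_front_ring and
vscsiif_back_ring types used with the generic macros from xen/interface/io/ring.h.
A hedged frontend-side sketch of ring setup and request submission (the function names,
ring_irq and the shared page allocation are assumptions, not the in-tree driver):

#include <xen/interface/io/ring.h>
#include <xen/events.h>

static struct vscsiif_front_ring ring;
static int ring_irq;		/* assumed: bound to "event-channel" elsewhere */

/* Sketch: initialise the single shared page as a vscsiif ring. */
static void example_ring_setup(void *shared_page)
{
	struct vscsiif_sring *sring = shared_page;

	SHARED_RING_INIT(sring);
	FRONT_RING_INIT(&ring, sring, PAGE_SIZE);
}

/* Sketch: queue one (still incomplete) request and kick the backend. */
static void example_queue_request(uint16_t rqid)
{
	struct vscsiif_request *req;
	int notify;

	req = RING_GET_REQUEST(&ring, ring.req_prod_pvt);
	req->rqid = rqid;
	req->act = VSCSIIF_ACT_SCSI_CDB;
	/* ... fill cmnd[], cmd_len, channel/id/lun, sc_data_direction, seg[] ... */

	ring.req_prod_pvt++;
	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&ring, notify);
	if (notify)
		notify_remote_via_irq(ring_irq);
}
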
index de08213..f68719f 100644 (file)
@@ -3,6 +3,24 @@
  *
  * Guest OS interface to Xen.
  *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
  * Copyright (c) 2004, K A Fraser
  */
 
  * VIRTUAL INTERRUPTS
  *
  * Virtual interrupts that a guest OS may receive from Xen.
+ * In the side comments, 'V.' denotes a per-VCPU VIRQ while 'G.' denotes a
+ * global VIRQ. The former can be bound once per VCPU and cannot be re-bound.
+ * The latter can be allocated only once per guest: they must initially be
+ * allocated to VCPU0 but can subsequently be re-bound.
  */
-#define VIRQ_TIMER      0  /* Timebase update, and/or requested timeout.  */
-#define VIRQ_DEBUG      1  /* Request guest to dump debug info.           */
-#define VIRQ_CONSOLE    2  /* (DOM0) Bytes received on emergency console. */
-#define VIRQ_DOM_EXC    3  /* (DOM0) Exceptional event for some domain.   */
-#define VIRQ_DEBUGGER   6  /* (DOM0) A domain has paused for debugging.   */
-#define VIRQ_PCPU_STATE 9  /* (DOM0) PCPU state changed                   */
+#define VIRQ_TIMER      0  /* V. Timebase update, and/or requested timeout.  */
+#define VIRQ_DEBUG      1  /* V. Request guest to dump debug info.           */
+#define VIRQ_CONSOLE    2  /* G. (DOM0) Bytes received on emergency console. */
+#define VIRQ_DOM_EXC    3  /* G. (DOM0) Exceptional event for some domain.   */
+#define VIRQ_TBUF       4  /* G. (DOM0) Trace buffer has records available.  */
+#define VIRQ_DEBUGGER   6  /* G. (DOM0) A domain has paused for debugging.   */
+#define VIRQ_XENOPROF   7  /* V. XenOprofile interrupt: new sample available */
+#define VIRQ_CON_RING   8  /* G. (DOM0) Bytes received on console            */
+#define VIRQ_PCPU_STATE 9  /* G. (DOM0) PCPU state changed                   */
+#define VIRQ_MEM_EVENT  10 /* G. (DOM0) A memory event has occurred          */
+#define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient                     */
+#define VIRQ_ENOMEM     12 /* G. (DOM0) Low on heap memory       */
 
 /* Architecture-specific VIRQ definitions. */
 #define VIRQ_ARCH_0    16
 #define VIRQ_ARCH_7    23
 
 #define NR_VIRQS       24
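
In Linux, such a VIRQ is typically bound to a handler with bind_virq_to_irqhandler()
from <xen/events.h>. A minimal sketch (the handler body and names are made up), binding
the per-VCPU VIRQ_DEBUG on VCPU 0:

#include <linux/interrupt.h>
#include <xen/events.h>

static irqreturn_t example_debug_interrupt(int irq, void *dev_id)
{
	/* dump whatever debug state is useful here */
	return IRQ_HANDLED;
}

static int example_bind_debug_virq(void)
{
	/* VIRQ_DEBUG is a per-VCPU ('V.') VIRQ; bind it on VCPU 0 here */
	int irq = bind_virq_to_irqhandler(VIRQ_DEBUG, 0,
					  example_debug_interrupt, 0,
					  "example-debug", NULL);

	return irq < 0 ? irq : 0;
}
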
+
 /*
- * MMU-UPDATE REQUESTS
- *
- * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.
- * A foreigndom (FD) can be specified (or DOMID_SELF for none).
- * Where the FD has some effect, it is described below.
- * ptr[1:0] specifies the appropriate MMU_* command.
+ * enum neg_errnoval HYPERVISOR_mmu_update(const struct mmu_update reqs[],
+ *                                         unsigned count, unsigned *done_out,
+ *                                         unsigned foreigndom)
+ * @reqs is an array of mmu_update_t structures ((ptr, val) pairs).
+ * @count is the length of the above array.
+ * @done_out is an output parameter indicating the number of completed operations.
+ * @foreigndom[15:0]: FD, the expected owner of data pages referenced in this
+ *                    hypercall invocation. Can be DOMID_SELF.
+ * @foreigndom[31:16]: PFD, the expected owner of pagetable pages referenced
+ *                     in this hypercall invocation. The value of this field
+ *                     (x) encodes the PFD as follows:
+ *                     x == 0 => PFD == DOMID_SELF
+ *                     x != 0 => PFD == x - 1
  *
+ * Sub-commands: ptr[1:0] specifies the appropriate MMU_* command.
+ * -------------
  * ptr[1:0] == MMU_NORMAL_PT_UPDATE:
- * Updates an entry in a page table. If updating an L1 table, and the new
- * table entry is valid/present, the mapped frame must belong to the FD, if
- * an FD has been specified. If attempting to map an I/O page then the
- * caller assumes the privilege of the FD.
+ * Updates an entry in a page table belonging to PFD. If updating an L1 table,
+ * and the new table entry is valid/present, the mapped frame must belong to
+ * FD. If attempting to map an I/O page then the caller assumes the privilege
+ * of the FD.
  * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.
  * FD == DOMID_XEN: Map restricted areas of Xen's heap space.
  * ptr[:2]  -- Machine address of the page-table entry to modify.
  * val      -- Value to write.
  *
+ * There are also certain implicit requirements when using this hypercall. The
+ * pages that make up a pagetable must be mapped read-only in the guest.
+ * This prevents uncontrolled guest updates to the pagetable. Xen strictly
+ * enforces this, and will disallow any pagetable update which would end up
+ * mapping a pagetable page RW, and will disallow using any writable page as a
+ * pagetable. In practice this means that when constructing a page table for a
+ * process, thread, etc., we MUST be very diligent in following these rules:
+ *  1). Start with the top-level page (PGD or, in Xen terms, L4). Fill out
+ *      the entries.
+ *  2). Keep on going, filling out the upper (PUD or L3) and middle (PMD
+ *      or L2) levels.
+ *  3). Start filling out the PTE table (L1) with the PTE entries. Once
+ *      done, make sure to set each of those entries to RO (so the writable
+ *      bit is unset). Once that has been completed, set the PMD (L2) entry
+ *      for this PTE table to RO.
+ *  4). When done with all of the PMD (L2) entries, and all of them have
+ *      been set to RO, set the PUD (L3) to RO as well. Do the same
+ *      operation on the PGD (L4) pagetable entries that have a PUD (L3)
+ *      entry.
+ *  5). Before those pages can be used (i.e. before setting cr3), they MUST
+ *      also be pinned so that the hypervisor can verify the entries. This is
+ *      done via HYPERVISOR_mmuext_op(MMUEXT_PIN_L4_TABLE, guest physical
+ *      frame number of the PGD (L4)). At this point HYPERVISOR_mmuext_op(
+ *      MMUEXT_NEW_BASEPTR, guest physical frame number of the PGD (L4)) can
+ *      be issued.
+ * For 32-bit guests, the L4 is not used (as there are fewer pagetable
+ * levels), so the L3 is used instead.
+ * At this point the pagetables can be modified using the MMU_NORMAL_PT_UPDATE
+ * hypercall. If so desired, the OS can also try to write to a PTE directly
+ * and be trapped by the hypervisor (as the PTE entry is RO).
+ *
+ * To deallocate the pages, the operations are the reverse of the steps
+ * mentioned above. The argument is MMUEXT_UNPIN_TABLE for all levels and the
+ * pagetable MUST NOT be in use (meaning that cr3 does not point to it).
+ *
  * ptr[1:0] == MMU_MACHPHYS_UPDATE:
  * Updates an entry in the machine->pseudo-physical mapping table.
  * ptr[:2]  -- Machine address within the frame whose mapping to modify.
  * ptr[1:0] == MMU_PT_UPDATE_PRESERVE_AD:
  * As MMU_NORMAL_PT_UPDATE above, but A/D bits currently in the PTE are ORed
  * with those in @val.
+ *
+ * @val is usually the machine frame number along with some attributes.
+ * The attributes by default follow the architecture-defined bits, meaning that
+ * if this is an x86_64 machine using the four-level page table layout, the
+ * layout of val is:
+ *  - 63 if set means No execute (NX)
+ *  - 46-13 the machine frame number
+ *  - 12 available for guest
+ *  - 11 available for guest
+ *  - 10 available for guest
+ *  - 9 available for guest
+ *  - 8 global
+ *  - 7 PAT (PSE is disabled, must use hypercall to make 4MB or 2MB pages)
+ *  - 6 dirty
+ *  - 5 accessed
+ *  - 4 page cached disabled
+ *  - 3 page write through
+ *  - 2 userspace accessible
+ *  - 1 writeable
+ *  - 0 present
+ *
+ *  The one bit that does not fit the default layout is PAGE_PSE (also
+ *  called PAGE_PAT). The MMUEXT_[UN]MARK_SUPER arguments to the
+ *  HYPERVISOR_mmuext_op hypercall serve as the mechanism to mark a mapping
+ *  as a 4MB (or 2MB) superpage instead of using the PAGE_PSE bit.
+ *
+ *  The reason that the PAGE_PSE (bit 7) is not utilized is that Xen uses it
+ *  as the Page Attribute Table (PAT) bit - for details please refer to Intel
+ *  SDM 10.12. PAT allows the caching attributes of pages to be set without
+ *  resorting to MTRRs.
+ *
+ *  The PAT MSR is as follows (it is a 64-bit value, each entry is 8 bits):
+ *                    PAT4                 PAT0
+ *  +-----+-----+----+----+----+-----+----+----+
+ *  | UC  | UC- | WC | WB | UC | UC- | WC | WB |  <= Linux
+ *  +-----+-----+----+----+----+-----+----+----+
+ *  | UC  | UC- | WT | WB | UC | UC- | WT | WB |  <= BIOS (default when machine boots)
+ *  +-----+-----+----+----+----+-----+----+----+
+ *  | rsv | rsv | WP | WC | UC | UC- | WT | WB |  <= Xen
+ *  +-----+-----+----+----+----+-----+----+----+
+ *
+ *  The index into this table is formed from three bits of the val entry:
+ *
+ *  PAT/PSE (bit 7) ... PCD (bit 4) .. PWT (bit 3).
+ *
+ *  If all three bits are off, PAT0 is used. If only bit 3 is on, PAT1 is
+ *  used; if bits 3 and 4 are on, PAT2; and so on.
+ *
+ *  As you can see, the Linux PAT1 translates to PAT4 under Xen, which means
+ *  that a guest following Linux's PAT setup that wants Write Combined pages
+ *  MUST use the PAT4 entry, i.e. bit 7 (PAGE_PAT) set. For example, Linux
+ *  only uses PAT0, PAT1 and PAT2 for caching, as:
+ *
+ *   WB = none (so PAT0)
+ *   WC = PWT (bit 3 on)
+ *   UC = PWT | PCD (bits 3 and 4 on).
+ *
+ * To make this work with Xen, the guest needs to translate the WC bit like so:
+ *
+ *  PWT (bit 3 on) --> PAT (bit 7 on) and clear bit 3
+ *
+ * And to translate back:
+ *
+ * PAT (bit 7 on) --> PWT (bit 3 on) and clear bit 7.
  */
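
The WC translation described above amounts to a couple of bit operations; a minimal
sketch (the helper name and EX_* macros are made up, the bit numbers follow the layout
above):

#define EX_PWT (1UL << 3)	/* page write through  */
#define EX_PCD (1UL << 4)	/* page cache disabled */
#define EX_PAT (1UL << 7)	/* PAT (a.k.a. PSE)    */

/* Sketch: convert a Linux-style WC encoding (PWT only, PAT1) into the
 * encoding Xen expects (PAT bit only, PAT4). */
static unsigned long example_linux_wc_to_xen(unsigned long val)
{
	if ((val & (EX_PWT | EX_PCD)) == EX_PWT) {
		val &= ~EX_PWT;
		val |= EX_PAT;
	}
	return val;
}
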
 #define MMU_NORMAL_PT_UPDATE      0 /* checked '*ptr = val'. ptr is MA.       */
 #define MMU_MACHPHYS_UPDATE       1 /* ptr = MA of frame to modify entry for  */
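
A minimal sketch of issuing one MMU_NORMAL_PT_UPDATE through the Linux hypercall
wrapper; example_set_pte(), pte_ma and new_val are made-up names, and struct mmu_update
is the request format declared further down in this header:

#include <asm/xen/hypercall.h>

/* Sketch: update a single PTE; pte_ma is the machine address of the entry,
 * new_val the MFN plus the attribute bits described above. */
static int example_set_pte(uint64_t pte_ma, uint64_t new_val)
{
	struct mmu_update u = {
		.ptr = pte_ma | MMU_NORMAL_PT_UPDATE,
		.val = new_val,
	};
	int done;

	return HYPERVISOR_mmu_update(&u, 1, &done, DOMID_SELF);
}
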
 /*
  * MMU EXTENDED OPERATIONS
  *
- * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
+ * enum neg_errnoval HYPERVISOR_mmuext_op(mmuext_op_t uops[],
+ *                                        unsigned int count,
+ *                                        unsigned int *pdone,
+ *                                        unsigned int foreigndom)
+ */
+/* HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
  * A foreigndom (FD) can be specified (or DOMID_SELF for none).
  * Where the FD has some effect, it is described below.
  *
  * cmd: MMUEXT_FLUSH_CACHE
  * No additional arguments. Writes back and flushes cache contents.
  *
+ * cmd: MMUEXT_FLUSH_CACHE_GLOBAL
+ * No additional arguments. Writes back and flushes cache contents
+ * on all CPUs in the system.
+ *
  * cmd: MMUEXT_SET_LDT
  * linear_addr: Linear address of LDT base (NB. must be page-aligned).
  * nr_ents: Number of entries in LDT.
+ *
+ * cmd: MMUEXT_CLEAR_PAGE
+ * mfn: Machine frame number to be cleared.
+ *
+ * cmd: MMUEXT_COPY_PAGE
+ * mfn: Machine frame number of the destination page.
+ * src_mfn: Machine frame number of the source page.
+ *
+ * cmd: MMUEXT_[UN]MARK_SUPER
+ * mfn: Machine frame number of head of superpage to be [un]marked.
  */
 #define MMUEXT_PIN_L1_TABLE      0
 #define MMUEXT_PIN_L2_TABLE      1
 #define MMUEXT_FLUSH_CACHE      12
 #define MMUEXT_SET_LDT          13
 #define MMUEXT_NEW_USER_BASEPTR 15
+#define MMUEXT_CLEAR_PAGE       16
+#define MMUEXT_COPY_PAGE        17
+#define MMUEXT_FLUSH_CACHE_GLOBAL 18
+#define MMUEXT_MARK_SUPER       19
+#define MMUEXT_UNMARK_SUPER     20
 
 #ifndef __ASSEMBLY__
 struct mmuext_op {
        unsigned int cmd;
        union {
-               /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
+               /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR
+                * CLEAR_PAGE, COPY_PAGE, [UN]MARK_SUPER */
                xen_pfn_t mfn;
                /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
                unsigned long linear_addr;
@@ -198,6 +361,8 @@ struct mmuext_op {
                unsigned int nr_ents;
                /* TLB_FLUSH_MULTI, INVLPG_MULTI */
                void *vcpumask;
+               /* COPY_PAGE */
+               xen_pfn_t src_mfn;
        } arg2;
 };
 DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
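
As a hedged illustration of the new MMUEXT_COPY_PAGE sub-command (example_copy_mfn() is
made up; the hypercall wrapper is the standard Linux one):

#include <asm/xen/hypercall.h>

/* Sketch: ask Xen to copy one machine frame onto another. */
static int example_copy_mfn(xen_pfn_t dst_mfn, xen_pfn_t src_mfn)
{
	struct mmuext_op op = {
		.cmd = MMUEXT_COPY_PAGE,
		.arg1.mfn = dst_mfn,		/* destination frame */
		.arg2.src_mfn = src_mfn,	/* source frame */
	};
	int done;

	return HYPERVISOR_mmuext_op(&op, 1, &done, DOMID_SELF);
}
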
@@ -225,10 +390,23 @@ DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
  */
 #define VMASST_CMD_enable                0
 #define VMASST_CMD_disable               1
+
+/* x86/32 guests: simulate full 4GB segment limits. */
 #define VMASST_TYPE_4gb_segments         0
+
+/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */
 #define VMASST_TYPE_4gb_segments_notify  1
+
+/*
+ * x86 guests: support writes to bottom-level PTEs.
+ * NB1. Page-directory entries cannot be written.
+ * NB2. Guest must continue to remove all writable mappings of PTEs.
+ */
 #define VMASST_TYPE_writable_pagetables  2
+
+/* x86/PAE guests: support PDPTs above 4GB. */
 #define VMASST_TYPE_pae_extended_cr3     3
+
 #define MAX_VMASST_TYPE 3
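
Enabling one of these assists is a single hypercall; a minimal sketch using the Linux
HYPERVISOR_vm_assist() wrapper (the function name is made up):

#include <asm/xen/hypercall.h>

/* Sketch: ask the hypervisor to emulate direct writes to bottom-level PTEs. */
static int example_enable_writable_pagetables(void)
{
	return HYPERVISOR_vm_assist(VMASST_CMD_enable,
				    VMASST_TYPE_writable_pagetables);
}
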
 
 #ifndef __ASSEMBLY__
@@ -260,6 +438,15 @@ typedef uint16_t domid_t;
  */
 #define DOMID_XEN  (0x7FF2U)
 
+/* DOMID_COW is used as the owner of sharable pages */
+#define DOMID_COW  (0x7FF3U)
+
+/* DOMID_INVALID is used to identify pages with unknown owner. */
+#define DOMID_INVALID (0x7FF4U)
+
+/* Idle domain. */
+#define DOMID_IDLE (0x7FFFU)
+
 /*
  * Send an array of these to HYPERVISOR_mmu_update().
  * NB. The fields are natural pointer/address size for this architecture.
@@ -272,7 +459,9 @@ DEFINE_GUEST_HANDLE_STRUCT(mmu_update);
 
 /*
  * Send an array of these to HYPERVISOR_multicall().
- * NB. The fields are natural register size for this architecture.
+ * NB. The fields are logically the natural register size for this
+ * architecture. In cases where xen_ulong_t is larger than this, any unused
+ * bits in the upper portion must be zero.
  */
 struct multicall_entry {
     xen_ulong_t op;
@@ -442,8 +631,48 @@ struct start_info {
        unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
        unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
        int8_t cmd_line[MAX_GUEST_CMDLINE];
+       /* The pfn range here covers both page table and p->m table frames.   */
+       unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.    */
+       unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table.  */
 };
 
+/* These flags are passed in the 'flags' field of start_info_t. */
+#define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
+#define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
+#define SIF_MULTIBOOT_MOD (1<<2)  /* Is mod_start a multiboot module? */
+#define SIF_MOD_START_PFN (1<<3)  /* Is mod_start a PFN? */
+#define SIF_PM_MASK       (0xFF<<8) /* reserve 1 byte for xen-pm options */
+
+/*
+ * A multiboot module is a package containing modules very similar to a
+ * multiboot module array. The only differences are:
+ * - the array of module descriptors is by convention simply at the beginning
+ *   of the multiboot module,
+ * - addresses in the module descriptors are based on the beginning of the
+ *   multiboot module,
+ * - the number of modules is determined by a termination descriptor that has
+ *   mod_start == 0.
+ *
+ * This permits the module package both to be built statically and referenced
+ * in a configuration file, and lets the PV guest easily rebase the addresses
+ * to virtual addresses while counting the number of modules at the same time.
+ */
+struct xen_multiboot_mod_list {
+       /* Address of first byte of the module */
+       uint32_t mod_start;
+       /* Address of last byte of the module (inclusive) */
+       uint32_t mod_end;
+       /* Address of zero-terminated command line */
+       uint32_t cmdline;
+       /* Unused, must be zero */
+       uint32_t pad;
+};
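
Because the module array is terminated by a descriptor with mod_start == 0, counting the
modules is a simple walk; a minimal sketch (example_count_mods() is made up):

/* Sketch: count the entries of a multiboot module package. */
static unsigned int example_count_mods(const struct xen_multiboot_mod_list *list)
{
	unsigned int n = 0;

	while (list[n].mod_start != 0)
		n++;

	return n;
}
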
+/*
+ * The console structure in start_info.console.dom0
+ *
+ * This structure includes a variety of information required to
+ * have a working VGA/VESA console.
+ */
 struct dom0_vga_console_info {
        uint8_t video_type;
 #define XEN_VGATYPE_TEXT_MODE_3 0x03
@@ -484,11 +713,6 @@ struct dom0_vga_console_info {
        } u;
 };
 
-/* These flags are passed in the 'flags' field of start_info_t. */
-#define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
-#define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
-#define SIF_PM_MASK       (0xFF<<8) /* reserve 1 byte for xen-pm options */
-
 typedef uint64_t cpumap_t;
 
 typedef uint8_t xen_domain_handle_t[16];
index 0324c6d..b78f21c 100644 (file)
@@ -86,6 +86,7 @@ struct xenbus_device_id
 
 /* A xenbus driver. */
 struct xenbus_driver {
+       const char *name;       /* defaults to ids[0].devicetype */
        const struct xenbus_device_id *ids;
        int (*probe)(struct xenbus_device *dev,
                     const struct xenbus_device_id *id);
@@ -100,20 +101,22 @@ struct xenbus_driver {
        int (*is_ready)(struct xenbus_device *dev);
 };
 
-#define DEFINE_XENBUS_DRIVER(var, drvname, methods...)         \
-struct xenbus_driver var ## _driver = {                                \
-       .driver.name = drvname + 0 ?: var ## _ids->devicetype,  \
-       .driver.owner = THIS_MODULE,                            \
-       .ids = var ## _ids, ## methods                          \
-}
-
 static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv)
 {
        return container_of(drv, struct xenbus_driver, driver);
 }
 
-int __must_check xenbus_register_frontend(struct xenbus_driver *);
-int __must_check xenbus_register_backend(struct xenbus_driver *);
+int __must_check __xenbus_register_frontend(struct xenbus_driver *drv,
+                                           struct module *owner,
+                                           const char *mod_name);
+int __must_check __xenbus_register_backend(struct xenbus_driver *drv,
+                                          struct module *owner,
+                                          const char *mod_name);
+
+#define xenbus_register_frontend(drv) \
+       __xenbus_register_frontend(drv, THIS_MODULE, KBUILD_MODNAME);
+#define xenbus_register_backend(drv) \
+       __xenbus_register_backend(drv, THIS_MODULE, KBUILD_MODNAME);
 
 void xenbus_unregister_driver(struct xenbus_driver *drv);
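
With DEFINE_XENBUS_DRIVER() removed, a driver now fills in struct xenbus_driver directly
and registers it through the macro above. A hedged sketch of the resulting pattern (the
"vexample" device type, callbacks and names are made up):

#include <linux/module.h>
#include <xen/xenbus.h>

static int example_probe(struct xenbus_device *dev,
			 const struct xenbus_device_id *id)
{
	return 0;
}

static void example_otherend_changed(struct xenbus_device *dev,
				     enum xenbus_state backend_state)
{
}

static const struct xenbus_device_id example_ids[] = {
	{ "vexample" },
	{ "" }
};

static struct xenbus_driver example_driver = {
	.ids = example_ids,		/* .name defaults to ids[0].devicetype */
	.probe = example_probe,
	.otherend_changed = example_otherend_changed,
};

static int __init example_init(void)
{
	/* expands to __xenbus_register_frontend(&example_driver,
	 * THIS_MODULE, KBUILD_MODNAME) */
	return xenbus_register_frontend(&example_driver);
}
module_init(example_init);
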