#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/gfp.h>
++#include <linux/memblock.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/e820.h>
#include <asm/linkage.h>
#include <asm/page.h>
++#include <asm/init.h>
++ #include <asm/pat.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
* large enough to allocate page table pages to allocate the rest.
* Each page can map 2MB.
*/
-- static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
++ #define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
++ static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
#ifdef CONFIG_X86_64
/* l3 pud for userspace vsyscall mapping */
*/
#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
- #define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
- #define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
++ /*
++ * Xen leaves the responsibility for maintaining p2m mappings to the
++ * guests themselves, but it must also access and update the p2m array
++ * during suspend/resume when all the pages are reallocated.
++ *
++ * The p2m table is logically a flat array, but we implement it as a
++ * three-level tree to allow the address space to be sparse.
++ *
++ * Xen
++ * |
++ * p2m_top p2m_top_mfn
++ * / \ / \
++ * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
++ * / \ / \ / /
++ * p2m p2m p2m p2m p2m p2m p2m ...
++ *
++ * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
++ *
++ * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
++ * maximum representable pseudo-physical address space is:
++ * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
++ *
++ * P2M_PER_PAGE depends on the architecture, as an mfn is always an
++ * unsigned long (8 bytes on 64-bit, 4 bytes on 32-bit), giving
++ * 512 and 1024 entries respectively.
++ */
+
++ unsigned long xen_max_p2m_pfn __read_mostly;
- /* Placeholder for holes in the address space */
- static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
- { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
++ #define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
++ #define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
++ #define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
- /* Array of pointers to pages containing p2m entries */
- static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
- { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
++ #define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
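/*
 * Illustrative sizing (editor's note, not part of the patch): with 4 KiB
 * pages, P2M_PER_PAGE is 4096/8 = 512 on 64-bit and 4096/4 = 1024 on
 * 32-bit, and the pointer-sized mid/top levels match.  MAX_P2M_PFN is
 * therefore 512^3 = 2^27 pfns (512 GiB of pseudo-physical space) on
 * 64-bit and 1024^3 = 2^30 pfns (4 TiB) on 32-bit, while only the top
 * page and the shared "missing" pages are always allocated.
 */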
- /* Arrays of p2m arrays expressed in mfns used for save/restore */
- static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
++ /* Placeholders for holes in the address space */
++ static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
++ static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
++ static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
- static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
- __page_aligned_bss;
++ static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
++ static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
++ static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
+
++ RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
++ RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
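/*
 * Illustrative sizing (editor's note, not part of the patch): one
 * fully-populated mid page covers P2M_PER_PAGE * P2M_MID_PER_PAGE pfns
 * (512 * 512 = 262144 pfns, i.e. 1 GiB, on 64-bit).  Assuming a
 * CONFIG_XEN_MAX_DOMAIN_MEMORY of 128 (GiB), MAX_DOMAIN_PAGES is 2^25,
 * so each of the two reservations above amounts to 128 pages (512 KiB)
 * of brk space.
 */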
static inline unsigned p2m_top_index(unsigned long pfn)
{
-- BUG_ON(pfn >= MAX_DOMAIN_PAGES);
-- return pfn / P2M_ENTRIES_PER_PAGE;
++ BUG_ON(pfn >= MAX_P2M_PFN);
++ return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
++ }
++
++ static inline unsigned p2m_mid_index(unsigned long pfn)
++ {
++ return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
}
static inline unsigned p2m_index(unsigned long pfn)
{
-- return pfn % P2M_ENTRIES_PER_PAGE;
++ return pfn % P2M_PER_PAGE;
+ }
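/*
 * Worked example (editor's note, not part of the patch): on 64-bit,
 * P2M_PER_PAGE = P2M_MID_PER_PAGE = 512, so pfn 74565 (0x12345) splits
 * into topidx = 74565 / (512 * 512) = 0, mididx = (74565 / 512) % 512
 * = 145 and idx = 74565 % 512 = 325; its mfn lives at
 * p2m_top[0][145][325].
 */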
+
++ static void p2m_top_init(unsigned long ***top)
++ {
++ unsigned i;
++
++ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
++ top[i] = p2m_mid_missing;
++ }
++
++ static void p2m_top_mfn_init(unsigned long *top)
++ {
++ unsigned i;
++
++ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
++ top[i] = virt_to_mfn(p2m_mid_missing_mfn);
++ }
++
++ static void p2m_top_mfn_p_init(unsigned long **top)
++ {
++ unsigned i;
++
++ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
++ top[i] = p2m_mid_missing_mfn;
++ }
++
++ static void p2m_mid_init(unsigned long **mid)
++ {
++ unsigned i;
++
++ for (i = 0; i < P2M_MID_PER_PAGE; i++)
++ mid[i] = p2m_missing;
++ }
++
++ static void p2m_mid_mfn_init(unsigned long *mid)
++ {
++ unsigned i;
++
++ for (i = 0; i < P2M_MID_PER_PAGE; i++)
++ mid[i] = virt_to_mfn(p2m_missing);
+ }
+
- /* Build the parallel p2m_top_mfn structures */
++ static void p2m_init(unsigned long *p2m)
++ {
++ unsigned i;
++
++ for (i = 0; i < P2M_PER_PAGE; i++)
++ p2m[i] = INVALID_P2M_ENTRY;
++ }
++
++ /*
++ * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
++ *
++ * This is called both at boot time and after resuming from suspend:
++ * - At boot time we're called very early, and must use extend_brk()
++ * to allocate memory.
++ *
++ * - After resume we're called from within stop_machine, but the mfn
++ * tree should already be completely allocated.
++ */
void xen_build_mfn_list_list(void)
{
-- unsigned pfn, idx;
++ unsigned long pfn;
-- for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
-- unsigned topidx = p2m_top_index(pfn);
++ /* Pre-initialize p2m_top_mfn to be completely missing */
++ if (p2m_top_mfn == NULL) {
++ p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_mid_mfn_init(p2m_mid_missing_mfn);
++
++ p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_top_mfn_p_init(p2m_top_mfn_p);
-- p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
++ p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_top_mfn_init(p2m_top_mfn);
++ } else {
++ /* Reinitialise; mfns all change after migration */
++ p2m_mid_mfn_init(p2m_mid_missing_mfn);
}
-- for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
-- unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
-- p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
++ for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
++ unsigned topidx = p2m_top_index(pfn);
++ unsigned mididx = p2m_mid_index(pfn);
++ unsigned long **mid;
++ unsigned long *mid_mfn_p;
++
++ mid = p2m_top[topidx];
++ mid_mfn_p = p2m_top_mfn_p[topidx];
++
++ /* Don't bother allocating any mfn mid levels if
++ * they're just missing; just update the stored mfn,
++ * since all of them could have changed over a migration.
++ */
++ if (mid == p2m_mid_missing) {
++ BUG_ON(mididx);
++ BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
++ p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
++ pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
++ continue;
++ }
++
++ if (mid_mfn_p == p2m_mid_missing_mfn) {
++ /*
++ * XXX boot-time only! We should never find
++ * missing parts of the mfn tree after
++ * runtime. extend_brk() will BUG if we call
++ * it too late.
++ */
++ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_mid_mfn_init(mid_mfn_p);
++
++ p2m_top_mfn_p[topidx] = mid_mfn_p;
++ }
++
++ p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
++ mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
}
}
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
-- virt_to_mfn(p2m_top_mfn_list);
-- HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
++ virt_to_mfn(p2m_top_mfn);
++ HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
}
/* Set up p2m_top to point to the domain-builder provided p2m pages */
{
unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
-- unsigned pfn;
++ unsigned long pfn;
++
++ xen_max_p2m_pfn = max_pfn;
+
- for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
++ p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_init(p2m_missing);
+
++ p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_mid_init(p2m_mid_missing);
++
++ p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_top_init(p2m_top);
++
++ /*
++ * The domain builder gives us a pre-constructed p2m array in
++ * mfn_list for all the pages initially given to us, so we just
++ * need to graft that into our tree structure.
++ */
++ for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
unsigned topidx = p2m_top_index(pfn);
++ unsigned mididx = p2m_mid_index(pfn);
-- p2m_top[topidx] = &mfn_list[pfn];
-- }
++ if (p2m_top[topidx] == p2m_mid_missing) {
++ unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_mid_init(mid);
++
++ p2m_top[topidx] = mid;
++ }
-- xen_build_mfn_list_list();
++ p2m_top[topidx][mididx] = &mfn_list[pfn];
++ }
}
unsigned long get_phys_to_machine(unsigned long pfn)
{
-- unsigned topidx, idx;
++ unsigned topidx, mididx, idx;
-- if (unlikely(pfn >= MAX_DOMAIN_PAGES))
++ if (unlikely(pfn >= MAX_P2M_PFN))
return INVALID_P2M_ENTRY;
topidx = p2m_top_index(pfn);
++ mididx = p2m_mid_index(pfn);
idx = p2m_index(pfn);
-- return p2m_top[topidx][idx];
++
++ return p2m_top[topidx][mididx][idx];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);
-- /* install a new p2m_top page */
-- bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
++ static void *alloc_p2m_page(void)
{
-- unsigned topidx = p2m_top_index(pfn);
-- unsigned long **pfnp, *mfnp;
-- unsigned i;
++ return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
++ }
-- pfnp = &p2m_top[topidx];
-- mfnp = &p2m_top_mfn[topidx];
++ static void free_p2m_page(void *p)
++ {
++ free_page((unsigned long)p);
++ }
-- for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
-- p[i] = INVALID_P2M_ENTRY;
++ /*
++ * Fully allocate the p2m structure for a given pfn. We need to check
++ * that both the top and mid levels are allocated, and make sure the
++ * parallel mfn tree is kept in sync. We may race with other cpus, so
++ * the new pages are installed with cmpxchg; if we lose the race then
++ * simply free the page we allocated and use the one that's there.
++ */
++ static bool alloc_p2m(unsigned long pfn)
++ {
++ unsigned topidx, mididx;
++ unsigned long ***top_p, **mid;
++ unsigned long *top_mfn_p, *mid_mfn;
-- if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
-- *mfnp = virt_to_mfn(p);
-- return true;
++ topidx = p2m_top_index(pfn);
++ mididx = p2m_mid_index(pfn);
++
++ top_p = &p2m_top[topidx];
++ mid = *top_p;
++
++ if (mid == p2m_mid_missing) {
++ /* Mid level is missing, allocate a new one */
++ mid = alloc_p2m_page();
++ if (!mid)
++ return false;
++
++ p2m_mid_init(mid);
++
++ if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
++ free_p2m_page(mid);
}
-- return false;
-- }
++ top_mfn_p = &p2m_top_mfn[topidx];
++ mid_mfn = p2m_top_mfn_p[topidx];
-- static void alloc_p2m(unsigned long pfn)
-- {
-- unsigned long *p;
++ BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
++
++ if (mid_mfn == p2m_mid_missing_mfn) {
++ /* Separately check the mid mfn level */
++ unsigned long missing_mfn;
++ unsigned long mid_mfn_mfn;
+
- p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
- BUG_ON(p == NULL);
++ mid_mfn = alloc_p2m_page();
++ if (!mid_mfn)
++ return false;
+
- if (!install_p2mtop_page(pfn, p))
- free_page((unsigned long)p);
++ p2m_mid_mfn_init(mid_mfn);
++
++ missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
++ mid_mfn_mfn = virt_to_mfn(mid_mfn);
++ if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
++ free_p2m_page(mid_mfn);
++ else
++ p2m_top_mfn_p[topidx] = mid_mfn;
++ }
++
++ if (p2m_top[topidx][mididx] == p2m_missing) {
++ /* p2m leaf page is missing */
++ unsigned long *p2m;
++
++ p2m = alloc_p2m_page();
++ if (!p2m)
++ return false;
+
++ p2m_init(p2m);
++
++ if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
++ free_p2m_page(p2m);
++ else
++ mid_mfn[mididx] = virt_to_mfn(p2m);
++ }
+
++ return true;
}
/* Try to install p2m mapping; fail if intermediate bits missing */
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
-- unsigned topidx, idx;
++ unsigned topidx, mididx, idx;
-- if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
++ if (unlikely(pfn >= MAX_P2M_PFN)) {
BUG_ON(mfn != INVALID_P2M_ENTRY);
return true;
}
topidx = p2m_top_index(pfn);
-- if (p2m_top[topidx] == p2m_missing) {
-- if (mfn == INVALID_P2M_ENTRY)
-- return true;
-- return false;
-- }
--
++ mididx = p2m_mid_index(pfn);
idx = p2m_index(pfn);
-- p2m_top[topidx][idx] = mfn;
++
++ if (p2m_top[topidx][mididx] == p2m_missing)
++ return mfn == INVALID_P2M_ENTRY;
++
++ p2m_top[topidx][mididx][idx] = mfn;
return true;
}
-- void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
++ bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
-- return;
++ return true;
}
if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
-- alloc_p2m(pfn);
++ if (!alloc_p2m(pfn))
++ return false;
if (!__set_phys_to_machine(pfn, mfn))
-- BUG();
++ return false;
}
++
++ return true;
}
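/*
 * Illustrative sketch (editor's note, not part of the patch): since
 * set_phys_to_machine() now reports failure instead of BUG()ing, a
 * hypothetical caller populating new pages would be expected to
 * propagate the error, along these lines:
 */
#if 0	/* example only */
static int example_populate_pfn(unsigned long pfn, unsigned long mfn)
{
	if (!set_phys_to_machine(pfn, mfn))
		return -ENOMEM;	/* p2m page allocation failed */
	return 0;
}
#endif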
unsigned long arbitrary_virt_to_mfn(void *vaddr)
unsigned int level;
pte = lookup_address(address, &level);
-- BUG_ON(pte == NULL);
++ if (pte == NULL)
++ return; /* vaddr missing */
ptev = pte_wrprotect(*pte);
unsigned int level;
pte = lookup_address(address, &level);
-- BUG_ON(pte == NULL);
++ if (pte == NULL)
++ return; /* vaddr missing */
ptev = pte_mkwrite(*pte);
return pte_flags(pte) & _PAGE_IOMAP;
}
- -static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
+ +void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
{
struct multicall_space mcs;
struct mmu_update *u;
u->ptr = arbitrary_virt_to_machine(ptep).maddr;
u->val = pte_val_ma(pteval);
- - MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO);
+ + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
+ +EXPORT_SYMBOL_GPL(xen_set_domain_pte);
+ +
+ +static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
+ +{
+ + xen_set_domain_pte(ptep, pteval, DOMID_IO);
+ +}
static void xen_extend_mmu_update(const struct mmu_update *update)
{
if (val & _PAGE_PRESENT) {
unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
pteval_t flags = val & PTE_FLAGS_MASK;
-- val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
++ unsigned long mfn = pfn_to_mfn(pfn);
++
++ /*
++ * If there's no mfn for the pfn, then just create an
++ * empty non-present pte. Unfortunately this loses
++ * information about the original pfn, so
++ * pte_mfn_to_pfn is asymmetric.
++ */
++ if (unlikely(mfn == INVALID_P2M_ENTRY)) {
++ mfn = 0;
++ flags = 0;
++ }
++
++ val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
}
return val;
pteval_t xen_pte_val(pte_t pte)
{
-- if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP))
-- return pte.pte;
++ pteval_t pteval = pte.pte;
++
++ /* If this is a WC pte, convert back from Xen WC to Linux WC */
++ if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
++ WARN_ON(!pat_enabled);
++ pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
++ }
-- return pte_mfn_to_pfn(pte.pte);
++ if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
++ return pteval;
++
++ return pte_mfn_to_pfn(pteval);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
++ /*
++ * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
++ * are reserved for now, to correspond to the Intel-reserved PAT
++ * types.
++ *
++ * We expect Linux's PAT set as follows:
++ *
++ * Idx PTE flags Linux Xen Default
++ * 0 WB WB WB
++ * 1 PWT WC WT WT
++ * 2 PCD UC- UC- UC-
++ * 3 PCD PWT UC UC UC
++ * 4 PAT WB WC WB
++ * 5 PAT PWT WC WP WT
++ * 6 PAT PCD UC- UC UC-
++ * 7 PAT PCD PWT UC UC UC
++ */
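/*
 * Worked example (editor's note, not part of the patch): the PAT index
 * of a pte is built from its PAT/PCD/PWT bits (idx = PAT*4 + PCD*2 +
 * PWT, each taken as 0 or 1).  Linux programs WC at idx 1 (PWT only),
 * but in the Xen layout above idx 1 is WT and Xen's WC sits at idx 4
 * (PAT only).  xen_make_pte() below therefore rewrites PWT-only ptes
 * to PAT-only, and xen_pte_val() converts them back, so both sides
 * agree on what WC means.
 */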
++
++ void xen_set_pat(u64 pat)
++ {
++ /* We expect Linux to use a PAT setting of
++ * UC UC- WC WB (ignoring the PAT flag) */
++ WARN_ON(pat != 0x0007010600070106ull);
++ }
++
pte_t xen_make_pte(pteval_t pte)
{
phys_addr_t addr = (pte & PTE_PFN_MASK);
++ /* If Linux is trying to set a WC pte, then map to the Xen WC.
++ * If _PAGE_PAT is set, then it probably means it is really
++ * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
++ * things work out OK...
++ *
++ * (We should never see kernel mappings with _PAGE_PSE set,
++ * but we could see hugetlbfs mappings, I think).
++ */
++ if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
++ if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
++ pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
++ }
++
/*
* Unprivileged domains are allowed to do IOMAPpings for
* PCI passthrough, but not map ISA space. The ISA
#endif
}
--#ifdef CONFIG_X86_32
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
++ unsigned long pfn = pte_pfn(pte);
++
++#ifdef CONFIG_X86_32
/* If there's an existing pte, then don't allow _PAGE_RW to be set */
if (pte_val_ma(*ptep) & _PAGE_PRESENT)
pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
pte_val_ma(pte));
++#endif
++
++ /*
++ * If the new pfn is within the range of the newly allocated
++ * kernel pagetable, and it isn't being mapped into an
++ * early_ioremap fixmap slot, make sure it is RO.
++ */
++ if (!is_early_ioremap_ptep(ptep) &&
++ pfn >= e820_table_start && pfn < e820_table_end)
++ pte = pte_wrprotect(pte);
return pte;
}
xen_set_pte(ptep, pte);
}
--#endif
static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
unsigned ident_pte;
unsigned long pfn;
++ level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
++ PAGE_SIZE);
++
ident_pte = 0;
pfn = 0;
for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
pte_page = m2v(pmd[pmdidx].pmd);
else {
/* Check for free pte pages */
-- if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
++ if (ident_pte == LEVEL1_IDENT_ENTRIES)
break;
pte_page = &level1_ident_pgt[ident_pte];
__xen_write_cr3(true, __pa(pgd));
xen_mc_issue(PARAVIRT_LAZY_CPU);
-- reserve_early(__pa(xen_start_info->pt_base),
++ memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
__pa(xen_start_info->pt_base +
xen_start_info->nr_pt_frames * PAGE_SIZE),
"XEN PAGETABLES");
return pgd;
}
#else /* !CONFIG_X86_64 */
-- static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
++ static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD);
__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
unsigned long max_pfn)
{
pmd_t *kernel_pmd;
++ level2_kernel_pgt = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
++
max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
xen_start_info->nr_pt_frames * PAGE_SIZE +
512*1024);
pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
-- reserve_early(__pa(xen_start_info->pt_base),
++ memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
__pa(xen_start_info->pt_base +
xen_start_info->nr_pt_frames * PAGE_SIZE),
"XEN PAGETABLES");
.alloc_pte = xen_alloc_pte_init,
.release_pte = xen_release_pte_init,
.alloc_pmd = xen_alloc_pmd_init,
-- .alloc_pmd_clone = paravirt_nop,
.release_pmd = xen_release_pmd_init,
--#ifdef CONFIG_X86_64
-- .set_pte = xen_set_pte,
--#else
.set_pte = xen_set_pte_init,
--#endif
.set_pte_at = xen_set_pte_at,
.set_pmd = xen_set_pmd_hyper,
}
#endif
+ +#define REMAP_BATCH_SIZE 16
+ +
+ +struct remap_data {
+ + unsigned long mfn;
+ + pgprot_t prot;
+ + struct mmu_update *mmu_update;
+ +};
+ +
+ +static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
+ + unsigned long addr, void *data)
+ +{
+ + struct remap_data *rmd = data;
+ + pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
+ +
+ + rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr;
+ + rmd->mmu_update->val = pte_val_ma(pte);
+ + rmd->mmu_update++;
+ +
+ + return 0;
+ +}
+ +
+ +int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+ + unsigned long addr,
+ + unsigned long mfn, int nr,
+ + pgprot_t prot, unsigned domid)
+ +{
+ + struct remap_data rmd;
+ + struct mmu_update mmu_update[REMAP_BATCH_SIZE];
+ + int batch;
+ + unsigned long range;
+ + int err = 0;
+ +
+ + prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
+ +
+ + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+ +
+ + rmd.mfn = mfn;
+ + rmd.prot = prot;
+ +
+ + while (nr) {
+ + batch = min(REMAP_BATCH_SIZE, nr);
+ + range = (unsigned long)batch << PAGE_SHIFT;
+ +
+ + rmd.mmu_update = mmu_update;
+ + err = apply_to_page_range(vma->vm_mm, addr, range,
+ + remap_area_mfn_pte_fn, &rmd);
+ + if (err)
+ + goto out;
+ +
+ + err = -EFAULT;
+ + if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
+ + goto out;
+ +
+ + nr -= batch;
+ + addr += range;
+ + }
+ +
+ + err = 0;
+ +out:
+ +
+ + flush_tlb_all();
+ +
+ + return err;
+ +}
+ +EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
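/*
 * Illustrative sketch (editor's note, not part of the patch): a caller
 * that already owns a suitably set-up VMA (a privcmd-style mmap
 * handler, say) could map 16 foreign frames starting at 'mfn' from
 * domain 'domid' like this; vma, vaddr, mfn and domid are assumed to
 * come from that caller.
 */
#if 0	/* example only */
static int example_map_foreign(struct vm_area_struct *vma,
			       unsigned long vaddr,
			       unsigned long mfn, unsigned domid)
{
	return xen_remap_domain_mfn_range(vma, vaddr, mfn, 16,
					  vma->vm_page_prot, domid);
}
#endif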
+ +
#ifdef CONFIG_XEN_DEBUG_FS
static struct dentry *d_mmu_debug;
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
++#include <linux/memblock.h>
#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
++ #include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
++ #include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);
- reserve_early(extra_start, extra_start + size, "XEN EXTRA");
++ /* Amount of extra memory space we add to the e820 ranges */
++ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
++
++ /*
++ * The maximum amount of extra memory compared to the base size. The
++ * main scaling factor is the size of struct page. At extreme ratios
++ * of base:extra, all the base memory can be filled with page
++ * structures for the extra memory, leaving no space for anything
++ * else.
++ *
++ * 10x seems like a reasonable balance between scaling flexibility and
++ * leaving a practically usable system.
++ */
++ #define EXTRA_MEM_RATIO (10)
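/*
 * Illustrative arithmetic (editor's note, not part of the patch):
 * assuming struct page is about 64 bytes, a guest booted with 1 GiB
 * (262144 pages) can be offered at most 9 GiB of extra space (10x the
 * base in total); the struct pages for those extra 2359296 frames
 * consume roughly 144 MiB of the 1 GiB base, which is still workable.
 */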
++
++ static __init void xen_add_extra_mem(unsigned long pages)
++ {
++ u64 size = (u64)pages * PAGE_SIZE;
++ u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
++
++ if (!pages)
++ return;
++
++ e820_add_region(extra_start, size, E820_RAM);
++ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
++
++ memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA");
++
++ xen_extra_mem_size += size;
++
++ xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
++ }
++
static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
phys_addr_t end_addr)
{
/**
* machine_specific_memory_setup - Hook for machine specific memory setup.
**/
--
char * __init xen_memory_setup(void)
{
++ static struct e820entry map[E820MAX] __initdata;
++
unsigned long max_pfn = xen_start_info->nr_pages;
++ unsigned long long mem_end;
++ int rc;
++ struct xen_memory_map memmap;
++ unsigned long extra_pages = 0;
++ unsigned long extra_limit;
++ int i;
++ int op;
max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
++ mem_end = PFN_PHYS(max_pfn);
++
++ memmap.nr_entries = E820MAX;
++ set_xen_guest_handle(memmap.buffer, map);
++
++ op = xen_initial_domain() ?
++ XENMEM_machine_memory_map :
++ XENMEM_memory_map;
++ rc = HYPERVISOR_memory_op(op, &memmap);
++ if (rc == -ENOSYS) {
++ memmap.nr_entries = 1;
++ map[0].addr = 0ULL;
++ map[0].size = mem_end;
++ /* 8MB slack (to balance backend allocations). */
++ map[0].size += 8ULL << 20;
++ map[0].type = E820_RAM;
++ rc = 0;
++ }
++ BUG_ON(rc);
e820.nr_map = 0;
++ xen_extra_mem_start = mem_end;
++ for (i = 0; i < memmap.nr_entries; i++) {
++ unsigned long long end = map[i].addr + map[i].size;
-- e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM);
++ if (map[i].type == E820_RAM) {
++ if (map[i].addr < mem_end && end > mem_end) {
++ /* Truncate region to max_mem. */
++ u64 delta = end - mem_end;
++
++ map[i].size -= delta;
++ extra_pages += PFN_DOWN(delta);
++
++ end = mem_end;
++ }
++ }
++
++ if (end > xen_extra_mem_start)
++ xen_extra_mem_start = end;
++
++ /* If region is non-RAM or below mem_end, add what remains */
++ if ((map[i].type != E820_RAM || map[i].addr < mem_end) &&
++ map[i].size > 0)
++ e820_add_region(map[i].addr, map[i].size, map[i].type);
++ }
/*
* Even though this is normal, usable memory under Xen, reserve
* - xen_start_info
* See comment above "struct start_info" in <xen/interface/xen.h>
*/
-- reserve_early(__pa(xen_start_info->mfn_list),
++ memblock_x86_reserve_range(__pa(xen_start_info->mfn_list),
__pa(xen_start_info->pt_base),
"XEN START INFO");
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-- xen_return_unused_memory(xen_start_info->nr_pages, &e820);
++ extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);
++
++ /*
++ * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
++ * factor of the base size.  On non-highmem systems, the base
++ * size is the full initial memory allocation; on highmem it
++ * is limited to the max size of lowmem, so that it doesn't
++ * get completely filled.
++ *
++ * In principle there could be a problem in lowmem systems if
++ * the initial memory is also very large with respect to
++ * lowmem, but we won't try to deal with that here.
++ */
++ extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
++ max_pfn + extra_pages);
++
++ if (extra_limit >= max_pfn)
++ extra_pages = extra_limit - max_pfn;
++ else
++ extra_pages = 0;
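/*
 * Worked example (editor's note, not part of the patch): for a domain
 * with max_pfn = 262144 (1 GiB) on a non-highmem system where
 * xen_return_unused_memory() handed back 4000000 pages, extra_limit =
 * min(10 * 262144, 262144 + 4000000) = 2621440, so extra_pages is
 * clamped to 2621440 - 262144 = 2359296 pages (about 9 GiB).
 */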
++
++ if (!xen_initial_domain())
++ xen_add_extra_mem(extra_pages);
return "Xen";
}
}
#endif
-- memset(cpu_evtchn_mask(0), ~0, sizeof(cpu_evtchn_mask(0)));
++ memset(cpu_evtchn_mask(0), ~0, sizeof(struct cpu_evtchn_s));
}
static inline void clear_evtchn(int port)
static int find_unbound_irq(void)
{
-- int irq;
-- struct irq_desc *desc;
++ struct irq_data *data;
++ int irq, res;
for (irq = 0; irq < nr_irqs; irq++) {
-- desc = irq_to_desc(irq);
++ data = irq_get_irq_data(irq);
/* only 0->15 have init'd desc; handle irq > 16 */
-- if (desc == NULL)
++ if (!data)
break;
-- if (desc->chip == &no_irq_chip)
++ if (data->chip == &no_irq_chip)
break;
-- if (desc->chip != &xen_dynamic_chip)
++ if (data->chip != &xen_dynamic_chip)
continue;
if (irq_info[irq].type == IRQT_UNBOUND)
-- break;
++ return irq;
}
if (irq == nr_irqs)
panic("No available IRQ to bind to: increase nr_irqs!\n");
-- desc = irq_to_desc_alloc_node(irq, 0);
-- if (WARN_ON(desc == NULL))
-- return -1;
++ res = irq_alloc_desc_at(irq, 0);
-- dynamic_irq_init_keep_chip_data(irq);
++ if (WARN_ON(res != irq))
++ return -1;
return irq;
}
irq = find_unbound_irq();
set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
-- handle_edge_irq, "event");
++ handle_fasteoi_irq, "event");
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_evtchn_info(evtchn);
irq = per_cpu(virq_to_irq, cpu)[virq];
if (irq == -1) {
++ irq = find_unbound_irq();
++
++ set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
++ handle_percpu_irq, "virq");
++
bind_virq.virq = virq;
bind_virq.vcpu = cpu;
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
BUG();
evtchn = bind_virq.port;
-- irq = find_unbound_irq();
--
-- set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
-- handle_percpu_irq, "virq");
--
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_virq_info(evtchn, virq);
if (irq_info[irq].type != IRQT_UNBOUND) {
irq_info[irq] = mk_unbound_info();
-- dynamic_irq_cleanup(irq);
++ irq_free_desc(irq);
}
spin_unlock(&irq_mapping_update_lock);
{
struct shared_info *sh = HYPERVISOR_shared_info;
int cpu = smp_processor_id();
++ unsigned long *cpu_evtchn = cpu_evtchn_mask(cpu);
int i;
unsigned long flags;
static DEFINE_SPINLOCK(debug_lock);
++ struct vcpu_info *v;
spin_lock_irqsave(&debug_lock, flags);
-- printk("vcpu %d\n ", cpu);
++ printk("\nvcpu %d\n ", cpu);
for_each_online_cpu(i) {
-- struct vcpu_info *v = per_cpu(xen_vcpu, i);
-- printk("%d: masked=%d pending=%d event_sel %08lx\n ", i,
-- (get_irq_regs() && i == cpu) ? xen_irqs_disabled(get_irq_regs()) : v->evtchn_upcall_mask,
-- v->evtchn_upcall_pending,
-- v->evtchn_pending_sel);
++ int pending;
++ v = per_cpu(xen_vcpu, i);
++ pending = (get_irq_regs() && i == cpu)
++ ? xen_irqs_disabled(get_irq_regs())
++ : v->evtchn_upcall_mask;
++ printk("%d: masked=%d pending=%d event_sel %0*lx\n ", i,
++ pending, v->evtchn_upcall_pending,
++ (int)(sizeof(v->evtchn_pending_sel)*2),
++ v->evtchn_pending_sel);
++ }
++ v = per_cpu(xen_vcpu, cpu);
++
++ printk("\npending:\n ");
++ for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
++ printk("%0*lx%s", (int)sizeof(sh->evtchn_pending[0])*2,
++ sh->evtchn_pending[i],
++ i % 8 == 0 ? "\n " : " ");
++ printk("\nglobal mask:\n ");
++ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
++ printk("%0*lx%s",
++ (int)(sizeof(sh->evtchn_mask[0])*2),
++ sh->evtchn_mask[i],
++ i % 8 == 0 ? "\n " : " ");
++
++ printk("\nglobally unmasked:\n ");
++ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
++ printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2),
++ sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
++ i % 8 == 0 ? "\n " : " ");
++
++ printk("\nlocal cpu%d mask:\n ", cpu);
++ for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--)
++ printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2),
++ cpu_evtchn[i],
++ i % 8 == 0 ? "\n " : " ");
++
++ printk("\nlocally unmasked:\n ");
++ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) {
++ unsigned long pending = sh->evtchn_pending[i]
++ & ~sh->evtchn_mask[i]
++ & cpu_evtchn[i];
++ printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2),
++ pending, i % 8 == 0 ? "\n " : " ");
}
-- printk("pending:\n ");
-- for(i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
-- printk("%08lx%s", sh->evtchn_pending[i],
-- i % 8 == 0 ? "\n " : " ");
-- printk("\nmasks:\n ");
-- for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
-- printk("%08lx%s", sh->evtchn_mask[i],
-- i % 8 == 0 ? "\n " : " ");
--
-- printk("\nunmasked:\n ");
-- for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
-- printk("%08lx%s", sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
-- i % 8 == 0 ? "\n " : " ");
printk("\npending list:\n");
-- for(i = 0; i < NR_EVENT_CHANNELS; i++) {
++ for (i = 0; i < NR_EVENT_CHANNELS; i++) {
if (sync_test_bit(i, sh->evtchn_pending)) {
-- printk(" %d: event %d -> irq %d\n",
++ int word_idx = i / BITS_PER_LONG;
++ printk(" %d: event %d -> irq %d%s%s%s\n",
cpu_from_evtchn(i), i,
-- evtchn_to_irq[i]);
++ evtchn_to_irq[i],
++ sync_test_bit(word_idx, &v->evtchn_pending_sel)
++ ? "" : " l2-clear",
++ !sync_test_bit(i, sh->evtchn_mask)
++ ? "" : " globally-masked",
++ sync_test_bit(i, cpu_evtchn)
++ ? "" : " locally-masked");
}
}
int irq = evtchn_to_irq[port];
struct irq_desc *desc;
++ mask_evtchn(port);
++ clear_evtchn(port);
++
if (irq != -1) {
desc = irq_to_desc(irq);
if (desc)
{
int evtchn = evtchn_from_irq(irq);
-- move_native_irq(irq);
++ move_masked_irq(irq);
if (VALID_EVTCHN(evtchn))
-- clear_evtchn(evtchn);
++ unmask_evtchn(evtchn);
}
static int retrigger_dynirq(unsigned int irq)
.mask = disable_dynirq,
.unmask = enable_dynirq,
-- .ack = ack_dynirq,
++ .eoi = ack_dynirq,
.set_affinity = set_affinity_irq,
.retrigger = retrigger_dynirq,
};