arch/x86/xen/mmu.c

   1 /*
   2  * Xen mmu operations
   3  *
   4  * This file contains the various mmu fetch and update operations.
   5  * The most important job they must perform is the mapping between the
   6  * domain's pfn and the overall machine mfns.
   7  *
   8  * Xen allows guests to directly update the pagetable, in a controlled
   9  * fashion.  In other words, the guest modifies the same pagetable
  10  * that the CPU actually uses, which eliminates the overhead of having
  11  * a separate shadow pagetable.
  12  *
  13  * In order to allow this, it falls on the guest domain to map its
  14  * notion of a "physical" pfn - which is just a domain-local linear
  15  * address - into a real "machine address" which the CPU's MMU can
  16  * use.
  17  *
  18  * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
  19  * inserted directly into the pagetable.  When creating a new
  20  * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
  21  * when reading the content back with __(pgd|pmd|pte)_val, it converts
  22  * the mfn back into a pfn.
  23  *
  24  * The other constraint is that all pages which make up a pagetable
  25  * must be mapped read-only in the guest.  This prevents uncontrolled
  26  * guest updates to the pagetable.  Xen strictly enforces this, and
  27  * will disallow any pagetable update which will end up mapping a
  28  * pagetable page RW, and will disallow using any writable page as a
  29  * pagetable.
  30  *
  31  * Naively, when loading %cr3 with the base of a new pagetable, Xen
  32  * would need to validate the whole pagetable before going on.
  33  * Naturally, this is quite slow.  The solution is to "pin" a
  34  * pagetable, which enforces all the constraints on the pagetable even
  35  * when it is not actively in use.  This means that Xen can be assured
  36  * that it is still valid when you do load it into %cr3, and doesn't
  37  * need to revalidate it.
  38  *
  39  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  40  */
  41 #include <linux/sched.h>
  42 #include <linux/highmem.h>
  43 #include <linux/debugfs.h>
  44 #include <linux/bug.h>
  45 #include <linux/vmalloc.h>
  46 #include <linux/module.h>
  47 #include <linux/gfp.h>
  48
  49 #include <asm/pgtable.h>
  50 #include <asm/tlbflush.h>
  51 #include <asm/fixmap.h>
  52 #include <asm/mmu_context.h>
  53 #include <asm/setup.h>
  54 #include <asm/paravirt.h>
  55 #include <asm/e820.h>
  56 #include <asm/linkage.h>
  57 #include <asm/page.h>
  58
  59 #include <asm/xen/hypercall.h>
  60 #include <asm/xen/hypervisor.h>
  61
  62 #include <xen/xen.h>
  63 #include <xen/page.h>
  64 #include <xen/interface/xen.h>
  65 #include <xen/interface/hvm/hvm_op.h>
  66 #include <xen/interface/version.h>
  67 #include <xen/interface/memory.h>
  68 #include <xen/hvc-console.h>
  69
  70 #include "multicalls.h"
  71 #include "mmu.h"
  72 #include "debugfs.h"
  73
  74 #define MMU_UPDATE_HISTO        30
  75
  76 /*
  77  * Protects atomic reservation decrease/increase against concurrent increases.
  78  * Also protects non-atomic updates of current_pages and driver_pages, and
  79  * balloon lists.
  80  */
  81 DEFINE_SPINLOCK(xen_reservation_lock);
  82
  83 #ifdef CONFIG_XEN_DEBUG_FS
  84
/*
 * Debug counters, only built with CONFIG_XEN_DEBUG_FS.  They are bumped
 * through ADD_STATS() and wiped by check_zero() when zero_stats is set.
 */
static struct {
        /* pgd entry updates: total / on a pinned page / while in lazy MMU mode */
        u32 pgd_update;
        u32 pgd_update_pinned;
        u32 pgd_update_batched;

        /* same breakdown for pud entries */
        u32 pud_update;
        u32 pud_update_pinned;
        u32 pud_update_batched;

        /* same breakdown for pmd entries */
        u32 pmd_update;
        u32 pmd_update_pinned;
        u32 pmd_update_batched;

        /* same breakdown for pte entries */
        u32 pte_update;
        u32 pte_update_pinned;
        u32 pte_update_batched;

        /*
         * mmu_update multicalls: newly started, extended in place, and a
         * histogram indexed by batch size (see xen_extend_mmu_update()).
         */
        u32 mmu_update;
        u32 mmu_update_extended;
        u32 mmu_update_histo[MMU_UPDATE_HISTO];

        /* xen_ptep_modify_prot_commit() calls: total / in lazy MMU mode */
        u32 prot_commit;
        u32 prot_commit_batched;

        /* xen_set_pte_at() calls, broken down by how they were handled */
        u32 set_pte_at;
        u32 set_pte_at_batched;
        u32 set_pte_at_pinned;
        u32 set_pte_at_current;
        u32 set_pte_at_kernel;
} mmu_stats;
 115
 116 static u8 zero_stats;
 117
 118 static inline void check_zero(void)
 119 {
 120         if (unlikely(zero_stats)) {
 121                 memset(&mmu_stats, 0, sizeof(mmu_stats));
 122                 zero_stats = 0;
 123         }
 124 }
 125
 126 #define ADD_STATS(elem, val)                    \
 127         do { check_zero(); mmu_stats.elem += (val); } while(0)
 128
 129 #else  /* !CONFIG_XEN_DEBUG_FS */
 130
 131 #define ADD_STATS(elem, val)    do { (void)(val); } while(0)
 132
 133 #endif /* CONFIG_XEN_DEBUG_FS */
 134
 135
 136 /*
 137  * Identity map, in addition to plain kernel map.  This needs to be
 138  * large enough to allocate page table pages to allocate the rest.
 139  * Each page can map 2MB.
 140  */
 141 #define LEVEL1_IDENT_ENTRIES    (PTRS_PER_PTE * 4)
 142 static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
 143
 144 #ifdef CONFIG_X86_64
 145 /* l3 pud for userspace vsyscall mapping */
 146 static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
 147 #endif /* CONFIG_X86_64 */
 148
 149 /*
 150  * Note about cr3 (pagetable base) values:
 151  *
 152  * xen_cr3 contains the current logical cr3 value; it contains the
 153  * last set cr3.  This may not be the current effective cr3, because
 154  * its update may be being lazily deferred.  However, a vcpu looking
 155  * at its own cr3 can use this value knowing that everything will
 156  * be self-consistent.
 157  *
 158  * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 159  * hypercall to set the vcpu cr3 is complete (so it may be a little
 160  * out of date, but it will never be set early).  If one vcpu is
 161  * looking at another vcpu's cr3 value, it should use this variable.
 162  */
 163 DEFINE_PER_CPU(unsigned long, xen_cr3);  /* cr3 stored as physaddr */
 164 DEFINE_PER_CPU(unsigned long, xen_current_cr3);  /* actual vcpu cr3 */
 165
 166
 167 /*
 168  * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 169  * redzone above it, so round it up to a PGD boundary.
 170  */
 171 #define USER_LIMIT      ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
 172
 173 static unsigned long max_p2m_pfn __read_mostly = MAX_DOMAIN_PAGES;
 174
 175 #define P2M_ENTRIES_PER_PAGE            (PAGE_SIZE / sizeof(unsigned long))
 176 #define TOP_ENTRIES(pages)              ((pages) / P2M_ENTRIES_PER_PAGE)
 177 #define MAX_TOP_ENTRIES                 TOP_ENTRIES(MAX_DOMAIN_PAGES)
 178
 179 /* Placeholder for holes in the address space */
 180 static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_ENTRIES_PER_PAGE);
 181
 182  /* Array of pointers to pages containing p2m entries */
 183 static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, MAX_TOP_ENTRIES);
 184
 185 /* Arrays of p2m arrays expressed in mfns used for save/restore */
 186 static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, MAX_TOP_ENTRIES);
 187
 188 static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn_list,
 189                          (MAX_TOP_ENTRIES / P2M_ENTRIES_PER_PAGE));
 190
 191 static inline unsigned p2m_top_index(unsigned long pfn)
 192 {
 193         BUG_ON(pfn >= max_p2m_pfn);
 194         return pfn / P2M_ENTRIES_PER_PAGE;
 195 }
 196
 197 static inline unsigned p2m_index(unsigned long pfn)
 198 {
 199         return pfn % P2M_ENTRIES_PER_PAGE;
 200 }
 201
 202 /* Build the parallel p2m_top_mfn structures */
 203 void xen_build_mfn_list_list(void)
 204 {
 205         unsigned pfn, idx;
 206
 207         for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
 208                 unsigned topidx = p2m_top_index(pfn);
 209
 210                 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
 211         }
 212
 213         for (idx = 0;
 214              idx < TOP_ENTRIES(max_p2m_pfn)/P2M_ENTRIES_PER_PAGE;
 215              idx++) {
 216                 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
 217                 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
 218         }
 219 }
 220
 221 void xen_setup_mfn_list_list(void)
 222 {
 223         BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 224
 225         HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
 226                 virt_to_mfn(p2m_top_mfn_list);
 227         HYPERVISOR_shared_info->arch.max_pfn = max_p2m_mfn;
 228 }
 229
 230 /* Set up p2m_top to point to the domain-builder provided p2m pages */
 231 void __init xen_build_dynamic_phys_to_machine(void)
 232 {
 233         unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
 234         unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
 235         unsigned pfn;
 236         unsigned i;
 237
 238         max_p2m_pfn = max_pfn;
 239
 240         p2m_missing = extend_brk(sizeof(*p2m_missing) * P2M_ENTRIES_PER_PAGE,
 241                                  PAGE_SIZE);
 242         for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
 243                 p2m_missing[i] = ~0UL;
 244
 245         p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES(max_pfn),
 246                              PAGE_SIZE);
 247         for (i = 0; i < TOP_ENTRIES(max_pfn); i++)
 248                 p2m_top[i] = p2m_missing;
 249
 250         p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES(max_pfn),
 251                                  PAGE_SIZE);
 252         p2m_top_mfn_list = extend_brk(sizeof(*p2m_top_mfn_list) *
 253                                       (TOP_ENTRIES(max_pfn) / P2M_ENTRIES_PER_PAGE),
 254                                       PAGE_SIZE);
 255
 256         for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
 257                 unsigned topidx = p2m_top_index(pfn);
 258
 259                 p2m_top[topidx] = &mfn_list[pfn];
 260         }
 261
 262         xen_build_mfn_list_list();
 263 }
 264
 265 unsigned long get_phys_to_machine(unsigned long pfn)
 266 {
 267         unsigned topidx, idx;
 268
 269         if (unlikely(pfn >= max_p2m_pfn))
 270                 return INVALID_P2M_ENTRY;
 271
 272         topidx = p2m_top_index(pfn);
 273         idx = p2m_index(pfn);
 274         return p2m_top[topidx][idx];
 275 }
 276 EXPORT_SYMBOL_GPL(get_phys_to_machine);
 277
 278 /* install a  new p2m_top page */
 279 bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
 280 {
 281         unsigned topidx = p2m_top_index(pfn);
 282         unsigned long **pfnp, *mfnp;
 283         unsigned i;
 284
 285         pfnp = &p2m_top[topidx];
 286         mfnp = &p2m_top_mfn[topidx];
 287
 288         for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
 289                 p[i] = INVALID_P2M_ENTRY;
 290
 291         if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
 292                 *mfnp = virt_to_mfn(p);
 293                 return true;
 294         }
 295
 296         return false;
 297 }
 298
 299 static void alloc_p2m(unsigned long pfn)
 300 {
 301         unsigned long *p;
 302
 303         p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
 304         BUG_ON(p == NULL);
 305
 306         if (!install_p2mtop_page(pfn, p))
 307                 free_page((unsigned long)p);
 308 }
 309
 310 /* Try to install p2m mapping; fail if intermediate bits missing */
 311 bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 312 {
 313         unsigned topidx, idx;
 314
 315         if (unlikely(pfn >= max_p2m_pfn)) {
 316                 BUG_ON(mfn != INVALID_P2M_ENTRY);
 317                 return true;
 318         }
 319
 320         topidx = p2m_top_index(pfn);
 321         if (p2m_top[topidx] == p2m_missing) {
 322                 if (mfn == INVALID_P2M_ENTRY)
 323                         return true;
 324                 return false;
 325         }
 326
 327         idx = p2m_index(pfn);
 328         p2m_top[topidx][idx] = mfn;
 329
 330         return true;
 331 }
 332
 333 void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 334 {
 335         if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
 336                 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
 337                 return;
 338         }
 339
 340         if (unlikely(!__set_phys_to_machine(pfn, mfn)))  {
 341                 alloc_p2m(pfn);
 342
 343                 if (!__set_phys_to_machine(pfn, mfn))
 344                         BUG();
 345         }
 346 }
 347
 348 unsigned long arbitrary_virt_to_mfn(void *vaddr)
 349 {
 350         xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
 351
 352         return PFN_DOWN(maddr.maddr);
 353 }
 354
 355 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 356 {
 357         unsigned long address = (unsigned long)vaddr;
 358         unsigned int level;
 359         pte_t *pte;
 360         unsigned offset;
 361
 362         /*
 363          * if the PFN is in the linear mapped vaddr range, we can just use
 364          * the (quick) virt_to_machine() p2m lookup
 365          */
 366         if (virt_addr_valid(vaddr))
 367                 return virt_to_machine(vaddr);
 368
 369         /* otherwise we have to do a (slower) full page-table walk */
 370
 371         pte = lookup_address(address, &level);
 372         BUG_ON(pte == NULL);
 373         offset = address & ~PAGE_MASK;
 374         return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
 375 }
 376
 377 void make_lowmem_page_readonly(void *vaddr)
 378 {
 379         pte_t *pte, ptev;
 380         unsigned long address = (unsigned long)vaddr;
 381         unsigned int level;
 382
 383         pte = lookup_address(address, &level);
 384         BUG_ON(pte == NULL);
 385
 386         ptev = pte_wrprotect(*pte);
 387
 388         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 389                 BUG();
 390 }
 391
 392 void make_lowmem_page_readwrite(void *vaddr)
 393 {
 394         pte_t *pte, ptev;
 395         unsigned long address = (unsigned long)vaddr;
 396         unsigned int level;
 397
 398         pte = lookup_address(address, &level);
 399         BUG_ON(pte == NULL);
 400
 401         ptev = pte_mkwrite(*pte);
 402
 403         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 404                 BUG();
 405 }
 406
 407
 408 static bool xen_page_pinned(void *ptr)
 409 {
 410         struct page *page = virt_to_page(ptr);
 411
 412         return PagePinned(page);
 413 }
 414
 415 static bool xen_iomap_pte(pte_t pte)
 416 {
 417         return pte_flags(pte) & _PAGE_IOMAP;
 418 }
 419
 420 static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
 421 {
 422         struct multicall_space mcs;
 423         struct mmu_update *u;
 424
 425         mcs = xen_mc_entry(sizeof(*u));
 426         u = mcs.args;
 427
 428         /* ptep might be kmapped when using 32-bit HIGHPTE */
 429         u->ptr = arbitrary_virt_to_machine(ptep).maddr;
 430         u->val = pte_val_ma(pteval);
 431
 432         MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO);
 433
 434         xen_mc_issue(PARAVIRT_LAZY_MMU);
 435 }
 436
 437 static void xen_extend_mmu_update(const struct mmu_update *update)
 438 {
 439         struct multicall_space mcs;
 440         struct mmu_update *u;
 441
 442         mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
 443
 444         if (mcs.mc != NULL) {
 445                 ADD_STATS(mmu_update_extended, 1);
 446                 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
 447
 448                 mcs.mc->args[1]++;
 449
 450                 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
 451                         ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
 452                 else
 453                         ADD_STATS(mmu_update_histo[0], 1);
 454         } else {
 455                 ADD_STATS(mmu_update, 1);
 456                 mcs = __xen_mc_entry(sizeof(*u));
 457                 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 458                 ADD_STATS(mmu_update_histo[1], 1);
 459         }
 460
 461         u = mcs.args;
 462         *u = *update;
 463 }
 464
 465 void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 466 {
 467         struct mmu_update u;
 468
 469         preempt_disable();
 470
 471         xen_mc_batch();
 472
 473         /* ptr may be ioremapped for 64-bit pagetable setup */
 474         u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 475         u.val = pmd_val_ma(val);
 476         xen_extend_mmu_update(&u);
 477
 478         ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 479
 480         xen_mc_issue(PARAVIRT_LAZY_MMU);
 481
 482         preempt_enable();
 483 }
 484
 485 void xen_set_pmd(pmd_t *ptr, pmd_t val)
 486 {
 487         ADD_STATS(pmd_update, 1);
 488
 489         /* If page is not pinned, we can just update the entry
 490            directly */
 491         if (!xen_page_pinned(ptr)) {
 492                 *ptr = val;
 493                 return;
 494         }
 495
 496         ADD_STATS(pmd_update_pinned, 1);
 497
 498         xen_set_pmd_hyper(ptr, val);
 499 }
 500
 501 /*
 502  * Associate a virtual page frame with a given physical page frame
 503  * and protection flags for that frame.
 504  */
 505 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 506 {
 507         set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
 508 }
 509
 510 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 511                     pte_t *ptep, pte_t pteval)
 512 {
 513         if (xen_iomap_pte(pteval)) {
 514                 xen_set_iomap_pte(ptep, pteval);
 515                 goto out;
 516         }
 517
 518         ADD_STATS(set_pte_at, 1);
 519 //      ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
 520         ADD_STATS(set_pte_at_current, mm == current->mm);
 521         ADD_STATS(set_pte_at_kernel, mm == &init_mm);
 522
 523         if (mm == current->mm || mm == &init_mm) {
 524                 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
 525                         struct multicall_space mcs;
 526                         mcs = xen_mc_entry(0);
 527
 528                         MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
 529                         ADD_STATS(set_pte_at_batched, 1);
 530                         xen_mc_issue(PARAVIRT_LAZY_MMU);
 531                         goto out;
 532                 } else
 533                         if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
 534                                 goto out;
 535         }
 536         xen_set_pte(ptep, pteval);
 537
 538 out:    return;
 539 }
 540
 541 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
 542                                  unsigned long addr, pte_t *ptep)
 543 {
 544         /* Just return the pte as-is.  We preserve the bits on commit */
 545         return *ptep;
 546 }
 547
 548 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 549                                  pte_t *ptep, pte_t pte)
 550 {
 551         struct mmu_update u;
 552
 553         xen_mc_batch();
 554
 555         u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
 556         u.val = pte_val_ma(pte);
 557         xen_extend_mmu_update(&u);
 558
 559         ADD_STATS(prot_commit, 1);
 560         ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 561
 562         xen_mc_issue(PARAVIRT_LAZY_MMU);
 563 }
 564
 565 /* Assume pteval_t is equivalent to all the other *val_t types. */
 566 static pteval_t pte_mfn_to_pfn(pteval_t val)
 567 {
 568         if (val & _PAGE_PRESENT) {
 569                 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 570                 pteval_t flags = val & PTE_FLAGS_MASK;
 571                 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
 572         }
 573
 574         return val;
 575 }
 576
 577 static pteval_t pte_pfn_to_mfn(pteval_t val)
 578 {
 579         if (val & _PAGE_PRESENT) {
 580                 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 581                 pteval_t flags = val & PTE_FLAGS_MASK;
 582                 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
 583         }
 584
 585         return val;
 586 }
 587
 588 static pteval_t iomap_pte(pteval_t val)
 589 {
 590         if (val & _PAGE_PRESENT) {
 591                 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 592                 pteval_t flags = val & PTE_FLAGS_MASK;
 593
 594                 /* We assume the pte frame number is a MFN, so
 595                    just use it as-is. */
 596                 val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
 597         }
 598
 599         return val;
 600 }
 601
 602 pteval_t xen_pte_val(pte_t pte)
 603 {
 604         if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP))
 605                 return pte.pte;
 606
 607         return pte_mfn_to_pfn(pte.pte);
 608 }
 609 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
 610
 611 pgdval_t xen_pgd_val(pgd_t pgd)
 612 {
 613         return pte_mfn_to_pfn(pgd.pgd);
 614 }
 615 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
 616
 617 pte_t xen_make_pte(pteval_t pte)
 618 {
 619         phys_addr_t addr = (pte & PTE_PFN_MASK);
 620
 621         /*
 622          * Unprivileged domains are allowed to do IOMAPpings for
 623          * PCI passthrough, but not map ISA space.  The ISA
 624          * mappings are just dummy local mappings to keep other
 625          * parts of the kernel happy.
 626          */
 627         if (unlikely(pte & _PAGE_IOMAP) &&
 628             (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
 629                 pte = iomap_pte(pte);
 630         } else {
 631                 pte &= ~_PAGE_IOMAP;
 632                 pte = pte_pfn_to_mfn(pte);
 633         }
 634
 635         return native_make_pte(pte);
 636 }
 637 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 638
 639 pgd_t xen_make_pgd(pgdval_t pgd)
 640 {
 641         pgd = pte_pfn_to_mfn(pgd);
 642         return native_make_pgd(pgd);
 643 }
 644 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
 645
 646 pmdval_t xen_pmd_val(pmd_t pmd)
 647 {
 648         return pte_mfn_to_pfn(pmd.pmd);
 649 }
 650 PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
 651
 652 void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 653 {
 654         struct mmu_update u;
 655
 656         preempt_disable();
 657
 658         xen_mc_batch();
 659
 660         /* ptr may be ioremapped for 64-bit pagetable setup */
 661         u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 662         u.val = pud_val_ma(val);
 663         xen_extend_mmu_update(&u);
 664
 665         ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 666
 667         xen_mc_issue(PARAVIRT_LAZY_MMU);
 668
 669         preempt_enable();
 670 }
 671
 672 void xen_set_pud(pud_t *ptr, pud_t val)
 673 {
 674         ADD_STATS(pud_update, 1);
 675
 676         /* If page is not pinned, we can just update the entry
 677            directly */
 678         if (!xen_page_pinned(ptr)) {
 679                 *ptr = val;
 680                 return;
 681         }
 682
 683         ADD_STATS(pud_update_pinned, 1);
 684
 685         xen_set_pud_hyper(ptr, val);
 686 }
 687
 688 void xen_set_pte(pte_t *ptep, pte_t pte)
 689 {
 690         if (xen_iomap_pte(pte)) {
 691                 xen_set_iomap_pte(ptep, pte);
 692                 return;
 693         }
 694
 695         ADD_STATS(pte_update, 1);
 696 //      ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
 697         ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 698
 699 #ifdef CONFIG_X86_PAE
 700         ptep->pte_high = pte.pte_high;
 701         smp_wmb();
 702         ptep->pte_low = pte.pte_low;
 703 #else
 704         *ptep = pte;
 705 #endif
 706 }
 707
 708 #ifdef CONFIG_X86_PAE
 709 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 710 {
 711         if (xen_iomap_pte(pte)) {
 712                 xen_set_iomap_pte(ptep, pte);
 713                 return;
 714         }
 715
 716         set_64bit((u64 *)ptep, native_pte_val(pte));
 717 }
 718
/*
 * Clear a PAE pte with two 32-bit stores.  The low word is zeroed first,
 * with a write barrier before the high word is zeroed.  @mm and @addr
 * are not used.
 */
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
        ptep->pte_low = 0;
        smp_wmb();              /* make sure low gets written first */
        ptep->pte_high = 0;
}
 725
 726 void xen_pmd_clear(pmd_t *pmdp)
 727 {
 728         set_pmd(pmdp, __pmd(0));
 729 }
 730 #endif  /* CONFIG_X86_PAE */
 731
 732 pmd_t xen_make_pmd(pmdval_t pmd)
 733 {
 734         pmd = pte_pfn_to_mfn(pmd);
 735         return native_make_pmd(pmd);
 736 }
 737 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 738
 739 #if PAGETABLE_LEVELS == 4
 740 pudval_t xen_pud_val(pud_t pud)
 741 {
 742         return pte_mfn_to_pfn(pud.pud);
 743 }
 744 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
 745
 746 pud_t xen_make_pud(pudval_t pud)
 747 {
 748         pud = pte_pfn_to_mfn(pud);
 749
 750         return native_make_pud(pud);
 751 }
 752 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
 753
 754 pgd_t *xen_get_user_pgd(pgd_t *pgd)
 755 {
 756         pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
 757         unsigned offset = pgd - pgd_page;
 758         pgd_t *user_ptr = NULL;
 759
 760         if (offset < pgd_index(USER_LIMIT)) {
 761                 struct page *page = virt_to_page(pgd_page);
 762                 user_ptr = (pgd_t *)page->private;
 763                 if (user_ptr)
 764                         user_ptr += offset;
 765         }
 766
 767         return user_ptr;
 768 }
 769
 770 static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 771 {
 772         struct mmu_update u;
 773
 774         u.ptr = virt_to_machine(ptr).maddr;
 775         u.val = pgd_val_ma(val);
 776         xen_extend_mmu_update(&u);
 777 }
 778
 779 /*
 780  * Raw hypercall-based set_pgd, intended for in early boot before
 781  * there's a page structure.  This implies:
 782  *  1. The only existing pagetable is the kernel's
 783  *  2. It is always pinned
 784  *  3. It has no user pagetable attached to it
 785  */
 786 void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 787 {
 788         preempt_disable();
 789
 790         xen_mc_batch();
 791
 792         __xen_set_pgd_hyper(ptr, val);
 793
 794         xen_mc_issue(PARAVIRT_LAZY_MMU);
 795
 796         preempt_enable();
 797 }
 798
 799 void xen_set_pgd(pgd_t *ptr, pgd_t val)
 800 {
 801         pgd_t *user_ptr = xen_get_user_pgd(ptr);
 802
 803         ADD_STATS(pgd_update, 1);
 804
 805         /* If page is not pinned, we can just update the entry
 806            directly */
 807         if (!xen_page_pinned(ptr)) {
 808                 *ptr = val;
 809                 if (user_ptr) {
 810                         WARN_ON(xen_page_pinned(user_ptr));
 811                         *user_ptr = val;
 812                 }
 813                 return;
 814         }
 815
 816         ADD_STATS(pgd_update_pinned, 1);
 817         ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 818
 819         /* If it's pinned, then we can at least batch the kernel and
 820            user updates together. */
 821         xen_mc_batch();
 822
 823         __xen_set_pgd_hyper(ptr, val);
 824         if (user_ptr)
 825                 __xen_set_pgd_hyper(user_ptr, val);
 826
 827         xen_mc_issue(PARAVIRT_LAZY_MMU);
 828 }
 829 #endif  /* PAGETABLE_LEVELS == 4 */
 830
/*
 * (Yet another) pagetable walker.  This one is intended for pinning a
 * pagetable.  This means that it walks a pagetable and calls the
 * callback function on each page it finds making up the page table,
 * at every level.  It walks the entire pagetable, but it only bothers
 * pinning pte pages which are below limit.  In the normal case this
 * will be STACK_TOP_MAX, but at boot we need to pin up to
 * FIXADDR_TOP.
 *
 * For 32-bit the important bit is that we don't pin beyond there,
 * because then we start getting into Xen's ptes.
 *
 * For 64-bit, we must skip the Xen hole in the middle of the address
 * space, just after the big x86-64 virtual hole.
 *
 * @mm:    passed through to @func unchanged
 * @pgd:   top-level page of the pagetable to walk
 * @func:  callback invoked for every pud, pmd and pte page found (pud and
 *         pmd pages only when that level is not folded), and finally for
 *         the pgd page itself
 * @limit: one past the last address to cover; must not exceed FIXADDR_TOP
 *
 * Returns the bitwise OR of all callback return values, or 0 without
 * calling @func at all when XENFEAT_auto_translated_physmap is set.
 */
static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
                          int (*func)(struct mm_struct *mm, struct page *,
                                      enum pt_level),
                          unsigned long limit)
{
        int flush = 0;
        unsigned hole_low, hole_high;
        unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
        unsigned pgdidx, pudidx, pmdidx;

        /* The limit is the last byte to be touched */
        limit--;
        BUG_ON(limit >= FIXADDR_TOP);

        if (xen_feature(XENFEAT_auto_translated_physmap))
                return 0;

        /*
         * 64-bit has a great big hole in the middle of the address
         * space, which contains the Xen mappings.  On 32-bit these
         * will end up making a zero-sized hole and so is a no-op.
         */
        hole_low = pgd_index(USER_LIMIT);
        hole_high = pgd_index(PAGE_OFFSET);

        /* Per-level index of the last entry that still covers limit. */
        pgdidx_limit = pgd_index(limit);
#if PTRS_PER_PUD > 1
        pudidx_limit = pud_index(limit);
#else
        pudidx_limit = 0;
#endif
#if PTRS_PER_PMD > 1
        pmdidx_limit = pmd_index(limit);
#else
        pmdidx_limit = 0;
#endif

        for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
                pud_t *pud;

                /* Skip pgd slots inside [hole_low, hole_high). */
                if (pgdidx >= hole_low && pgdidx < hole_high)
                        continue;

                /* Empty pgd slot: nothing below it to visit. */
                if (!pgd_val(pgd[pgdidx]))
                        continue;

                pud = pud_offset(&pgd[pgdidx], 0);

                if (PTRS_PER_PUD > 1) /* not folded */
                        flush |= (*func)(mm, virt_to_page(pud), PT_PUD);

                for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
                        pmd_t *pmd;

                        /* Past the limit within the last pgd slot: done. */
                        if (pgdidx == pgdidx_limit &&
                            pudidx > pudidx_limit)
                                goto out;

                        if (pud_none(pud[pudidx]))
                                continue;

                        pmd = pmd_offset(&pud[pudidx], 0);

                        if (PTRS_PER_PMD > 1) /* not folded */
                                flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);

                        for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
                                struct page *pte;

                                /* Past the limit within the last pud slot: done. */
                                if (pgdidx == pgdidx_limit &&
                                    pudidx == pudidx_limit &&
                                    pmdidx > pmdidx_limit)
                                        goto out;

                                if (pmd_none(pmd[pmdidx]))
                                        continue;

                                /* The pte page is visited as a whole, not per entry. */
                                pte = pmd_page(pmd[pmdidx]);
                                flush |= (*func)(mm, pte, PT_PTE);
                        }
                }
        }

out:
        /* Do the top level last, so that the callbacks can use it as
           a cue to do final things like tlb flushes. */
        flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);

        return flush;
}
 936
 937 static int xen_pgd_walk(struct mm_struct *mm,
 938                         int (*func)(struct mm_struct *mm, struct page *,
 939                                     enum pt_level),
 940                         unsigned long limit)
 941 {
 942         return __xen_pgd_walk(mm, mm->pgd, func, limit);
 943 }
 944
 945 /* If we're using split pte locks, then take the page's lock and
 946    return a pointer to it.  Otherwise return NULL. */
 947 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
 948 {
 949         spinlock_t *ptl = NULL;
 950
 951 #if USE_SPLIT_PTLOCKS
 952         ptl = __pte_lockptr(page);
 953         spin_lock_nest_lock(ptl, &mm->page_table_lock);
 954 #endif
 955
 956         return ptl;
 957 }
 958
 959 static void xen_pte_unlock(void *v)
 960 {
 961         spinlock_t *ptl = v;
 962         spin_unlock(ptl);
 963 }
 964
 965 static void xen_do_pin(unsigned level, unsigned long pfn)
 966 {
 967         struct mmuext_op *op;
 968         struct multicall_space mcs;
 969
 970         mcs = __xen_mc_entry(sizeof(*op));
 971         op = mcs.args;
 972         op->cmd = level;
 973         op->arg1.mfn = pfn_to_mfn(pfn);
 974         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 975 }
 976
 977 static int xen_pin_page(struct mm_struct *mm, struct page *page,
 978                         enum pt_level level)
 979 {
 980         unsigned pgfl = TestSetPagePinned(page);
 981         int flush;
 982
 983         if (pgfl)
 984                 flush = 0;              /* already pinned */
 985         else if (PageHighMem(page))
 986                 /* kmaps need flushing if we found an unpinned
 987                    highpage */
 988                 flush = 1;
 989         else {
 990                 void *pt = lowmem_page_address(page);
 991                 unsigned long pfn = page_to_pfn(page);
 992                 struct multicall_space mcs = __xen_mc_entry(0);
 993                 spinlock_t *ptl;
 994
 995                 flush = 0;
 996
 997                 /*
 998                  * We need to hold the pagetable lock between the time
 999                  * we make the pagetable RO and when we actually pin
1000                  * it.  If we don't, then other users may come in and
1001                  * attempt to update the pagetable by writing it,
1002                  * which will fail because the memory is RO but not
1003                  * pinned, so Xen won't do the trap'n'emulate.
1004                  *
1005                  * If we're using split pte locks, we can't hold the
1006                  * entire pagetable's worth of locks during the
1007                  * traverse, because we may wrap the preempt count (8
1008                  * bits).  The solution is to mark RO and pin each PTE
1009                  * page while holding the lock.  This means the number
1010                  * of locks we end up holding is never more than a
1011                  * batch size (~32 entries, at present).
1012                  *
1013                  * If we're not using split pte locks, we needn't pin
1014                  * the PTE pages independently, because we're
1015                  * protected by the overall pagetable lock.
1016                  */
1017                 ptl = NULL;
1018                 if (level == PT_PTE)
1019                         ptl = xen_pte_lock(page, mm);
1020
1021                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1022                                         pfn_pte(pfn, PAGE_KERNEL_RO),
1023                                         level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1024
1025                 if (ptl) {
1026                         xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
1027
1028                         /* Queue a deferred unlock for when this batch
1029                            is completed. */
1030                         xen_mc_callback(xen_pte_unlock, ptl);
1031                 }
1032         }
1033
1034         return flush;
1035 }
1036
1037 /* This is called just after a mm has been created, but it has not
1038    been used yet.  We need to make sure that its pagetable is all
1039    read-only, and can be pinned. */
1040 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
1041 {
1042         xen_mc_batch();
1043
1044         if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
1045                 /* re-enable interrupts for flushing */
1046                 xen_mc_issue(0);
1047
1048                 kmap_flush_unused();
1049
1050                 xen_mc_batch();
1051         }
1052
1053 #ifdef CONFIG_X86_64
1054         {
1055                 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1056
1057                 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
1058
1059                 if (user_pgd) {
1060                         xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
1061                         xen_do_pin(MMUEXT_PIN_L4_TABLE,
1062                                    PFN_DOWN(__pa(user_pgd)));
1063                 }
1064         }
1065 #else /* CONFIG_X86_32 */
1066 #ifdef CONFIG_X86_PAE
1067         /* Need to make sure unshared kernel PMD is pinnable */
1068         xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
1069                      PT_PMD);
1070 #endif
1071         xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
1072 #endif /* CONFIG_X86_64 */
1073         xen_mc_issue(0);
1074 }
1075
1076 static void xen_pgd_pin(struct mm_struct *mm)
1077 {
1078         __xen_pgd_pin(mm, mm->pgd);
1079 }
1080
1081 /*
1082  * On save, we need to pin all pagetables to make sure they get their
1083  * mfns turned into pfns.  Search the list for any unpinned pgds and pin
1084  * them (unpinned pgds are not currently in use, probably because the
1085  * process is under construction or destruction).
1086  *
1087  * Expected to be called in stop_machine() ("equivalent to taking
1088  * every spinlock in the system"), so the locking doesn't really
1089  * matter all that much.
1090  */
1091 void xen_mm_pin_all(void)
1092 {
1093         unsigned long flags;
1094         struct page *page;
1095
1096         spin_lock_irqsave(&pgd_lock, flags);
1097
1098         list_for_each_entry(page, &pgd_list, lru) {
1099                 if (!PagePinned(page)) {
1100                         __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
1101                         SetPageSavePinned(page);
1102                 }
1103         }
1104
1105         spin_unlock_irqrestore(&pgd_lock, flags);
1106 }
1107
1108 /*
1109  * The init_mm pagetable is really pinned as soon as its created, but
1110  * that's before we have page structures to store the bits.  So do all
1111  * the book-keeping now.
1112  */
1113 static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
1114                                   enum pt_level level)
1115 {
1116         SetPagePinned(page);
1117         return 0;
1118 }
1119
1120 static void __init xen_mark_init_mm_pinned(void)
1121 {
1122         xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
1123 }
1124
1125 static int xen_unpin_page(struct mm_struct *mm, struct page *page,
1126                           enum pt_level level)
1127 {
1128         unsigned pgfl = TestClearPagePinned(page);
1129
1130         if (pgfl && !PageHighMem(page)) {
1131                 void *pt = lowmem_page_address(page);
1132                 unsigned long pfn = page_to_pfn(page);
1133                 spinlock_t *ptl = NULL;
1134                 struct multicall_space mcs;
1135
1136                 /*
1137                  * Do the converse to pin_page.  If we're using split
1138                  * pte locks, we must be holding the lock for while
1139                  * the pte page is unpinned but still RO to prevent
1140                  * concurrent updates from seeing it in this
1141                  * partially-pinned state.
1142                  */
1143                 if (level == PT_PTE) {
1144                         ptl = xen_pte_lock(page, mm);
1145
1146                         if (ptl)
1147                                 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
1148                 }
1149
1150                 mcs = __xen_mc_entry(0);
1151
1152                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1153                                         pfn_pte(pfn, PAGE_KERNEL),
1154                                         level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1155
1156                 if (ptl) {
1157                         /* unlock when batch completed */
1158                         xen_mc_callback(xen_pte_unlock, ptl);
1159                 }
1160         }
1161
1162         return 0;               /* never need to flush on unpin */
1163 }
1164
1165 /* Release a pagetables pages back as normal RW */
1166 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
1167 {
1168         xen_mc_batch();
1169
1170         xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1171
1172 #ifdef CONFIG_X86_64
1173         {
1174                 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1175
1176                 if (user_pgd) {
1177                         xen_do_pin(MMUEXT_UNPIN_TABLE,
1178                                    PFN_DOWN(__pa(user_pgd)));
1179                         xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
1180                 }
1181         }
1182 #endif
1183
1184 #ifdef CONFIG_X86_PAE
1185         /* Need to make sure unshared kernel PMD is unpinned */
1186         xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
1187                        PT_PMD);
1188 #endif
1189
1190         __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
1191
1192         xen_mc_issue(0);
1193 }
1194
1195 static void xen_pgd_unpin(struct mm_struct *mm)
1196 {
1197         __xen_pgd_unpin(mm, mm->pgd);
1198 }
1199
1200 /*
1201  * On resume, undo any pinning done at save, so that the rest of the
1202  * kernel doesn't see any unexpected pinned pagetables.
1203  */
1204 void xen_mm_unpin_all(void)
1205 {
1206         unsigned long flags;
1207         struct page *page;
1208
1209         spin_lock_irqsave(&pgd_lock, flags);
1210
1211         list_for_each_entry(page, &pgd_list, lru) {
1212                 if (PageSavePinned(page)) {
1213                         BUG_ON(!PagePinned(page));
1214                         __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
1215                         ClearPageSavePinned(page);
1216                 }
1217         }
1218
1219         spin_unlock_irqrestore(&pgd_lock, flags);
1220 }
1221
1222 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1223 {
1224         spin_lock(&next->page_table_lock);
1225         xen_pgd_pin(next);
1226         spin_unlock(&next->page_table_lock);
1227 }
1228
1229 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1230 {
1231         spin_lock(&mm->page_table_lock);
1232         xen_pgd_pin(mm);
1233         spin_unlock(&mm->page_table_lock);
1234 }
1235
1236
1237 #ifdef CONFIG_SMP
1238 /* Another cpu may still have their %cr3 pointing at the pagetable, so
1239    we need to repoint it somewhere else before we can unpin it. */
1240 static void drop_other_mm_ref(void *info)
1241 {
1242         struct mm_struct *mm = info;
1243         struct mm_struct *active_mm;
1244
1245         active_mm = percpu_read(cpu_tlbstate.active_mm);
1246
1247         if (active_mm == mm)
1248                 leave_mm(smp_processor_id());
1249
1250         /* If this cpu still has a stale cr3 reference, then make sure
1251            it has been flushed. */
1252         if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
1253                 load_cr3(swapper_pg_dir);
1254 }
1255
1256 static void xen_drop_mm_ref(struct mm_struct *mm)
1257 {
1258         cpumask_var_t mask;
1259         unsigned cpu;
1260
1261         if (current->active_mm == mm) {
1262                 if (current->mm == mm)
1263                         load_cr3(swapper_pg_dir);
1264                 else
1265                         leave_mm(smp_processor_id());
1266         }
1267
1268         /* Get the "official" set of cpus referring to our pagetable. */
1269         if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1270                 for_each_online_cpu(cpu) {
1271                         if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1272                             && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1273                                 continue;
1274                         smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1275                 }
1276                 return;
1277         }
1278         cpumask_copy(mask, mm_cpumask(mm));
1279
1280         /* It's possible that a vcpu may have a stale reference to our
1281            cr3, because its in lazy mode, and it hasn't yet flushed
1282            its set of pending hypercalls yet.  In this case, we can
1283            look at its actual current cr3 value, and force it to flush
1284            if needed. */
1285         for_each_online_cpu(cpu) {
1286                 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1287                         cpumask_set_cpu(cpu, mask);
1288         }
1289
1290         if (!cpumask_empty(mask))
1291                 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1292         free_cpumask_var(mask);
1293 }
1294 #else
1295 static void xen_drop_mm_ref(struct mm_struct *mm)
1296 {
1297         if (current->active_mm == mm)
1298                 load_cr3(swapper_pg_dir);
1299 }
1300 #endif
1301
1302 /*
1303  * While a process runs, Xen pins its pagetables, which means that the
1304  * hypervisor forces it to be read-only, and it controls all updates
1305  * to it.  This means that all pagetable updates have to go via the
1306  * hypervisor, which is moderately expensive.
1307  *
1308  * Since we're pulling the pagetable down, we switch to use init_mm,
1309  * unpin old process pagetable and mark it all read-write, which
1310  * allows further operations on it to be simple memory accesses.
1311  *
1312  * The only subtle point is that another CPU may be still using the
1313  * pagetable because of lazy tlb flushing.  This means we need need to
1314  * switch all CPUs off this pagetable before we can unpin it.
1315  */
1316 void xen_exit_mmap(struct mm_struct *mm)
1317 {
1318         get_cpu();              /* make sure we don't move around */
1319         xen_drop_mm_ref(mm);
1320         put_cpu();
1321
1322         spin_lock(&mm->page_table_lock);
1323
1324         /* pgd may not be pinned in the error exit path of execve */
1325         if (xen_page_pinned(mm->pgd))
1326                 xen_pgd_unpin(mm);
1327
1328         spin_unlock(&mm->page_table_lock);
1329 }
1330
1331 static __init void xen_pagetable_setup_start(pgd_t *base)
1332 {
1333 }
1334
1335 static void xen_post_allocator_init(void);
1336
1337 static __init void xen_pagetable_setup_done(pgd_t *base)
1338 {
1339         xen_setup_shared_info();
1340         xen_post_allocator_init();
1341 }
1342
1343 static void xen_write_cr2(unsigned long cr2)
1344 {
1345         percpu_read(xen_vcpu)->arch.cr2 = cr2;
1346 }
1347
1348 static unsigned long xen_read_cr2(void)
1349 {
1350         return percpu_read(xen_vcpu)->arch.cr2;
1351 }
1352
1353 unsigned long xen_read_cr2_direct(void)
1354 {
1355         return percpu_read(xen_vcpu_info.arch.cr2);
1356 }
1357
1358 static void xen_flush_tlb(void)
1359 {
1360         struct mmuext_op *op;
1361         struct multicall_space mcs;
1362
1363         preempt_disable();
1364
1365         mcs = xen_mc_entry(sizeof(*op));
1366
1367         op = mcs.args;
1368         op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1369         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1370
1371         xen_mc_issue(PARAVIRT_LAZY_MMU);
1372
1373         preempt_enable();
1374 }
1375
1376 static void xen_flush_tlb_single(unsigned long addr)
1377 {
1378         struct mmuext_op *op;
1379         struct multicall_space mcs;
1380
1381         preempt_disable();
1382
1383         mcs = xen_mc_entry(sizeof(*op));
1384         op = mcs.args;
1385         op->cmd = MMUEXT_INVLPG_LOCAL;
1386         op->arg1.linear_addr = addr & PAGE_MASK;
1387         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1388
1389         xen_mc_issue(PARAVIRT_LAZY_MMU);
1390
1391         preempt_enable();
1392 }
1393
1394 static void xen_flush_tlb_others(const struct cpumask *cpus,
1395                                  struct mm_struct *mm, unsigned long va)
1396 {
1397         struct {
1398                 struct mmuext_op op;
1399                 DECLARE_BITMAP(mask, NR_CPUS);
1400         } *args;
1401         struct multicall_space mcs;
1402
1403         if (cpumask_empty(cpus))
1404                 return;         /* nothing to do */
1405
1406         mcs = xen_mc_entry(sizeof(*args));
1407         args = mcs.args;
1408         args->op.arg2.vcpumask = to_cpumask(args->mask);
1409
1410         /* Remove us, and any offline CPUS. */
1411         cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1412         cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1413
1414         if (va == TLB_FLUSH_ALL) {
1415                 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1416         } else {
1417                 args->op.cmd = MMUEXT_INVLPG_MULTI;
1418                 args->op.arg1.linear_addr = va;
1419         }
1420
1421         MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1422
1423         xen_mc_issue(PARAVIRT_LAZY_MMU);
1424 }
1425
1426 static unsigned long xen_read_cr3(void)
1427 {
1428         return percpu_read(xen_cr3);
1429 }
1430
1431 static void set_current_cr3(void *v)
1432 {
1433         percpu_write(xen_current_cr3, (unsigned long)v);
1434 }
1435
1436 static void __xen_write_cr3(bool kernel, unsigned long cr3)
1437 {
1438         struct mmuext_op *op;
1439         struct multicall_space mcs;
1440         unsigned long mfn;
1441
1442         if (cr3)
1443                 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1444         else
1445                 mfn = 0;
1446
1447         WARN_ON(mfn == 0 && kernel);
1448
1449         mcs = __xen_mc_entry(sizeof(*op));
1450
1451         op = mcs.args;
1452         op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1453         op->arg1.mfn = mfn;
1454
1455         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1456
1457         if (kernel) {
1458                 percpu_write(xen_cr3, cr3);
1459
1460                 /* Update xen_current_cr3 once the batch has actually
1461                    been submitted. */
1462                 xen_mc_callback(set_current_cr3, (void *)cr3);
1463         }
1464 }
1465
1466 static void xen_write_cr3(unsigned long cr3)
1467 {
1468         BUG_ON(preemptible());
1469
1470         xen_mc_batch();  /* disables interrupts */
1471
1472         /* Update while interrupts are disabled, so its atomic with
1473            respect to ipis */
1474         percpu_write(xen_cr3, cr3);
1475
1476         __xen_write_cr3(true, cr3);
1477
1478 #ifdef CONFIG_X86_64
1479         {
1480                 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1481                 if (user_pgd)
1482                         __xen_write_cr3(false, __pa(user_pgd));
1483                 else
1484                         __xen_write_cr3(false, 0);
1485         }
1486 #endif
1487
1488         xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1489 }
1490
1491 static int xen_pgd_alloc(struct mm_struct *mm)
1492 {
1493         pgd_t *pgd = mm->pgd;
1494         int ret = 0;
1495
1496         BUG_ON(PagePinned(virt_to_page(pgd)));
1497
1498 #ifdef CONFIG_X86_64
1499         {
1500                 struct page *page = virt_to_page(pgd);
1501                 pgd_t *user_pgd;
1502
1503                 BUG_ON(page->private != 0);
1504
1505                 ret = -ENOMEM;
1506
1507                 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1508                 page->private = (unsigned long)user_pgd;
1509
1510                 if (user_pgd != NULL) {
1511                         user_pgd[pgd_index(VSYSCALL_START)] =
1512                                 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1513                         ret = 0;
1514                 }
1515
1516                 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1517         }
1518 #endif
1519
1520         return ret;
1521 }
1522
1523 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1524 {
1525 #ifdef CONFIG_X86_64
1526         pgd_t *user_pgd = xen_get_user_pgd(pgd);
1527
1528         if (user_pgd)
1529                 free_page((unsigned long)user_pgd);
1530 #endif
1531 }
1532
1533 #ifdef CONFIG_X86_32
1534 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1535 {
1536         /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1537         if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1538                 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1539                                pte_val_ma(pte));
1540
1541         return pte;
1542 }
1543
1544 /* Init-time set_pte while constructing initial pagetables, which
1545    doesn't allow RO pagetable pages to be remapped RW */
1546 static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1547 {
1548         pte = mask_rw_pte(ptep, pte);
1549
1550         xen_set_pte(ptep, pte);
1551 }
1552 #endif
1553
1554 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1555 {
1556         struct mmuext_op op;
1557         op.cmd = cmd;
1558         op.arg1.mfn = pfn_to_mfn(pfn);
1559         if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1560                 BUG();
1561 }
1562
1563 /* Early in boot, while setting up the initial pagetable, assume
1564    everything is pinned. */
1565 static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1566 {
1567 #ifdef CONFIG_FLATMEM
1568         BUG_ON(mem_map);        /* should only be used early */
1569 #endif
1570         make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1571         pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1572 }
1573
1574 /* Used for pmd and pud */
1575 static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1576 {
1577 #ifdef CONFIG_FLATMEM
1578         BUG_ON(mem_map);        /* should only be used early */
1579 #endif
1580         make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1581 }
1582
1583 /* Early release_pte assumes that all pts are pinned, since there's
1584    only init_mm and anything attached to that is pinned. */
1585 static __init void xen_release_pte_init(unsigned long pfn)
1586 {
1587         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1588         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1589 }
1590
1591 static __init void xen_release_pmd_init(unsigned long pfn)
1592 {
1593         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1594 }
1595
1596 /* This needs to make sure the new pte page is pinned iff its being
1597    attached to a pinned pagetable. */
1598 static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1599 {
1600         struct page *page = pfn_to_page(pfn);
1601
1602         if (PagePinned(virt_to_page(mm->pgd))) {
1603                 SetPagePinned(page);
1604
1605                 if (!PageHighMem(page)) {
1606                         make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1607                         if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1608                                 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1609                 } else {
1610                         /* make sure there are no stray mappings of
1611                            this page */
1612                         kmap_flush_unused();
1613                 }
1614         }
1615 }
1616
1617 static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1618 {
1619         xen_alloc_ptpage(mm, pfn, PT_PTE);
1620 }
1621
1622 static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1623 {
1624         xen_alloc_ptpage(mm, pfn, PT_PMD);
1625 }
1626
1627 /* This should never happen until we're OK to use struct page */
1628 static void xen_release_ptpage(unsigned long pfn, unsigned level)
1629 {
1630         struct page *page = pfn_to_page(pfn);
1631
1632         if (PagePinned(page)) {
1633                 if (!PageHighMem(page)) {
1634                         if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1635                                 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1636                         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1637                 }
1638                 ClearPagePinned(page);
1639         }
1640 }
1641
1642 static void xen_release_pte(unsigned long pfn)
1643 {
1644         xen_release_ptpage(pfn, PT_PTE);
1645 }
1646
1647 static void xen_release_pmd(unsigned long pfn)
1648 {
1649         xen_release_ptpage(pfn, PT_PMD);
1650 }
1651
1652 #if PAGETABLE_LEVELS == 4
1653 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1654 {
1655         xen_alloc_ptpage(mm, pfn, PT_PUD);
1656 }
1657
1658 static void xen_release_pud(unsigned long pfn)
1659 {
1660         xen_release_ptpage(pfn, PT_PUD);
1661 }
1662 #endif
1663
1664 void __init xen_reserve_top(void)
1665 {
1666 #ifdef CONFIG_X86_32
1667         unsigned long top = HYPERVISOR_VIRT_START;
1668         struct xen_platform_parameters pp;
1669
1670         if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1671                 top = pp.virt_start;
1672
1673         reserve_top_address(-top);
1674 #endif  /* CONFIG_X86_32 */
1675 }
1676
1677 /*
1678  * Like __va(), but returns address in the kernel mapping (which is
1679  * all we have until the physical memory mapping has been set up.
1680  */
1681 static void *__ka(phys_addr_t paddr)
1682 {
1683 #ifdef CONFIG_X86_64
1684         return (void *)(paddr + __START_KERNEL_map);
1685 #else
1686         return __va(paddr);
1687 #endif
1688 }
1689
1690 /* Convert a machine address to physical address */
1691 static unsigned long m2p(phys_addr_t maddr)
1692 {
1693         phys_addr_t paddr;
1694
1695         maddr &= PTE_PFN_MASK;
1696         paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1697
1698         return paddr;
1699 }
1700
1701 /* Convert a machine address to kernel virtual */
1702 static void *m2v(phys_addr_t maddr)
1703 {
1704         return __ka(m2p(maddr));
1705 }
1706
1707 static void set_page_prot(void *addr, pgprot_t prot)
1708 {
1709         unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1710         pte_t pte = pfn_pte(pfn, prot);
1711
1712         if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1713                 BUG();
1714 }
1715
1716 static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1717 {
1718         unsigned pmdidx, pteidx;
1719         unsigned ident_pte;
1720         unsigned long pfn;
1721
1722         level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1723                                       PAGE_SIZE);
1724
1725         ident_pte = 0;
1726         pfn = 0;
1727         for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1728                 pte_t *pte_page;
1729
1730                 /* Reuse or allocate a page of ptes */
1731                 if (pmd_present(pmd[pmdidx]))
1732                         pte_page = m2v(pmd[pmdidx].pmd);
1733                 else {
1734                         /* Check for free pte pages */
1735                         if (ident_pte == LEVEL1_IDENT_ENTRIES)
1736                                 break;
1737
1738                         pte_page = &level1_ident_pgt[ident_pte];
1739                         ident_pte += PTRS_PER_PTE;
1740
1741                         pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1742                 }
1743
1744                 /* Install mappings */
1745                 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1746                         pte_t pte;
1747
1748                         if (pfn > max_pfn_mapped)
1749                                 max_pfn_mapped = pfn;
1750
1751                         if (!pte_none(pte_page[pteidx]))
1752                                 continue;
1753
1754                         pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1755                         pte_page[pteidx] = pte;
1756                 }
1757         }
1758
1759         for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1760                 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1761
1762         set_page_prot(pmd, PAGE_KERNEL_RO);
1763 }
1764
1765 #ifdef CONFIG_X86_64
1766 static void convert_pfn_mfn(void *v)
1767 {
1768         pte_t *pte = v;
1769         int i;
1770
1771         /* All levels are converted the same way, so just treat them
1772            as ptes. */
1773         for (i = 0; i < PTRS_PER_PTE; i++)
1774                 pte[i] = xen_make_pte(pte[i].pte);
1775 }
1776
/*
 * Set up the initial kernel pagetable.
 *
 * We can construct this by grafting the Xen provided pagetable into
 * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
 * means that only the kernel has a physical mapping to start with -
 * but that's enough to get __va working.  We need to fill in the rest
 * of the physical mapping once some sort of allocator has been set
 * up.
 *
 * @pgd is the Xen-provided top-level pagetable and @max_pfn bounds the
 * early identity map.  Returns init_level4_pgt, the pagetable that is
 * in use on return.
 */
__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
                                         unsigned long max_pfn)
{
        pud_t *l3;
        pmd_t *l2;

        /* Zap identity mapping */
        init_level4_pgt[0] = __pgd(0);

        /* Pre-constructed entries are in pfn, so convert to mfn */
        convert_pfn_mfn(init_level4_pgt);
        convert_pfn_mfn(level3_ident_pgt);
        convert_pfn_mfn(level3_kernel_pgt);

        /* Locate Xen's L2 covering __START_KERNEL_map (entries hold
           machine addresses, hence m2v) and copy it into both the
           identity and the kernel L2. */
        l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
        l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);

        memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
        memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);

        /* Same lookup for __START_KERNEL_map + PMD_SIZE; the result
           seeds the fixmap L2. */
        l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
        l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
        memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);

        /* Set up identity map */
        xen_map_identity_early(level2_ident_pgt, max_pfn);

        /* Make pagetable pieces RO */
        set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
        set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
        set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
        set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
        set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
        set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);

        /* Pin down new L4 */
        pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
                          PFN_DOWN(__pa_symbol(init_level4_pgt)));

        /* Unpin Xen-provided one */
        pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

        /* Switch over */
        pgd = init_level4_pgt;

        /*
         * At this stage there can be no user pgd, and no page
         * structure to attach it to, so make sure we just set kernel
         * pgd.
         */
        xen_mc_batch();
        __xen_write_cr3(true, __pa(pgd));
        xen_mc_issue(PARAVIRT_LAZY_CPU);

        /* Keep the frames of the Xen-provided pagetables reserved. */
        reserve_early(__pa(xen_start_info->pt_base),
                      __pa(xen_start_info->pt_base +
                           xen_start_info->nr_pt_frames * PAGE_SIZE),
                      "XEN PAGETABLES");

        return pgd;
}
1849 #else   /* !CONFIG_X86_64 */
1850 static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD);
1851
1852 __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1853                                          unsigned long max_pfn)
1854 {
1855         pmd_t *kernel_pmd;
1856
1857         level2_kernel_pgt = extend_brk(sizeof(pmd_t *) * PTRS_PER_PMD, PAGE_SIZE);
1858
1859         max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1860                                   xen_start_info->nr_pt_frames * PAGE_SIZE +
1861                                   512*1024);
1862
1863         kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1864         memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1865
1866         xen_map_identity_early(level2_kernel_pgt, max_pfn);
1867
1868         memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1869         set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1870                         __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1871
1872         set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1873         set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1874         set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1875
1876         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1877
1878         xen_write_cr3(__pa(swapper_pg_dir));
1879
1880         pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1881
1882         reserve_early(__pa(xen_start_info->pt_base),
1883                       __pa(xen_start_info->pt_base +
1884                            xen_start_info->nr_pt_frames * PAGE_SIZE),
1885                       "XEN PAGETABLES");
1886
1887         return swapper_pg_dir;
1888 }
1889 #endif  /* CONFIG_X86_64 */
1890
1891 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1892 {
1893         pte_t pte;
1894
1895         phys >>= PAGE_SHIFT;
1896
1897         switch (idx) {
1898         case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1899 #ifdef CONFIG_X86_F00F_BUG
1900         case FIX_F00F_IDT:
1901 #endif
1902 #ifdef CONFIG_X86_32
1903         case FIX_WP_TEST:
1904         case FIX_VDSO:
1905 # ifdef CONFIG_HIGHMEM
1906         case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1907 # endif
1908 #else
1909         case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1910 #endif
1911 #ifdef CONFIG_X86_LOCAL_APIC
1912         case FIX_APIC_BASE:     /* maps dummy local APIC */
1913 #endif
1914         case FIX_TEXT_POKE0:
1915         case FIX_TEXT_POKE1:
1916                 /* All local page mappings */
1917                 pte = pfn_pte(phys, prot);
1918                 break;
1919
1920         case FIX_PARAVIRT_BOOTMAP:
1921                 /* This is an MFN, but it isn't an IO mapping from the
1922                    IO domain */
1923                 pte = mfn_pte(phys, prot);
1924                 break;
1925
1926         default:
1927                 /* By default, set_fixmap is used for hardware mappings */
1928                 pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
1929                 break;
1930         }
1931
1932         __native_set_fixmap(idx, pte);
1933
1934 #ifdef CONFIG_X86_64
1935         /* Replicate changes to map the vsyscall page into the user
1936            pagetable vsyscall mapping. */
1937         if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1938                 unsigned long vaddr = __fix_to_virt(idx);
1939                 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1940         }
1941 #endif
1942 }
1943
1944 static __init void xen_post_allocator_init(void)
1945 {
1946         pv_mmu_ops.set_pte = xen_set_pte;
1947         pv_mmu_ops.set_pmd = xen_set_pmd;
1948         pv_mmu_ops.set_pud = xen_set_pud;
1949 #if PAGETABLE_LEVELS == 4
1950         pv_mmu_ops.set_pgd = xen_set_pgd;
1951 #endif
1952
1953         /* This will work as long as patching hasn't happened yet
1954            (which it hasn't) */
1955         pv_mmu_ops.alloc_pte = xen_alloc_pte;
1956         pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1957         pv_mmu_ops.release_pte = xen_release_pte;
1958         pv_mmu_ops.release_pmd = xen_release_pmd;
1959 #if PAGETABLE_LEVELS == 4
1960         pv_mmu_ops.alloc_pud = xen_alloc_pud;
1961         pv_mmu_ops.release_pud = xen_release_pud;
1962 #endif
1963
1964 #ifdef CONFIG_X86_64
1965         SetPagePinned(virt_to_page(level3_user_vsyscall));
1966 #endif
1967         xen_mark_init_mm_pinned();
1968 }
1969
/*
 * pv_mmu_ops.lazy_mode.leave handler.
 *
 * With preemption disabled, calls xen_mc_flush() (presumably issuing any
 * multicalls still queued - confirm against its definition) and only then
 * leaves lazy MMU mode through paravirt_leave_lazy_mmu().
 */
static void xen_leave_lazy_mmu(void)
{
        preempt_disable();
        xen_mc_flush();
        paravirt_leave_lazy_mmu();
        preempt_enable();
}
1977
1978 static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1979         .read_cr2 = xen_read_cr2,
1980         .write_cr2 = xen_write_cr2,
1981
1982         .read_cr3 = xen_read_cr3,
1983         .write_cr3 = xen_write_cr3,
1984
1985         .flush_tlb_user = xen_flush_tlb,
1986         .flush_tlb_kernel = xen_flush_tlb,
1987         .flush_tlb_single = xen_flush_tlb_single,
1988         .flush_tlb_others = xen_flush_tlb_others,
1989
1990         .pte_update = paravirt_nop,
1991         .pte_update_defer = paravirt_nop,
1992
1993         .pgd_alloc = xen_pgd_alloc,
1994         .pgd_free = xen_pgd_free,
1995
1996         .alloc_pte = xen_alloc_pte_init,
1997         .release_pte = xen_release_pte_init,
1998         .alloc_pmd = xen_alloc_pmd_init,
1999         .alloc_pmd_clone = paravirt_nop,
2000         .release_pmd = xen_release_pmd_init,
2001
2002 #ifdef CONFIG_X86_64
2003         .set_pte = xen_set_pte,
2004 #else
2005         .set_pte = xen_set_pte_init,
2006 #endif
2007         .set_pte_at = xen_set_pte_at,
2008         .set_pmd = xen_set_pmd_hyper,
2009
2010         .ptep_modify_prot_start = __ptep_modify_prot_start,
2011         .ptep_modify_prot_commit = __ptep_modify_prot_commit,
2012
2013         .pte_val = PV_CALLEE_SAVE(xen_pte_val),
2014         .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
2015
2016         .make_pte = PV_CALLEE_SAVE(xen_make_pte),
2017         .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
2018
2019 #ifdef CONFIG_X86_PAE
2020         .set_pte_atomic = xen_set_pte_atomic,
2021         .pte_clear = xen_pte_clear,
2022         .pmd_clear = xen_pmd_clear,
2023 #endif  /* CONFIG_X86_PAE */
2024         .set_pud = xen_set_pud_hyper,
2025
2026         .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2027         .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2028
2029 #if PAGETABLE_LEVELS == 4
2030         .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2031         .make_pud = PV_CALLEE_SAVE(xen_make_pud),
2032         .set_pgd = xen_set_pgd_hyper,
2033
2034         .alloc_pud = xen_alloc_pmd_init,
2035         .release_pud = xen_release_pmd_init,
2036 #endif  /* PAGETABLE_LEVELS == 4 */
2037
2038         .activate_mm = xen_activate_mm,
2039         .dup_mmap = xen_dup_mmap,
2040         .exit_mmap = xen_exit_mmap,
2041
2042         .lazy_mode = {
2043                 .enter = paravirt_enter_lazy_mmu,
2044                 .leave = xen_leave_lazy_mmu,
2045         },
2046
2047         .set_fixmap = xen_set_fixmap,
2048 };
2049
2050 void __init xen_init_mmu_ops(void)
2051 {
2052         x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2053         x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2054         pv_mmu_ops = xen_mmu_ops;
2055
2056         vmap_lazy_unmap = false;
2057 }
2058
/*
 * Largest extent order accepted by xen_create_contiguous_region() and
 * xen_destroy_contiguous_region(); it also sizes discontig_frames below.
 */
#define MAX_CONTIG_ORDER 9 /* 2MB */
/*
 * Frame list shared by xen_create_contiguous_region() and
 * xen_destroy_contiguous_region(), one slot per page of the largest
 * permitted extent.  Protected by xen_reservation_lock.
 */
static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2062
2063 #define VOID_PTE (mfn_pte(0, __pgprot(0)))
2064 static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2065                                 unsigned long *in_frames,
2066                                 unsigned long *out_frames)
2067 {
2068         int i;
2069         struct multicall_space mcs;
2070
2071         xen_mc_batch();
2072         for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2073                 mcs = __xen_mc_entry(0);
2074
2075                 if (in_frames)
2076                         in_frames[i] = virt_to_mfn(vaddr);
2077
2078                 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2079                 set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2080
2081                 if (out_frames)
2082                         out_frames[i] = virt_to_pfn(vaddr);
2083         }
2084         xen_mc_issue(0);
2085 }
2086
2087 /*
2088  * Update the pfn-to-mfn mappings for a virtual address range, either to
2089  * point to an array of mfns, or contiguously from a single starting
2090  * mfn.
2091  */
2092 static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2093                                      unsigned long *mfns,
2094                                      unsigned long first_mfn)
2095 {
2096         unsigned i, limit;
2097         unsigned long mfn;
2098
2099         xen_mc_batch();
2100
2101         limit = 1u << order;
2102         for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2103                 struct multicall_space mcs;
2104                 unsigned flags;
2105
2106                 mcs = __xen_mc_entry(0);
2107                 if (mfns)
2108                         mfn = mfns[i];
2109                 else
2110                         mfn = first_mfn + i;
2111
2112                 if (i < (limit - 1))
2113                         flags = 0;
2114                 else {
2115                         if (order == 0)
2116                                 flags = UVMF_INVLPG | UVMF_ALL;
2117                         else
2118                                 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2119                 }
2120
2121                 MULTI_update_va_mapping(mcs.mc, vaddr,
2122                                 mfn_pte(mfn, PAGE_KERNEL), flags);
2123
2124                 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2125         }
2126
2127         xen_mc_issue(0);
2128 }
2129
2130 /*
2131  * Perform the hypercall to exchange a region of our pfns to point to
2132  * memory with the required contiguous alignment.  Takes the pfns as
2133  * input, and populates mfns as output.
2134  *
2135  * Returns a success code indicating whether the hypervisor was able to
2136  * satisfy the request or not.
2137  */
2138 static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2139                                unsigned long *pfns_in,
2140                                unsigned long extents_out,
2141                                unsigned int order_out,
2142                                unsigned long *mfns_out,
2143                                unsigned int address_bits)
2144 {
2145         long rc;
2146         int success;
2147
2148         struct xen_memory_exchange exchange = {
2149                 .in = {
2150                         .nr_extents   = extents_in,
2151                         .extent_order = order_in,
2152                         .extent_start = pfns_in,
2153                         .domid        = DOMID_SELF
2154                 },
2155                 .out = {
2156                         .nr_extents   = extents_out,
2157                         .extent_order = order_out,
2158                         .extent_start = mfns_out,
2159                         .address_bits = address_bits,
2160                         .domid        = DOMID_SELF
2161                 }
2162         };
2163
2164         BUG_ON(extents_in << order_in != extents_out << order_out);
2165
2166         rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2167         success = (exchange.nr_exchanged == extents_in);
2168
2169         BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2170         BUG_ON(success && (rc != 0));
2171
2172         return success;
2173 }
2174
2175 int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2176                                  unsigned int address_bits)
2177 {
2178         unsigned long *in_frames = discontig_frames, out_frame;
2179         unsigned long  flags;
2180         int            success;
2181
2182         /*
2183          * Currently an auto-translated guest will not perform I/O, nor will
2184          * it require PAE page directories below 4GB. Therefore any calls to
2185          * this function are redundant and can be ignored.
2186          */
2187
2188         if (xen_feature(XENFEAT_auto_translated_physmap))
2189                 return 0;
2190
2191         if (unlikely(order > MAX_CONTIG_ORDER))
2192                 return -ENOMEM;
2193
2194         memset((void *) vstart, 0, PAGE_SIZE << order);
2195
2196         spin_lock_irqsave(&xen_reservation_lock, flags);
2197
2198         /* 1. Zap current PTEs, remembering MFNs. */
2199         xen_zap_pfn_range(vstart, order, in_frames, NULL);
2200
2201         /* 2. Get a new contiguous memory extent. */
2202         out_frame = virt_to_pfn(vstart);
2203         success = xen_exchange_memory(1UL << order, 0, in_frames,
2204                                       1, order, &out_frame,
2205                                       address_bits);
2206
2207         /* 3. Map the new extent in place of old pages. */
2208         if (success)
2209                 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2210         else
2211                 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2212
2213         spin_unlock_irqrestore(&xen_reservation_lock, flags);
2214
2215         return success ? 0 : -ENOMEM;
2216 }
2217 EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2218
2219 void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2220 {
2221         unsigned long *out_frames = discontig_frames, in_frame;
2222         unsigned long  flags;
2223         int success;
2224
2225         if (xen_feature(XENFEAT_auto_translated_physmap))
2226                 return;
2227
2228         if (unlikely(order > MAX_CONTIG_ORDER))
2229                 return;
2230
2231         memset((void *) vstart, 0, PAGE_SIZE << order);
2232
2233         spin_lock_irqsave(&xen_reservation_lock, flags);
2234
2235         /* 1. Find start MFN of contiguous extent. */
2236         in_frame = virt_to_mfn(vstart);
2237
2238         /* 2. Zap current PTEs. */
2239         xen_zap_pfn_range(vstart, order, NULL, out_frames);
2240
2241         /* 3. Do the exchange for non-contiguous MFNs. */
2242         success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2243                                         0, out_frames, 0);
2244
2245         /* 4. Map new pages in place of old pages. */
2246         if (success)
2247                 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2248         else
2249                 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2250
2251         spin_unlock_irqrestore(&xen_reservation_lock, flags);
2252 }
2253 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2254
2255 #ifdef CONFIG_XEN_PVHVM
2256 static void xen_hvm_exit_mmap(struct mm_struct *mm)
2257 {
2258         struct xen_hvm_pagetable_dying a;
2259         int rc;
2260
2261         a.domid = DOMID_SELF;
2262         a.gpa = __pa(mm->pgd);
2263         rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2264         WARN_ON_ONCE(rc < 0);
2265 }
2266
2267 static int is_pagetable_dying_supported(void)
2268 {
2269         struct xen_hvm_pagetable_dying a;
2270         int rc = 0;
2271
2272         a.domid = DOMID_SELF;
2273         a.gpa = 0x00;
2274         rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2275         if (rc < 0) {
2276                 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2277                 return 0;
2278         }
2279         return 1;
2280 }
2281
2282 void __init xen_hvm_init_mmu_ops(void)
2283 {
2284         if (is_pagetable_dying_supported())
2285                 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2286 }
2287 #endif
2288
2289 #ifdef CONFIG_XEN_DEBUG_FS
2290
2291 static struct dentry *d_mmu_debug;
2292
2293 static int __init xen_mmu_debugfs(void)
2294 {
2295         struct dentry *d_xen = xen_init_debugfs();
2296
2297         if (d_xen == NULL)
2298                 return -ENOMEM;
2299
2300         d_mmu_debug = debugfs_create_dir("mmu", d_xen);
2301
2302         debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
2303
2304         debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
2305         debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
2306                            &mmu_stats.pgd_update_pinned);
2307         debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
2308                            &mmu_stats.pgd_update_pinned);
2309
2310         debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2311         debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2312                            &mmu_stats.pud_update_pinned);
2313         debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2314                            &mmu_stats.pud_update_pinned);
2315
2316         debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2317         debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2318                            &mmu_stats.pmd_update_pinned);
2319         debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2320                            &mmu_stats.pmd_update_pinned);
2321
2322         debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2323 //      debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2324 //                         &mmu_stats.pte_update_pinned);
2325         debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2326                            &mmu_stats.pte_update_pinned);
2327
2328         debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2329         debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2330                            &mmu_stats.mmu_update_extended);
2331         xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2332                                      mmu_stats.mmu_update_histo, 20);
2333
2334         debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
2335         debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
2336                            &mmu_stats.set_pte_at_batched);
2337         debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
2338                            &mmu_stats.set_pte_at_current);
2339         debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
2340                            &mmu_stats.set_pte_at_kernel);
2341
2342         debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2343         debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2344                            &mmu_stats.prot_commit_batched);
2345
2346         return 0;
2347 }
2348 fs_initcall(xen_mmu_debugfs);
2349
2350 #endif  /* CONFIG_XEN_DEBUG_FS */