arch/x86/xen/mmu.c

   1 /*
   2  * Xen mmu operations
   3  *
   4  * This file contains the various mmu fetch and update operations.
   5  * The most important job they must perform is the mapping between the
   6  * domain's pfn and the overall machine mfns.
   7  *
   8  * Xen allows guests to directly update the pagetable, in a controlled
   9  * fashion.  In other words, the guest modifies the same pagetable
  10  * that the CPU actually uses, which eliminates the overhead of having
  11  * a separate shadow pagetable.
  12  *
  13  * In order to allow this, it falls on the guest domain to map its
  14  * notion of a "physical" pfn - which is just a domain-local linear
  15  * address - into a real "machine address" which the CPU's MMU can
  16  * use.
  17  *
  18  * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
  19  * inserted directly into the pagetable.  When creating a new
  20  * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
  21  * when reading the content back with __(pgd|pmd|pte)_val, it converts
  22  * the mfn back into a pfn.
  23  *
  24  * The other constraint is that all pages which make up a pagetable
  25  * must be mapped read-only in the guest.  This prevents uncontrolled
  26  * guest updates to the pagetable.  Xen strictly enforces this, and
  27  * will disallow any pagetable update which will end up mapping a
  28  * pagetable page RW, and will disallow using any writable page as a
  29  * pagetable.
  30  *
  31  * Naively, when loading %cr3 with the base of a new pagetable, Xen
  32  * would need to validate the whole pagetable before going on.
  33  * Naturally, this is quite slow.  The solution is to "pin" a
  34  * pagetable, which enforces all the constraints on the pagetable even
  35  * when it is not actively in use.  This menas that Xen can be assured
  36  * that it is still valid when you do load it into %cr3, and doesn't
  37  * need to revalidate it.
  38  *
  39  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  40  */
  41 #include <linux/sched.h>
  42 #include <linux/highmem.h>
  43 #include <linux/debugfs.h>
  44 #include <linux/bug.h>
  45 #include <linux/vmalloc.h>
  46 #include <linux/module.h>
  47 #include <linux/gfp.h>
  48 #include <linux/memblock.h>
  49
  50 #include <asm/pgtable.h>
  51 #include <asm/tlbflush.h>
  52 #include <asm/fixmap.h>
  53 #include <asm/mmu_context.h>
  54 #include <asm/setup.h>
  55 #include <asm/paravirt.h>
  56 #include <asm/e820.h>
  57 #include <asm/linkage.h>
  58 #include <asm/page.h>
  59 #include <asm/init.h>
  60
  61 #include <asm/xen/hypercall.h>
  62 #include <asm/xen/hypervisor.h>
  63
  64 #include <xen/xen.h>
  65 #include <xen/page.h>
  66 #include <xen/interface/xen.h>
  67 #include <xen/interface/hvm/hvm_op.h>
  68 #include <xen/interface/version.h>
  69 #include <xen/interface/memory.h>
  70 #include <xen/hvc-console.h>
  71
  72 #include "multicalls.h"
  73 #include "mmu.h"
  74 #include "debugfs.h"
  75
  76 #define MMU_UPDATE_HISTO        30
  77
  78 /*
  79  * Protects atomic reservation decrease/increase against concurrent increases.
  80  * Also protects non-atomic updates of current_pages and driver_pages, and
  81  * balloon lists.
  82  */
  83 DEFINE_SPINLOCK(xen_reservation_lock);
  84
  85 #ifdef CONFIG_XEN_DEBUG_FS
  86
  87 static struct {
  88         u32 pgd_update;
  89         u32 pgd_update_pinned;
  90         u32 pgd_update_batched;
  91
  92         u32 pud_update;
  93         u32 pud_update_pinned;
  94         u32 pud_update_batched;
  95
  96         u32 pmd_update;
  97         u32 pmd_update_pinned;
  98         u32 pmd_update_batched;
  99
 100         u32 pte_update;
 101         u32 pte_update_pinned;
 102         u32 pte_update_batched;
 103
 104         u32 mmu_update;
 105         u32 mmu_update_extended;
 106         u32 mmu_update_histo[MMU_UPDATE_HISTO];
 107
 108         u32 prot_commit;
 109         u32 prot_commit_batched;
 110
 111         u32 set_pte_at;
 112         u32 set_pte_at_batched;
 113         u32 set_pte_at_pinned;
 114         u32 set_pte_at_current;
 115         u32 set_pte_at_kernel;
 116 } mmu_stats;
 117
 118 static u8 zero_stats;
 119
 120 static inline void check_zero(void)
 121 {
 122         if (unlikely(zero_stats)) {
 123                 memset(&mmu_stats, 0, sizeof(mmu_stats));
 124                 zero_stats = 0;
 125         }
 126 }
 127
 128 #define ADD_STATS(elem, val)                    \
 129         do { check_zero(); mmu_stats.elem += (val); } while(0)
 130
 131 #else  /* !CONFIG_XEN_DEBUG_FS */
 132
 133 #define ADD_STATS(elem, val)    do { (void)(val); } while(0)
 134
 135 #endif /* CONFIG_XEN_DEBUG_FS */
 136
 137
 138 /*
 139  * Identity map, in addition to plain kernel map.  This needs to be
 140  * large enough to allocate page table pages to allocate the rest.
 141  * Each page can map 2MB.
 142  */
 143 static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
 144
 145 #ifdef CONFIG_X86_64
 146 /* l3 pud for userspace vsyscall mapping */
 147 static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
 148 #endif /* CONFIG_X86_64 */
 149
 150 /*
 151  * Note about cr3 (pagetable base) values:
 152  *
 153  * xen_cr3 contains the current logical cr3 value; it contains the
 154  * last set cr3.  This may not be the current effective cr3, because
 155  * its update may be being lazily deferred.  However, a vcpu looking
 156  * at its own cr3 can use this value knowing that it everything will
 157  * be self-consistent.
 158  *
 159  * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 160  * hypercall to set the vcpu cr3 is complete (so it may be a little
 161  * out of date, but it will never be set early).  If one vcpu is
 162  * looking at another vcpu's cr3 value, it should use this variable.
 163  */
 164 DEFINE_PER_CPU(unsigned long, xen_cr3);  /* cr3 stored as physaddr */
 165 DEFINE_PER_CPU(unsigned long, xen_current_cr3);  /* actual vcpu cr3 */
 166
 167
 168 /*
 169  * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 170  * redzone above it, so round it up to a PGD boundary.
 171  */
 172 #define USER_LIMIT      ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
 173
 174
 175 #define P2M_ENTRIES_PER_PAGE    (PAGE_SIZE / sizeof(unsigned long))
 176 #define TOP_ENTRIES             (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
 177
 178 /* Placeholder for holes in the address space */
 179 static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
 180                 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
 181
 182  /* Array of pointers to pages containing p2m entries */
 183 static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
 184                 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
 185
 186 /* Arrays of p2m arrays expressed in mfns used for save/restore */
 187 static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
 188
 189 static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
 190         __page_aligned_bss;
 191
 192 static inline unsigned p2m_top_index(unsigned long pfn)
 193 {
 194         BUG_ON(pfn >= MAX_DOMAIN_PAGES);
 195         return pfn / P2M_ENTRIES_PER_PAGE;
 196 }
 197
 198 static inline unsigned p2m_index(unsigned long pfn)
 199 {
 200         return pfn % P2M_ENTRIES_PER_PAGE;
 201 }
 202
 203 /* Build the parallel p2m_top_mfn structures */
 204 void xen_build_mfn_list_list(void)
 205 {
 206         unsigned pfn, idx;
 207
 208         for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
 209                 unsigned topidx = p2m_top_index(pfn);
 210
 211                 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
 212         }
 213
 214         for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
 215                 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
 216                 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
 217         }
 218 }
 219
 220 void xen_setup_mfn_list_list(void)
 221 {
 222         BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 223
 224         HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
 225                 virt_to_mfn(p2m_top_mfn_list);
 226         HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
 227 }
 228
 229 /* Set up p2m_top to point to the domain-builder provided p2m pages */
 230 void __init xen_build_dynamic_phys_to_machine(void)
 231 {
 232         unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
 233         unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
 234         unsigned pfn;
 235
 236         for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
 237                 unsigned topidx = p2m_top_index(pfn);
 238
 239                 p2m_top[topidx] = &mfn_list[pfn];
 240         }
 241
 242         xen_build_mfn_list_list();
 243 }
 244
 245 unsigned long get_phys_to_machine(unsigned long pfn)
 246 {
 247         unsigned topidx, idx;
 248
 249         if (unlikely(pfn >= MAX_DOMAIN_PAGES))
 250                 return INVALID_P2M_ENTRY;
 251
 252         topidx = p2m_top_index(pfn);
 253         idx = p2m_index(pfn);
 254         return p2m_top[topidx][idx];
 255 }
 256 EXPORT_SYMBOL_GPL(get_phys_to_machine);
 257
 258 /* install a  new p2m_top page */
 259 bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
 260 {
 261         unsigned topidx = p2m_top_index(pfn);
 262         unsigned long **pfnp, *mfnp;
 263         unsigned i;
 264
 265         pfnp = &p2m_top[topidx];
 266         mfnp = &p2m_top_mfn[topidx];
 267
 268         for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
 269                 p[i] = INVALID_P2M_ENTRY;
 270
 271         if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
 272                 *mfnp = virt_to_mfn(p);
 273                 return true;
 274         }
 275
 276         return false;
 277 }
 278
 279 static void alloc_p2m(unsigned long pfn)
 280 {
 281         unsigned long *p;
 282
 283         p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
 284         BUG_ON(p == NULL);
 285
 286         if (!install_p2mtop_page(pfn, p))
 287                 free_page((unsigned long)p);
 288 }
 289
 290 /* Try to install p2m mapping; fail if intermediate bits missing */
 291 bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 292 {
 293         unsigned topidx, idx;
 294
 295         if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
 296                 BUG_ON(mfn != INVALID_P2M_ENTRY);
 297                 return true;
 298         }
 299
 300         topidx = p2m_top_index(pfn);
 301         if (p2m_top[topidx] == p2m_missing) {
 302                 if (mfn == INVALID_P2M_ENTRY)
 303                         return true;
 304                 return false;
 305         }
 306
 307         idx = p2m_index(pfn);
 308         p2m_top[topidx][idx] = mfn;
 309
 310         return true;
 311 }
 312
 313 void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 314 {
 315         if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
 316                 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
 317                 return;
 318         }
 319
 320         if (unlikely(!__set_phys_to_machine(pfn, mfn)))  {
 321                 alloc_p2m(pfn);
 322
 323                 if (!__set_phys_to_machine(pfn, mfn))
 324                         BUG();
 325         }
 326 }
 327
 328 unsigned long arbitrary_virt_to_mfn(void *vaddr)
 329 {
 330         xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
 331
 332         return PFN_DOWN(maddr.maddr);
 333 }
 334
 335 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 336 {
 337         unsigned long address = (unsigned long)vaddr;
 338         unsigned int level;
 339         pte_t *pte;
 340         unsigned offset;
 341
 342         /*
 343          * if the PFN is in the linear mapped vaddr range, we can just use
 344          * the (quick) virt_to_machine() p2m lookup
 345          */
 346         if (virt_addr_valid(vaddr))
 347                 return virt_to_machine(vaddr);
 348
 349         /* otherwise we have to do a (slower) full page-table walk */
 350
 351         pte = lookup_address(address, &level);
 352         BUG_ON(pte == NULL);
 353         offset = address & ~PAGE_MASK;
 354         return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
 355 }
 356
 357 void make_lowmem_page_readonly(void *vaddr)
 358 {
 359         pte_t *pte, ptev;
 360         unsigned long address = (unsigned long)vaddr;
 361         unsigned int level;
 362
 363         pte = lookup_address(address, &level);
 364         if (pte == NULL)
 365                 return;         /* vaddr missing */
 366
 367         ptev = pte_wrprotect(*pte);
 368
 369         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 370                 BUG();
 371 }
 372
 373 void make_lowmem_page_readwrite(void *vaddr)
 374 {
 375         pte_t *pte, ptev;
 376         unsigned long address = (unsigned long)vaddr;
 377         unsigned int level;
 378
 379         pte = lookup_address(address, &level);
 380         if (pte == NULL)
 381                 return;         /* vaddr missing */
 382
 383         ptev = pte_mkwrite(*pte);
 384
 385         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 386                 BUG();
 387 }
 388
 389
 390 static bool xen_page_pinned(void *ptr)
 391 {
 392         struct page *page = virt_to_page(ptr);
 393
 394         return PagePinned(page);
 395 }
 396
 397 static bool xen_iomap_pte(pte_t pte)
 398 {
 399         return pte_flags(pte) & _PAGE_IOMAP;
 400 }
 401
 402 static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
 403 {
 404         struct multicall_space mcs;
 405         struct mmu_update *u;
 406
 407         mcs = xen_mc_entry(sizeof(*u));
 408         u = mcs.args;
 409
 410         /* ptep might be kmapped when using 32-bit HIGHPTE */
 411         u->ptr = arbitrary_virt_to_machine(ptep).maddr;
 412         u->val = pte_val_ma(pteval);
 413
 414         MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO);
 415
 416         xen_mc_issue(PARAVIRT_LAZY_MMU);
 417 }
 418
 419 static void xen_extend_mmu_update(const struct mmu_update *update)
 420 {
 421         struct multicall_space mcs;
 422         struct mmu_update *u;
 423
 424         mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
 425
 426         if (mcs.mc != NULL) {
 427                 ADD_STATS(mmu_update_extended, 1);
 428                 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
 429
 430                 mcs.mc->args[1]++;
 431
 432                 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
 433                         ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
 434                 else
 435                         ADD_STATS(mmu_update_histo[0], 1);
 436         } else {
 437                 ADD_STATS(mmu_update, 1);
 438                 mcs = __xen_mc_entry(sizeof(*u));
 439                 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 440                 ADD_STATS(mmu_update_histo[1], 1);
 441         }
 442
 443         u = mcs.args;
 444         *u = *update;
 445 }
 446
 447 void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 448 {
 449         struct mmu_update u;
 450
 451         preempt_disable();
 452
 453         xen_mc_batch();
 454
 455         /* ptr may be ioremapped for 64-bit pagetable setup */
 456         u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 457         u.val = pmd_val_ma(val);
 458         xen_extend_mmu_update(&u);
 459
 460         ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 461
 462         xen_mc_issue(PARAVIRT_LAZY_MMU);
 463
 464         preempt_enable();
 465 }
 466
 467 void xen_set_pmd(pmd_t *ptr, pmd_t val)
 468 {
 469         ADD_STATS(pmd_update, 1);
 470
 471         /* If page is not pinned, we can just update the entry
 472            directly */
 473         if (!xen_page_pinned(ptr)) {
 474                 *ptr = val;
 475                 return;
 476         }
 477
 478         ADD_STATS(pmd_update_pinned, 1);
 479
 480         xen_set_pmd_hyper(ptr, val);
 481 }
 482
 483 /*
 484  * Associate a virtual page frame with a given physical page frame
 485  * and protection flags for that frame.
 486  */
 487 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 488 {
 489         set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
 490 }
 491
 492 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 493                     pte_t *ptep, pte_t pteval)
 494 {
 495         if (xen_iomap_pte(pteval)) {
 496                 xen_set_iomap_pte(ptep, pteval);
 497                 goto out;
 498         }
 499
 500         ADD_STATS(set_pte_at, 1);
 501 //      ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
 502         ADD_STATS(set_pte_at_current, mm == current->mm);
 503         ADD_STATS(set_pte_at_kernel, mm == &init_mm);
 504
 505         if (mm == current->mm || mm == &init_mm) {
 506                 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
 507                         struct multicall_space mcs;
 508                         mcs = xen_mc_entry(0);
 509
 510                         MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
 511                         ADD_STATS(set_pte_at_batched, 1);
 512                         xen_mc_issue(PARAVIRT_LAZY_MMU);
 513                         goto out;
 514                 } else
 515                         if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
 516                                 goto out;
 517         }
 518         xen_set_pte(ptep, pteval);
 519
 520 out:    return;
 521 }
 522
 523 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
 524                                  unsigned long addr, pte_t *ptep)
 525 {
 526         /* Just return the pte as-is.  We preserve the bits on commit */
 527         return *ptep;
 528 }
 529
 530 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 531                                  pte_t *ptep, pte_t pte)
 532 {
 533         struct mmu_update u;
 534
 535         xen_mc_batch();
 536
 537         u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
 538         u.val = pte_val_ma(pte);
 539         xen_extend_mmu_update(&u);
 540
 541         ADD_STATS(prot_commit, 1);
 542         ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 543
 544         xen_mc_issue(PARAVIRT_LAZY_MMU);
 545 }
 546
 547 /* Assume pteval_t is equivalent to all the other *val_t types. */
 548 static pteval_t pte_mfn_to_pfn(pteval_t val)
 549 {
 550         if (val & _PAGE_PRESENT) {
 551                 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 552                 pteval_t flags = val & PTE_FLAGS_MASK;
 553                 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
 554         }
 555
 556         return val;
 557 }
 558
 559 static pteval_t pte_pfn_to_mfn(pteval_t val)
 560 {
 561         if (val & _PAGE_PRESENT) {
 562                 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 563                 pteval_t flags = val & PTE_FLAGS_MASK;
 564                 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
 565         }
 566
 567         return val;
 568 }
 569
 570 static pteval_t iomap_pte(pteval_t val)
 571 {
 572         if (val & _PAGE_PRESENT) {
 573                 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 574                 pteval_t flags = val & PTE_FLAGS_MASK;
 575
 576                 /* We assume the pte frame number is a MFN, so
 577                    just use it as-is. */
 578                 val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
 579         }
 580
 581         return val;
 582 }
 583
 584 pteval_t xen_pte_val(pte_t pte)
 585 {
 586         if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP))
 587                 return pte.pte;
 588
 589         return pte_mfn_to_pfn(pte.pte);
 590 }
 591 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
 592
 593 pgdval_t xen_pgd_val(pgd_t pgd)
 594 {
 595         return pte_mfn_to_pfn(pgd.pgd);
 596 }
 597 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
 598
 599 pte_t xen_make_pte(pteval_t pte)
 600 {
 601         phys_addr_t addr = (pte & PTE_PFN_MASK);
 602
 603         /*
 604          * Unprivileged domains are allowed to do IOMAPpings for
 605          * PCI passthrough, but not map ISA space.  The ISA
 606          * mappings are just dummy local mappings to keep other
 607          * parts of the kernel happy.
 608          */
 609         if (unlikely(pte & _PAGE_IOMAP) &&
 610             (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
 611                 pte = iomap_pte(pte);
 612         } else {
 613                 pte &= ~_PAGE_IOMAP;
 614                 pte = pte_pfn_to_mfn(pte);
 615         }
 616
 617         return native_make_pte(pte);
 618 }
 619 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 620
 621 pgd_t xen_make_pgd(pgdval_t pgd)
 622 {
 623         pgd = pte_pfn_to_mfn(pgd);
 624         return native_make_pgd(pgd);
 625 }
 626 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
 627
 628 pmdval_t xen_pmd_val(pmd_t pmd)
 629 {
 630         return pte_mfn_to_pfn(pmd.pmd);
 631 }
 632 PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
 633
 634 void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 635 {
 636         struct mmu_update u;
 637
 638         preempt_disable();
 639
 640         xen_mc_batch();
 641
 642         /* ptr may be ioremapped for 64-bit pagetable setup */
 643         u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 644         u.val = pud_val_ma(val);
 645         xen_extend_mmu_update(&u);
 646
 647         ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 648
 649         xen_mc_issue(PARAVIRT_LAZY_MMU);
 650
 651         preempt_enable();
 652 }
 653
 654 void xen_set_pud(pud_t *ptr, pud_t val)
 655 {
 656         ADD_STATS(pud_update, 1);
 657
 658         /* If page is not pinned, we can just update the entry
 659            directly */
 660         if (!xen_page_pinned(ptr)) {
 661                 *ptr = val;
 662                 return;
 663         }
 664
 665         ADD_STATS(pud_update_pinned, 1);
 666
 667         xen_set_pud_hyper(ptr, val);
 668 }
 669
 670 void xen_set_pte(pte_t *ptep, pte_t pte)
 671 {
 672         if (xen_iomap_pte(pte)) {
 673                 xen_set_iomap_pte(ptep, pte);
 674                 return;
 675         }
 676
 677         ADD_STATS(pte_update, 1);
 678 //      ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
 679         ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 680
 681 #ifdef CONFIG_X86_PAE
 682         ptep->pte_high = pte.pte_high;
 683         smp_wmb();
 684         ptep->pte_low = pte.pte_low;
 685 #else
 686         *ptep = pte;
 687 #endif
 688 }
 689
 690 #ifdef CONFIG_X86_PAE
 691 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 692 {
 693         if (xen_iomap_pte(pte)) {
 694                 xen_set_iomap_pte(ptep, pte);
 695                 return;
 696         }
 697
 698         set_64bit((u64 *)ptep, native_pte_val(pte));
 699 }
 700
 701 void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 702 {
 703         ptep->pte_low = 0;
 704         smp_wmb();              /* make sure low gets written first */
 705         ptep->pte_high = 0;
 706 }
 707
 708 void xen_pmd_clear(pmd_t *pmdp)
 709 {
 710         set_pmd(pmdp, __pmd(0));
 711 }
 712 #endif  /* CONFIG_X86_PAE */
 713
 714 pmd_t xen_make_pmd(pmdval_t pmd)
 715 {
 716         pmd = pte_pfn_to_mfn(pmd);
 717         return native_make_pmd(pmd);
 718 }
 719 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 720
 721 #if PAGETABLE_LEVELS == 4
 722 pudval_t xen_pud_val(pud_t pud)
 723 {
 724         return pte_mfn_to_pfn(pud.pud);
 725 }
 726 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
 727
 728 pud_t xen_make_pud(pudval_t pud)
 729 {
 730         pud = pte_pfn_to_mfn(pud);
 731
 732         return native_make_pud(pud);
 733 }
 734 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
 735
 736 pgd_t *xen_get_user_pgd(pgd_t *pgd)
 737 {
 738         pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
 739         unsigned offset = pgd - pgd_page;
 740         pgd_t *user_ptr = NULL;
 741
 742         if (offset < pgd_index(USER_LIMIT)) {
 743                 struct page *page = virt_to_page(pgd_page);
 744                 user_ptr = (pgd_t *)page->private;
 745                 if (user_ptr)
 746                         user_ptr += offset;
 747         }
 748
 749         return user_ptr;
 750 }
 751
 752 static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 753 {
 754         struct mmu_update u;
 755
 756         u.ptr = virt_to_machine(ptr).maddr;
 757         u.val = pgd_val_ma(val);
 758         xen_extend_mmu_update(&u);
 759 }
 760
 761 /*
 762  * Raw hypercall-based set_pgd, intended for in early boot before
 763  * there's a page structure.  This implies:
 764  *  1. The only existing pagetable is the kernel's
 765  *  2. It is always pinned
 766  *  3. It has no user pagetable attached to it
 767  */
 768 void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 769 {
 770         preempt_disable();
 771
 772         xen_mc_batch();
 773
 774         __xen_set_pgd_hyper(ptr, val);
 775
 776         xen_mc_issue(PARAVIRT_LAZY_MMU);
 777
 778         preempt_enable();
 779 }
 780
 781 void xen_set_pgd(pgd_t *ptr, pgd_t val)
 782 {
 783         pgd_t *user_ptr = xen_get_user_pgd(ptr);
 784
 785         ADD_STATS(pgd_update, 1);
 786
 787         /* If page is not pinned, we can just update the entry
 788            directly */
 789         if (!xen_page_pinned(ptr)) {
 790                 *ptr = val;
 791                 if (user_ptr) {
 792                         WARN_ON(xen_page_pinned(user_ptr));
 793                         *user_ptr = val;
 794                 }
 795                 return;
 796         }
 797
 798         ADD_STATS(pgd_update_pinned, 1);
 799         ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 800
 801         /* If it's pinned, then we can at least batch the kernel and
 802            user updates together. */
 803         xen_mc_batch();
 804
 805         __xen_set_pgd_hyper(ptr, val);
 806         if (user_ptr)
 807                 __xen_set_pgd_hyper(user_ptr, val);
 808
 809         xen_mc_issue(PARAVIRT_LAZY_MMU);
 810 }
 811 #endif  /* PAGETABLE_LEVELS == 4 */
 812
 813 /*
 814  * (Yet another) pagetable walker.  This one is intended for pinning a
 815  * pagetable.  This means that it walks a pagetable and calls the
 816  * callback function on each page it finds making up the page table,
 817  * at every level.  It walks the entire pagetable, but it only bothers
 818  * pinning pte pages which are below limit.  In the normal case this
 819  * will be STACK_TOP_MAX, but at boot we need to pin up to
 820  * FIXADDR_TOP.
 821  *
 822  * For 32-bit the important bit is that we don't pin beyond there,
 823  * because then we start getting into Xen's ptes.
 824  *
 825  * For 64-bit, we must skip the Xen hole in the middle of the address
 826  * space, just after the big x86-64 virtual hole.
 827  */
 828 static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
 829                           int (*func)(struct mm_struct *mm, struct page *,
 830                                       enum pt_level),
 831                           unsigned long limit)
 832 {
 833         int flush = 0;
 834         unsigned hole_low, hole_high;
 835         unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
 836         unsigned pgdidx, pudidx, pmdidx;
 837
 838         /* The limit is the last byte to be touched */
 839         limit--;
 840         BUG_ON(limit >= FIXADDR_TOP);
 841
 842         if (xen_feature(XENFEAT_auto_translated_physmap))
 843                 return 0;
 844
 845         /*
 846          * 64-bit has a great big hole in the middle of the address
 847          * space, which contains the Xen mappings.  On 32-bit these
 848          * will end up making a zero-sized hole and so is a no-op.
 849          */
 850         hole_low = pgd_index(USER_LIMIT);
 851         hole_high = pgd_index(PAGE_OFFSET);
 852
 853         pgdidx_limit = pgd_index(limit);
 854 #if PTRS_PER_PUD > 1
 855         pudidx_limit = pud_index(limit);
 856 #else
 857         pudidx_limit = 0;
 858 #endif
 859 #if PTRS_PER_PMD > 1
 860         pmdidx_limit = pmd_index(limit);
 861 #else
 862         pmdidx_limit = 0;
 863 #endif
 864
 865         for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
 866                 pud_t *pud;
 867
 868                 if (pgdidx >= hole_low && pgdidx < hole_high)
 869                         continue;
 870
 871                 if (!pgd_val(pgd[pgdidx]))
 872                         continue;
 873
 874                 pud = pud_offset(&pgd[pgdidx], 0);
 875
 876                 if (PTRS_PER_PUD > 1) /* not folded */
 877                         flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
 878
 879                 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
 880                         pmd_t *pmd;
 881
 882                         if (pgdidx == pgdidx_limit &&
 883                             pudidx > pudidx_limit)
 884                                 goto out;
 885
 886                         if (pud_none(pud[pudidx]))
 887                                 continue;
 888
 889                         pmd = pmd_offset(&pud[pudidx], 0);
 890
 891                         if (PTRS_PER_PMD > 1) /* not folded */
 892                                 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
 893
 894                         for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
 895                                 struct page *pte;
 896
 897                                 if (pgdidx == pgdidx_limit &&
 898                                     pudidx == pudidx_limit &&
 899                                     pmdidx > pmdidx_limit)
 900                                         goto out;
 901
 902                                 if (pmd_none(pmd[pmdidx]))
 903                                         continue;
 904
 905                                 pte = pmd_page(pmd[pmdidx]);
 906                                 flush |= (*func)(mm, pte, PT_PTE);
 907                         }
 908                 }
 909         }
 910
 911 out:
 912         /* Do the top level last, so that the callbacks can use it as
 913            a cue to do final things like tlb flushes. */
 914         flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
 915
 916         return flush;
 917 }
 918
 919 static int xen_pgd_walk(struct mm_struct *mm,
 920                         int (*func)(struct mm_struct *mm, struct page *,
 921                                     enum pt_level),
 922                         unsigned long limit)
 923 {
 924         return __xen_pgd_walk(mm, mm->pgd, func, limit);
 925 }
 926
 927 /* If we're using split pte locks, then take the page's lock and
 928    return a pointer to it.  Otherwise return NULL. */
 929 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
 930 {
 931         spinlock_t *ptl = NULL;
 932
 933 #if USE_SPLIT_PTLOCKS
 934         ptl = __pte_lockptr(page);
 935         spin_lock_nest_lock(ptl, &mm->page_table_lock);
 936 #endif
 937
 938         return ptl;
 939 }
 940
 941 static void xen_pte_unlock(void *v)
 942 {
 943         spinlock_t *ptl = v;
 944         spin_unlock(ptl);
 945 }
 946
 947 static void xen_do_pin(unsigned level, unsigned long pfn)
 948 {
 949         struct mmuext_op *op;
 950         struct multicall_space mcs;
 951
 952         mcs = __xen_mc_entry(sizeof(*op));
 953         op = mcs.args;
 954         op->cmd = level;
 955         op->arg1.mfn = pfn_to_mfn(pfn);
 956         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 957 }
 958
 959 static int xen_pin_page(struct mm_struct *mm, struct page *page,
 960                         enum pt_level level)
 961 {
 962         unsigned pgfl = TestSetPagePinned(page);
 963         int flush;
 964
 965         if (pgfl)
 966                 flush = 0;              /* already pinned */
 967         else if (PageHighMem(page))
 968                 /* kmaps need flushing if we found an unpinned
 969                    highpage */
 970                 flush = 1;
 971         else {
 972                 void *pt = lowmem_page_address(page);
 973                 unsigned long pfn = page_to_pfn(page);
 974                 struct multicall_space mcs = __xen_mc_entry(0);
 975                 spinlock_t *ptl;
 976
 977                 flush = 0;
 978
 979                 /*
 980                  * We need to hold the pagetable lock between the time
 981                  * we make the pagetable RO and when we actually pin
 982                  * it.  If we don't, then other users may come in and
 983                  * attempt to update the pagetable by writing it,
 984                  * which will fail because the memory is RO but not
 985                  * pinned, so Xen won't do the trap'n'emulate.
 986                  *
 987                  * If we're using split pte locks, we can't hold the
 988                  * entire pagetable's worth of locks during the
 989                  * traverse, because we may wrap the preempt count (8
 990                  * bits).  The solution is to mark RO and pin each PTE
 991                  * page while holding the lock.  This means the number
 992                  * of locks we end up holding is never more than a
 993                  * batch size (~32 entries, at present).
 994                  *
 995                  * If we're not using split pte locks, we needn't pin
 996                  * the PTE pages independently, because we're
 997                  * protected by the overall pagetable lock.
 998                  */
 999                 ptl = NULL;
1000                 if (level == PT_PTE)
1001                         ptl = xen_pte_lock(page, mm);
1002
1003                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1004                                         pfn_pte(pfn, PAGE_KERNEL_RO),
1005                                         level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1006
1007                 if (ptl) {
1008                         xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
1009
1010                         /* Queue a deferred unlock for when this batch
1011                            is completed. */
1012                         xen_mc_callback(xen_pte_unlock, ptl);
1013                 }
1014         }
1015
1016         return flush;
1017 }
1018
1019 /* This is called just after a mm has been created, but it has not
1020    been used yet.  We need to make sure that its pagetable is all
1021    read-only, and can be pinned. */
1022 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
1023 {
1024         xen_mc_batch();
1025
1026         if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
1027                 /* re-enable interrupts for flushing */
1028                 xen_mc_issue(0);
1029
1030                 kmap_flush_unused();
1031
1032                 xen_mc_batch();
1033         }
1034
1035 #ifdef CONFIG_X86_64
1036         {
1037                 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1038
1039                 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
1040
1041                 if (user_pgd) {
1042                         xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
1043                         xen_do_pin(MMUEXT_PIN_L4_TABLE,
1044                                    PFN_DOWN(__pa(user_pgd)));
1045                 }
1046         }
1047 #else /* CONFIG_X86_32 */
1048 #ifdef CONFIG_X86_PAE
1049         /* Need to make sure unshared kernel PMD is pinnable */
1050         xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
1051                      PT_PMD);
1052 #endif
1053         xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
1054 #endif /* CONFIG_X86_64 */
1055         xen_mc_issue(0);
1056 }
1057
1058 static void xen_pgd_pin(struct mm_struct *mm)
1059 {
1060         __xen_pgd_pin(mm, mm->pgd);
1061 }
1062
1063 /*
1064  * On save, we need to pin all pagetables to make sure they get their
1065  * mfns turned into pfns.  Search the list for any unpinned pgds and pin
1066  * them (unpinned pgds are not currently in use, probably because the
1067  * process is under construction or destruction).
1068  *
1069  * Expected to be called in stop_machine() ("equivalent to taking
1070  * every spinlock in the system"), so the locking doesn't really
1071  * matter all that much.
1072  */
1073 void xen_mm_pin_all(void)
1074 {
1075         unsigned long flags;
1076         struct page *page;
1077
1078         spin_lock_irqsave(&pgd_lock, flags);
1079
1080         list_for_each_entry(page, &pgd_list, lru) {
1081                 if (!PagePinned(page)) {
1082                         __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
1083                         SetPageSavePinned(page);
1084                 }
1085         }
1086
1087         spin_unlock_irqrestore(&pgd_lock, flags);
1088 }
1089
1090 /*
1091  * The init_mm pagetable is really pinned as soon as its created, but
1092  * that's before we have page structures to store the bits.  So do all
1093  * the book-keeping now.
1094  */
1095 static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
1096                                   enum pt_level level)
1097 {
1098         SetPagePinned(page);
1099         return 0;
1100 }
1101
1102 static void __init xen_mark_init_mm_pinned(void)
1103 {
1104         xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
1105 }
1106
1107 static int xen_unpin_page(struct mm_struct *mm, struct page *page,
1108                           enum pt_level level)
1109 {
1110         unsigned pgfl = TestClearPagePinned(page);
1111
1112         if (pgfl && !PageHighMem(page)) {
1113                 void *pt = lowmem_page_address(page);
1114                 unsigned long pfn = page_to_pfn(page);
1115                 spinlock_t *ptl = NULL;
1116                 struct multicall_space mcs;
1117
1118                 /*
1119                  * Do the converse to pin_page.  If we're using split
1120                  * pte locks, we must be holding the lock for while
1121                  * the pte page is unpinned but still RO to prevent
1122                  * concurrent updates from seeing it in this
1123                  * partially-pinned state.
1124                  */
1125                 if (level == PT_PTE) {
1126                         ptl = xen_pte_lock(page, mm);
1127
1128                         if (ptl)
1129                                 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
1130                 }
1131
1132                 mcs = __xen_mc_entry(0);
1133
1134                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1135                                         pfn_pte(pfn, PAGE_KERNEL),
1136                                         level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1137
1138                 if (ptl) {
1139                         /* unlock when batch completed */
1140                         xen_mc_callback(xen_pte_unlock, ptl);
1141                 }
1142         }
1143
1144         return 0;               /* never need to flush on unpin */
1145 }
1146
1147 /* Release a pagetables pages back as normal RW */
1148 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
1149 {
1150         xen_mc_batch();
1151
1152         xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1153
1154 #ifdef CONFIG_X86_64
1155         {
1156                 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1157
1158                 if (user_pgd) {
1159                         xen_do_pin(MMUEXT_UNPIN_TABLE,
1160                                    PFN_DOWN(__pa(user_pgd)));
1161                         xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
1162                 }
1163         }
1164 #endif
1165
1166 #ifdef CONFIG_X86_PAE
1167         /* Need to make sure unshared kernel PMD is unpinned */
1168         xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
1169                        PT_PMD);
1170 #endif
1171
1172         __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
1173
1174         xen_mc_issue(0);
1175 }
1176
1177 static void xen_pgd_unpin(struct mm_struct *mm)
1178 {
1179         __xen_pgd_unpin(mm, mm->pgd);
1180 }
1181
1182 /*
1183  * On resume, undo any pinning done at save, so that the rest of the
1184  * kernel doesn't see any unexpected pinned pagetables.
1185  */
1186 void xen_mm_unpin_all(void)
1187 {
1188         unsigned long flags;
1189         struct page *page;
1190
1191         spin_lock_irqsave(&pgd_lock, flags);
1192
1193         list_for_each_entry(page, &pgd_list, lru) {
1194                 if (PageSavePinned(page)) {
1195                         BUG_ON(!PagePinned(page));
1196                         __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
1197                         ClearPageSavePinned(page);
1198                 }
1199         }
1200
1201         spin_unlock_irqrestore(&pgd_lock, flags);
1202 }
1203
1204 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1205 {
1206         spin_lock(&next->page_table_lock);
1207         xen_pgd_pin(next);
1208         spin_unlock(&next->page_table_lock);
1209 }
1210
1211 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1212 {
1213         spin_lock(&mm->page_table_lock);
1214         xen_pgd_pin(mm);
1215         spin_unlock(&mm->page_table_lock);
1216 }
1217
1218
1219 #ifdef CONFIG_SMP
1220 /* Another cpu may still have their %cr3 pointing at the pagetable, so
1221    we need to repoint it somewhere else before we can unpin it. */
1222 static void drop_other_mm_ref(void *info)
1223 {
1224         struct mm_struct *mm = info;
1225         struct mm_struct *active_mm;
1226
1227         active_mm = percpu_read(cpu_tlbstate.active_mm);
1228
1229         if (active_mm == mm)
1230                 leave_mm(smp_processor_id());
1231
1232         /* If this cpu still has a stale cr3 reference, then make sure
1233            it has been flushed. */
1234         if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
1235                 load_cr3(swapper_pg_dir);
1236 }
1237
1238 static void xen_drop_mm_ref(struct mm_struct *mm)
1239 {
1240         cpumask_var_t mask;
1241         unsigned cpu;
1242
1243         if (current->active_mm == mm) {
1244                 if (current->mm == mm)
1245                         load_cr3(swapper_pg_dir);
1246                 else
1247                         leave_mm(smp_processor_id());
1248         }
1249
1250         /* Get the "official" set of cpus referring to our pagetable. */
1251         if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1252                 for_each_online_cpu(cpu) {
1253                         if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1254                             && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1255                                 continue;
1256                         smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1257                 }
1258                 return;
1259         }
1260         cpumask_copy(mask, mm_cpumask(mm));
1261
1262         /* It's possible that a vcpu may have a stale reference to our
1263            cr3, because its in lazy mode, and it hasn't yet flushed
1264            its set of pending hypercalls yet.  In this case, we can
1265            look at its actual current cr3 value, and force it to flush
1266            if needed. */
1267         for_each_online_cpu(cpu) {
1268                 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1269                         cpumask_set_cpu(cpu, mask);
1270         }
1271
1272         if (!cpumask_empty(mask))
1273                 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1274         free_cpumask_var(mask);
1275 }
1276 #else
1277 static void xen_drop_mm_ref(struct mm_struct *mm)
1278 {
1279         if (current->active_mm == mm)
1280                 load_cr3(swapper_pg_dir);
1281 }
1282 #endif
1283
1284 /*
1285  * While a process runs, Xen pins its pagetables, which means that the
1286  * hypervisor forces it to be read-only, and it controls all updates
1287  * to it.  This means that all pagetable updates have to go via the
1288  * hypervisor, which is moderately expensive.
1289  *
1290  * Since we're pulling the pagetable down, we switch to use init_mm,
1291  * unpin old process pagetable and mark it all read-write, which
1292  * allows further operations on it to be simple memory accesses.
1293  *
1294  * The only subtle point is that another CPU may be still using the
1295  * pagetable because of lazy tlb flushing.  This means we need need to
1296  * switch all CPUs off this pagetable before we can unpin it.
1297  */
1298 void xen_exit_mmap(struct mm_struct *mm)
1299 {
1300         get_cpu();              /* make sure we don't move around */
1301         xen_drop_mm_ref(mm);
1302         put_cpu();
1303
1304         spin_lock(&mm->page_table_lock);
1305
1306         /* pgd may not be pinned in the error exit path of execve */
1307         if (xen_page_pinned(mm->pgd))
1308                 xen_pgd_unpin(mm);
1309
1310         spin_unlock(&mm->page_table_lock);
1311 }
1312
1313 static __init void xen_pagetable_setup_start(pgd_t *base)
1314 {
1315 }
1316
1317 static void xen_post_allocator_init(void);
1318
1319 static __init void xen_pagetable_setup_done(pgd_t *base)
1320 {
1321         xen_setup_shared_info();
1322         xen_post_allocator_init();
1323 }
1324
1325 static void xen_write_cr2(unsigned long cr2)
1326 {
1327         percpu_read(xen_vcpu)->arch.cr2 = cr2;
1328 }
1329
1330 static unsigned long xen_read_cr2(void)
1331 {
1332         return percpu_read(xen_vcpu)->arch.cr2;
1333 }
1334
1335 unsigned long xen_read_cr2_direct(void)
1336 {
1337         return percpu_read(xen_vcpu_info.arch.cr2);
1338 }
1339
1340 static void xen_flush_tlb(void)
1341 {
1342         struct mmuext_op *op;
1343         struct multicall_space mcs;
1344
1345         preempt_disable();
1346
1347         mcs = xen_mc_entry(sizeof(*op));
1348
1349         op = mcs.args;
1350         op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1351         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1352
1353         xen_mc_issue(PARAVIRT_LAZY_MMU);
1354
1355         preempt_enable();
1356 }
1357
1358 static void xen_flush_tlb_single(unsigned long addr)
1359 {
1360         struct mmuext_op *op;
1361         struct multicall_space mcs;
1362
1363         preempt_disable();
1364
1365         mcs = xen_mc_entry(sizeof(*op));
1366         op = mcs.args;
1367         op->cmd = MMUEXT_INVLPG_LOCAL;
1368         op->arg1.linear_addr = addr & PAGE_MASK;
1369         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1370
1371         xen_mc_issue(PARAVIRT_LAZY_MMU);
1372
1373         preempt_enable();
1374 }
1375
1376 static void xen_flush_tlb_others(const struct cpumask *cpus,
1377                                  struct mm_struct *mm, unsigned long va)
1378 {
1379         struct {
1380                 struct mmuext_op op;
1381                 DECLARE_BITMAP(mask, NR_CPUS);
1382         } *args;
1383         struct multicall_space mcs;
1384
1385         if (cpumask_empty(cpus))
1386                 return;         /* nothing to do */
1387
1388         mcs = xen_mc_entry(sizeof(*args));
1389         args = mcs.args;
1390         args->op.arg2.vcpumask = to_cpumask(args->mask);
1391
1392         /* Remove us, and any offline CPUS. */
1393         cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1394         cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1395
1396         if (va == TLB_FLUSH_ALL) {
1397                 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1398         } else {
1399                 args->op.cmd = MMUEXT_INVLPG_MULTI;
1400                 args->op.arg1.linear_addr = va;
1401         }
1402
1403         MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1404
1405         xen_mc_issue(PARAVIRT_LAZY_MMU);
1406 }
1407
1408 static unsigned long xen_read_cr3(void)
1409 {
1410         return percpu_read(xen_cr3);
1411 }
1412
1413 static void set_current_cr3(void *v)
1414 {
1415         percpu_write(xen_current_cr3, (unsigned long)v);
1416 }
1417
1418 static void __xen_write_cr3(bool kernel, unsigned long cr3)
1419 {
1420         struct mmuext_op *op;
1421         struct multicall_space mcs;
1422         unsigned long mfn;
1423
1424         if (cr3)
1425                 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1426         else
1427                 mfn = 0;
1428
1429         WARN_ON(mfn == 0 && kernel);
1430
1431         mcs = __xen_mc_entry(sizeof(*op));
1432
1433         op = mcs.args;
1434         op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1435         op->arg1.mfn = mfn;
1436
1437         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1438
1439         if (kernel) {
1440                 percpu_write(xen_cr3, cr3);
1441
1442                 /* Update xen_current_cr3 once the batch has actually
1443                    been submitted. */
1444                 xen_mc_callback(set_current_cr3, (void *)cr3);
1445         }
1446 }
1447
1448 static void xen_write_cr3(unsigned long cr3)
1449 {
1450         BUG_ON(preemptible());
1451
1452         xen_mc_batch();  /* disables interrupts */
1453
1454         /* Update while interrupts are disabled, so its atomic with
1455            respect to ipis */
1456         percpu_write(xen_cr3, cr3);
1457
1458         __xen_write_cr3(true, cr3);
1459
1460 #ifdef CONFIG_X86_64
1461         {
1462                 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1463                 if (user_pgd)
1464                         __xen_write_cr3(false, __pa(user_pgd));
1465                 else
1466                         __xen_write_cr3(false, 0);
1467         }
1468 #endif
1469
1470         xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1471 }
1472
1473 static int xen_pgd_alloc(struct mm_struct *mm)
1474 {
1475         pgd_t *pgd = mm->pgd;
1476         int ret = 0;
1477
1478         BUG_ON(PagePinned(virt_to_page(pgd)));
1479
1480 #ifdef CONFIG_X86_64
1481         {
1482                 struct page *page = virt_to_page(pgd);
1483                 pgd_t *user_pgd;
1484
1485                 BUG_ON(page->private != 0);
1486
1487                 ret = -ENOMEM;
1488
1489                 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1490                 page->private = (unsigned long)user_pgd;
1491
1492                 if (user_pgd != NULL) {
1493                         user_pgd[pgd_index(VSYSCALL_START)] =
1494                                 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1495                         ret = 0;
1496                 }
1497
1498                 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1499         }
1500 #endif
1501
1502         return ret;
1503 }
1504
1505 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1506 {
1507 #ifdef CONFIG_X86_64
1508         pgd_t *user_pgd = xen_get_user_pgd(pgd);
1509
1510         if (user_pgd)
1511                 free_page((unsigned long)user_pgd);
1512 #endif
1513 }
1514
1515 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1516 {
1517         unsigned long pfn = pte_pfn(pte);
1518
1519 #ifdef CONFIG_X86_32
1520         /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1521         if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1522                 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1523                                pte_val_ma(pte));
1524 #endif
1525
1526         /*
1527          * If the new pfn is within the range of the newly allocated
1528          * kernel pagetable, and it isn't being mapped into an
1529          * early_ioremap fixmap slot, make sure it is RO.
1530          */
1531         if (!is_early_ioremap_ptep(ptep) &&
1532             pfn >= e820_table_start && pfn < e820_table_end)
1533                 pte = pte_wrprotect(pte);
1534
1535         return pte;
1536 }
1537
1538 /* Init-time set_pte while constructing initial pagetables, which
1539    doesn't allow RO pagetable pages to be remapped RW */
1540 static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1541 {
1542         pte = mask_rw_pte(ptep, pte);
1543
1544         xen_set_pte(ptep, pte);
1545 }
1546
1547 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1548 {
1549         struct mmuext_op op;
1550         op.cmd = cmd;
1551         op.arg1.mfn = pfn_to_mfn(pfn);
1552         if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1553                 BUG();
1554 }
1555
1556 /* Early in boot, while setting up the initial pagetable, assume
1557    everything is pinned. */
1558 static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1559 {
1560 #ifdef CONFIG_FLATMEM
1561         BUG_ON(mem_map);        /* should only be used early */
1562 #endif
1563         make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1564         pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1565 }
1566
1567 /* Used for pmd and pud */
1568 static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1569 {
1570 #ifdef CONFIG_FLATMEM
1571         BUG_ON(mem_map);        /* should only be used early */
1572 #endif
1573         make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1574 }
1575
1576 /* Early release_pte assumes that all pts are pinned, since there's
1577    only init_mm and anything attached to that is pinned. */
1578 static __init void xen_release_pte_init(unsigned long pfn)
1579 {
1580         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1581         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1582 }
1583
1584 static __init void xen_release_pmd_init(unsigned long pfn)
1585 {
1586         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1587 }
1588
1589 /* This needs to make sure the new pte page is pinned iff its being
1590    attached to a pinned pagetable. */
1591 static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1592 {
1593         struct page *page = pfn_to_page(pfn);
1594
1595         if (PagePinned(virt_to_page(mm->pgd))) {
1596                 SetPagePinned(page);
1597
1598                 if (!PageHighMem(page)) {
1599                         make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1600                         if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1601                                 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1602                 } else {
1603                         /* make sure there are no stray mappings of
1604                            this page */
1605                         kmap_flush_unused();
1606                 }
1607         }
1608 }
1609
1610 static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1611 {
1612         xen_alloc_ptpage(mm, pfn, PT_PTE);
1613 }
1614
1615 static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1616 {
1617         xen_alloc_ptpage(mm, pfn, PT_PMD);
1618 }
1619
1620 /* This should never happen until we're OK to use struct page */
1621 static void xen_release_ptpage(unsigned long pfn, unsigned level)
1622 {
1623         struct page *page = pfn_to_page(pfn);
1624
1625         if (PagePinned(page)) {
1626                 if (!PageHighMem(page)) {
1627                         if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1628                                 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1629                         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1630                 }
1631                 ClearPagePinned(page);
1632         }
1633 }
1634
1635 static void xen_release_pte(unsigned long pfn)
1636 {
1637         xen_release_ptpage(pfn, PT_PTE);
1638 }
1639
1640 static void xen_release_pmd(unsigned long pfn)
1641 {
1642         xen_release_ptpage(pfn, PT_PMD);
1643 }
1644
1645 #if PAGETABLE_LEVELS == 4
1646 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1647 {
1648         xen_alloc_ptpage(mm, pfn, PT_PUD);
1649 }
1650
1651 static void xen_release_pud(unsigned long pfn)
1652 {
1653         xen_release_ptpage(pfn, PT_PUD);
1654 }
1655 #endif
1656
1657 void __init xen_reserve_top(void)
1658 {
1659 #ifdef CONFIG_X86_32
1660         unsigned long top = HYPERVISOR_VIRT_START;
1661         struct xen_platform_parameters pp;
1662
1663         if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1664                 top = pp.virt_start;
1665
1666         reserve_top_address(-top);
1667 #endif  /* CONFIG_X86_32 */
1668 }
1669
1670 /*
1671  * Like __va(), but returns address in the kernel mapping (which is
1672  * all we have until the physical memory mapping has been set up.
1673  */
1674 static void *__ka(phys_addr_t paddr)
1675 {
1676 #ifdef CONFIG_X86_64
1677         return (void *)(paddr + __START_KERNEL_map);
1678 #else
1679         return __va(paddr);
1680 #endif
1681 }
1682
1683 /* Convert a machine address to physical address */
1684 static unsigned long m2p(phys_addr_t maddr)
1685 {
1686         phys_addr_t paddr;
1687
1688         maddr &= PTE_PFN_MASK;
1689         paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1690
1691         return paddr;
1692 }
1693
1694 /* Convert a machine address to kernel virtual */
1695 static void *m2v(phys_addr_t maddr)
1696 {
1697         return __ka(m2p(maddr));
1698 }
1699
1700 static void set_page_prot(void *addr, pgprot_t prot)
1701 {
1702         unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1703         pte_t pte = pfn_pte(pfn, prot);
1704
1705         if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1706                 BUG();
1707 }
1708
1709 static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1710 {
1711         unsigned pmdidx, pteidx;
1712         unsigned ident_pte;
1713         unsigned long pfn;
1714
1715         ident_pte = 0;
1716         pfn = 0;
1717         for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1718                 pte_t *pte_page;
1719
1720                 /* Reuse or allocate a page of ptes */
1721                 if (pmd_present(pmd[pmdidx]))
1722                         pte_page = m2v(pmd[pmdidx].pmd);
1723                 else {
1724                         /* Check for free pte pages */
1725                         if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1726                                 break;
1727
1728                         pte_page = &level1_ident_pgt[ident_pte];
1729                         ident_pte += PTRS_PER_PTE;
1730
1731                         pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1732                 }
1733
1734                 /* Install mappings */
1735                 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1736                         pte_t pte;
1737
1738                         if (pfn > max_pfn_mapped)
1739                                 max_pfn_mapped = pfn;
1740
1741                         if (!pte_none(pte_page[pteidx]))
1742                                 continue;
1743
1744                         pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1745                         pte_page[pteidx] = pte;
1746                 }
1747         }
1748
1749         for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1750                 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1751
1752         set_page_prot(pmd, PAGE_KERNEL_RO);
1753 }
1754
1755 #ifdef CONFIG_X86_64
1756 static void convert_pfn_mfn(void *v)
1757 {
1758         pte_t *pte = v;
1759         int i;
1760
1761         /* All levels are converted the same way, so just treat them
1762            as ptes. */
1763         for (i = 0; i < PTRS_PER_PTE; i++)
1764                 pte[i] = xen_make_pte(pte[i].pte);
1765 }
1766
1767 /*
1768  * Set up the inital kernel pagetable.
1769  *
1770  * We can construct this by grafting the Xen provided pagetable into
1771  * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
1772  * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
1773  * means that only the kernel has a physical mapping to start with -
1774  * but that's enough to get __va working.  We need to fill in the rest
1775  * of the physical mapping once some sort of allocator has been set
1776  * up.
1777  */
1778 __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1779                                          unsigned long max_pfn)
1780 {
1781         pud_t *l3;
1782         pmd_t *l2;
1783
1784         /* Zap identity mapping */
1785         init_level4_pgt[0] = __pgd(0);
1786
1787         /* Pre-constructed entries are in pfn, so convert to mfn */
1788         convert_pfn_mfn(init_level4_pgt);
1789         convert_pfn_mfn(level3_ident_pgt);
1790         convert_pfn_mfn(level3_kernel_pgt);
1791
1792         l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1793         l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1794
1795         memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1796         memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1797
1798         l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1799         l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1800         memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1801
1802         /* Set up identity map */
1803         xen_map_identity_early(level2_ident_pgt, max_pfn);
1804
1805         /* Make pagetable pieces RO */
1806         set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1807         set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1808         set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1809         set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1810         set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1811         set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1812
1813         /* Pin down new L4 */
1814         pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1815                           PFN_DOWN(__pa_symbol(init_level4_pgt)));
1816
1817         /* Unpin Xen-provided one */
1818         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1819
1820         /* Switch over */
1821         pgd = init_level4_pgt;
1822
1823         /*
1824          * At this stage there can be no user pgd, and no page
1825          * structure to attach it to, so make sure we just set kernel
1826          * pgd.
1827          */
1828         xen_mc_batch();
1829         __xen_write_cr3(true, __pa(pgd));
1830         xen_mc_issue(PARAVIRT_LAZY_CPU);
1831
1832         memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1833                       __pa(xen_start_info->pt_base +
1834                            xen_start_info->nr_pt_frames * PAGE_SIZE),
1835                       "XEN PAGETABLES");
1836
1837         return pgd;
1838 }
1839 #else   /* !CONFIG_X86_64 */
1840 static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1841
1842 __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1843                                          unsigned long max_pfn)
1844 {
1845         pmd_t *kernel_pmd;
1846
1847         max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1848                                   xen_start_info->nr_pt_frames * PAGE_SIZE +
1849                                   512*1024);
1850
1851         kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1852         memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1853
1854         xen_map_identity_early(level2_kernel_pgt, max_pfn);
1855
1856         memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1857         set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1858                         __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1859
1860         set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1861         set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1862         set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1863
1864         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1865
1866         xen_write_cr3(__pa(swapper_pg_dir));
1867
1868         pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1869
1870         memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1871                       __pa(xen_start_info->pt_base +
1872                            xen_start_info->nr_pt_frames * PAGE_SIZE),
1873                       "XEN PAGETABLES");
1874
1875         return swapper_pg_dir;
1876 }
1877 #endif  /* CONFIG_X86_64 */
1878
1879 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1880 {
1881         pte_t pte;
1882
1883         phys >>= PAGE_SHIFT;
1884
1885         switch (idx) {
1886         case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1887 #ifdef CONFIG_X86_F00F_BUG
1888         case FIX_F00F_IDT:
1889 #endif
1890 #ifdef CONFIG_X86_32
1891         case FIX_WP_TEST:
1892         case FIX_VDSO:
1893 # ifdef CONFIG_HIGHMEM
1894         case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1895 # endif
1896 #else
1897         case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1898 #endif
1899 #ifdef CONFIG_X86_LOCAL_APIC
1900         case FIX_APIC_BASE:     /* maps dummy local APIC */
1901 #endif
1902         case FIX_TEXT_POKE0:
1903         case FIX_TEXT_POKE1:
1904                 /* All local page mappings */
1905                 pte = pfn_pte(phys, prot);
1906                 break;
1907
1908         case FIX_PARAVIRT_BOOTMAP:
1909                 /* This is an MFN, but it isn't an IO mapping from the
1910                    IO domain */
1911                 pte = mfn_pte(phys, prot);
1912                 break;
1913
1914         default:
1915                 /* By default, set_fixmap is used for hardware mappings */
1916                 pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
1917                 break;
1918         }
1919
1920         __native_set_fixmap(idx, pte);
1921
1922 #ifdef CONFIG_X86_64
1923         /* Replicate changes to map the vsyscall page into the user
1924            pagetable vsyscall mapping. */
1925         if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1926                 unsigned long vaddr = __fix_to_virt(idx);
1927                 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1928         }
1929 #endif
1930 }
1931
1932 static __init void xen_post_allocator_init(void)
1933 {
1934         pv_mmu_ops.set_pte = xen_set_pte;
1935         pv_mmu_ops.set_pmd = xen_set_pmd;
1936         pv_mmu_ops.set_pud = xen_set_pud;
1937 #if PAGETABLE_LEVELS == 4
1938         pv_mmu_ops.set_pgd = xen_set_pgd;
1939 #endif
1940
1941         /* This will work as long as patching hasn't happened yet
1942            (which it hasn't) */
1943         pv_mmu_ops.alloc_pte = xen_alloc_pte;
1944         pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1945         pv_mmu_ops.release_pte = xen_release_pte;
1946         pv_mmu_ops.release_pmd = xen_release_pmd;
1947 #if PAGETABLE_LEVELS == 4
1948         pv_mmu_ops.alloc_pud = xen_alloc_pud;
1949         pv_mmu_ops.release_pud = xen_release_pud;
1950 #endif
1951
1952 #ifdef CONFIG_X86_64
1953         SetPagePinned(virt_to_page(level3_user_vsyscall));
1954 #endif
1955         xen_mark_init_mm_pinned();
1956 }
1957
1958 static void xen_leave_lazy_mmu(void)
1959 {
1960         preempt_disable();
1961         xen_mc_flush();
1962         paravirt_leave_lazy_mmu();
1963         preempt_enable();
1964 }
1965
1966 static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1967         .read_cr2 = xen_read_cr2,
1968         .write_cr2 = xen_write_cr2,
1969
1970         .read_cr3 = xen_read_cr3,
1971         .write_cr3 = xen_write_cr3,
1972
1973         .flush_tlb_user = xen_flush_tlb,
1974         .flush_tlb_kernel = xen_flush_tlb,
1975         .flush_tlb_single = xen_flush_tlb_single,
1976         .flush_tlb_others = xen_flush_tlb_others,
1977
1978         .pte_update = paravirt_nop,
1979         .pte_update_defer = paravirt_nop,
1980
1981         .pgd_alloc = xen_pgd_alloc,
1982         .pgd_free = xen_pgd_free,
1983
1984         .alloc_pte = xen_alloc_pte_init,
1985         .release_pte = xen_release_pte_init,
1986         .alloc_pmd = xen_alloc_pmd_init,
1987         .release_pmd = xen_release_pmd_init,
1988
1989         .set_pte = xen_set_pte_init,
1990         .set_pte_at = xen_set_pte_at,
1991         .set_pmd = xen_set_pmd_hyper,
1992
1993         .ptep_modify_prot_start = __ptep_modify_prot_start,
1994         .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1995
1996         .pte_val = PV_CALLEE_SAVE(xen_pte_val),
1997         .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
1998
1999         .make_pte = PV_CALLEE_SAVE(xen_make_pte),
2000         .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
2001
2002 #ifdef CONFIG_X86_PAE
2003         .set_pte_atomic = xen_set_pte_atomic,
2004         .pte_clear = xen_pte_clear,
2005         .pmd_clear = xen_pmd_clear,
2006 #endif  /* CONFIG_X86_PAE */
2007         .set_pud = xen_set_pud_hyper,
2008
2009         .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2010         .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2011
2012 #if PAGETABLE_LEVELS == 4
2013         .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2014         .make_pud = PV_CALLEE_SAVE(xen_make_pud),
2015         .set_pgd = xen_set_pgd_hyper,
2016
2017         .alloc_pud = xen_alloc_pmd_init,
2018         .release_pud = xen_release_pmd_init,
2019 #endif  /* PAGETABLE_LEVELS == 4 */
2020
2021         .activate_mm = xen_activate_mm,
2022         .dup_mmap = xen_dup_mmap,
2023         .exit_mmap = xen_exit_mmap,
2024
2025         .lazy_mode = {
2026                 .enter = paravirt_enter_lazy_mmu,
2027                 .leave = xen_leave_lazy_mmu,
2028         },
2029
2030         .set_fixmap = xen_set_fixmap,
2031 };
2032
2033 void __init xen_init_mmu_ops(void)
2034 {
2035         x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2036         x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2037         pv_mmu_ops = xen_mmu_ops;
2038
2039         vmap_lazy_unmap = false;
2040 }
2041
2042 /* Protected by xen_reservation_lock. */
2043 #define MAX_CONTIG_ORDER 9 /* 2MB */
2044 static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2045
2046 #define VOID_PTE (mfn_pte(0, __pgprot(0)))
2047 static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2048                                 unsigned long *in_frames,
2049                                 unsigned long *out_frames)
2050 {
2051         int i;
2052         struct multicall_space mcs;
2053
2054         xen_mc_batch();
2055         for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2056                 mcs = __xen_mc_entry(0);
2057
2058                 if (in_frames)
2059                         in_frames[i] = virt_to_mfn(vaddr);
2060
2061                 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2062                 set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2063
2064                 if (out_frames)
2065                         out_frames[i] = virt_to_pfn(vaddr);
2066         }
2067         xen_mc_issue(0);
2068 }
2069
2070 /*
2071  * Update the pfn-to-mfn mappings for a virtual address range, either to
2072  * point to an array of mfns, or contiguously from a single starting
2073  * mfn.
2074  */
2075 static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2076                                      unsigned long *mfns,
2077                                      unsigned long first_mfn)
2078 {
2079         unsigned i, limit;
2080         unsigned long mfn;
2081
2082         xen_mc_batch();
2083
2084         limit = 1u << order;
2085         for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2086                 struct multicall_space mcs;
2087                 unsigned flags;
2088
2089                 mcs = __xen_mc_entry(0);
2090                 if (mfns)
2091                         mfn = mfns[i];
2092                 else
2093                         mfn = first_mfn + i;
2094
2095                 if (i < (limit - 1))
2096                         flags = 0;
2097                 else {
2098                         if (order == 0)
2099                                 flags = UVMF_INVLPG | UVMF_ALL;
2100                         else
2101                                 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2102                 }
2103
2104                 MULTI_update_va_mapping(mcs.mc, vaddr,
2105                                 mfn_pte(mfn, PAGE_KERNEL), flags);
2106
2107                 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2108         }
2109
2110         xen_mc_issue(0);
2111 }
2112
2113 /*
2114  * Perform the hypercall to exchange a region of our pfns to point to
2115  * memory with the required contiguous alignment.  Takes the pfns as
2116  * input, and populates mfns as output.
2117  *
2118  * Returns a success code indicating whether the hypervisor was able to
2119  * satisfy the request or not.
2120  */
2121 static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2122                                unsigned long *pfns_in,
2123                                unsigned long extents_out,
2124                                unsigned int order_out,
2125                                unsigned long *mfns_out,
2126                                unsigned int address_bits)
2127 {
2128         long rc;
2129         int success;
2130
2131         struct xen_memory_exchange exchange = {
2132                 .in = {
2133                         .nr_extents   = extents_in,
2134                         .extent_order = order_in,
2135                         .extent_start = pfns_in,
2136                         .domid        = DOMID_SELF
2137                 },
2138                 .out = {
2139                         .nr_extents   = extents_out,
2140                         .extent_order = order_out,
2141                         .extent_start = mfns_out,
2142                         .address_bits = address_bits,
2143                         .domid        = DOMID_SELF
2144                 }
2145         };
2146
2147         BUG_ON(extents_in << order_in != extents_out << order_out);
2148
2149         rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2150         success = (exchange.nr_exchanged == extents_in);
2151
2152         BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2153         BUG_ON(success && (rc != 0));
2154
2155         return success;
2156 }
2157
2158 int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2159                                  unsigned int address_bits)
2160 {
2161         unsigned long *in_frames = discontig_frames, out_frame;
2162         unsigned long  flags;
2163         int            success;
2164
2165         /*
2166          * Currently an auto-translated guest will not perform I/O, nor will
2167          * it require PAE page directories below 4GB. Therefore any calls to
2168          * this function are redundant and can be ignored.
2169          */
2170
2171         if (xen_feature(XENFEAT_auto_translated_physmap))
2172                 return 0;
2173
2174         if (unlikely(order > MAX_CONTIG_ORDER))
2175                 return -ENOMEM;
2176
2177         memset((void *) vstart, 0, PAGE_SIZE << order);
2178
2179         spin_lock_irqsave(&xen_reservation_lock, flags);
2180
2181         /* 1. Zap current PTEs, remembering MFNs. */
2182         xen_zap_pfn_range(vstart, order, in_frames, NULL);
2183
2184         /* 2. Get a new contiguous memory extent. */
2185         out_frame = virt_to_pfn(vstart);
2186         success = xen_exchange_memory(1UL << order, 0, in_frames,
2187                                       1, order, &out_frame,
2188                                       address_bits);
2189
2190         /* 3. Map the new extent in place of old pages. */
2191         if (success)
2192                 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2193         else
2194                 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2195
2196         spin_unlock_irqrestore(&xen_reservation_lock, flags);
2197
2198         return success ? 0 : -ENOMEM;
2199 }
2200 EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2201
2202 void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2203 {
2204         unsigned long *out_frames = discontig_frames, in_frame;
2205         unsigned long  flags;
2206         int success;
2207
2208         if (xen_feature(XENFEAT_auto_translated_physmap))
2209                 return;
2210
2211         if (unlikely(order > MAX_CONTIG_ORDER))
2212                 return;
2213
2214         memset((void *) vstart, 0, PAGE_SIZE << order);
2215
2216         spin_lock_irqsave(&xen_reservation_lock, flags);
2217
2218         /* 1. Find start MFN of contiguous extent. */
2219         in_frame = virt_to_mfn(vstart);
2220
2221         /* 2. Zap current PTEs. */
2222         xen_zap_pfn_range(vstart, order, NULL, out_frames);
2223
2224         /* 3. Do the exchange for non-contiguous MFNs. */
2225         success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2226                                         0, out_frames, 0);
2227
2228         /* 4. Map new pages in place of old pages. */
2229         if (success)
2230                 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2231         else
2232                 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2233
2234         spin_unlock_irqrestore(&xen_reservation_lock, flags);
2235 }
2236 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2237
2238 #ifdef CONFIG_XEN_PVHVM
2239 static void xen_hvm_exit_mmap(struct mm_struct *mm)
2240 {
2241         struct xen_hvm_pagetable_dying a;
2242         int rc;
2243
2244         a.domid = DOMID_SELF;
2245         a.gpa = __pa(mm->pgd);
2246         rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2247         WARN_ON_ONCE(rc < 0);
2248 }
2249
2250 static int is_pagetable_dying_supported(void)
2251 {
2252         struct xen_hvm_pagetable_dying a;
2253         int rc = 0;
2254
2255         a.domid = DOMID_SELF;
2256         a.gpa = 0x00;
2257         rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2258         if (rc < 0) {
2259                 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2260                 return 0;
2261         }
2262         return 1;
2263 }
2264
2265 void __init xen_hvm_init_mmu_ops(void)
2266 {
2267         if (is_pagetable_dying_supported())
2268                 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2269 }
2270 #endif
2271
2272 #ifdef CONFIG_XEN_DEBUG_FS
2273
2274 static struct dentry *d_mmu_debug;
2275
2276 static int __init xen_mmu_debugfs(void)
2277 {
2278         struct dentry *d_xen = xen_init_debugfs();
2279
2280         if (d_xen == NULL)
2281                 return -ENOMEM;
2282
2283         d_mmu_debug = debugfs_create_dir("mmu", d_xen);
2284
2285         debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
2286
2287         debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
2288         debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
2289                            &mmu_stats.pgd_update_pinned);
2290         debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
2291                            &mmu_stats.pgd_update_pinned);
2292
2293         debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2294         debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2295                            &mmu_stats.pud_update_pinned);
2296         debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2297                            &mmu_stats.pud_update_pinned);
2298
2299         debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2300         debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2301                            &mmu_stats.pmd_update_pinned);
2302         debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2303                            &mmu_stats.pmd_update_pinned);
2304
2305         debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2306 //      debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2307 //                         &mmu_stats.pte_update_pinned);
2308         debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2309                            &mmu_stats.pte_update_pinned);
2310
2311         debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2312         debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2313                            &mmu_stats.mmu_update_extended);
2314         xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2315                                      mmu_stats.mmu_update_histo, 20);
2316
2317         debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
2318         debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
2319                            &mmu_stats.set_pte_at_batched);
2320         debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
2321                            &mmu_stats.set_pte_at_current);
2322         debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
2323                            &mmu_stats.set_pte_at_kernel);
2324
2325         debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2326         debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2327                            &mmu_stats.prot_commit_batched);
2328
2329         return 0;
2330 }
2331 fs_initcall(xen_mmu_debugfs);
2332
2333 #endif  /* CONFIG_XEN_DEBUG_FS */