arch/x86/xen/mmu.c

   1 /*
   2  * Xen mmu operations
   3  *
   4  * This file contains the various mmu fetch and update operations.
   5  * The most important job they must perform is the mapping between the
   6  * domain's pfn and the overall machine mfns.
   7  *
   8  * Xen allows guests to directly update the pagetable, in a controlled
   9  * fashion.  In other words, the guest modifies the same pagetable
  10  * that the CPU actually uses, which eliminates the overhead of having
  11  * a separate shadow pagetable.
  12  *
  13  * In order to allow this, it falls on the guest domain to map its
  14  * notion of a "physical" pfn - which is just a domain-local linear
  15  * address - into a real "machine address" which the CPU's MMU can
  16  * use.
  17  *
  18  * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
  19  * inserted directly into the pagetable.  When creating a new
  20  * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
  21  * when reading the content back with __(pgd|pmd|pte)_val, it converts
  22  * the mfn back into a pfn.
  23  *
  24  * The other constraint is that all pages which make up a pagetable
  25  * must be mapped read-only in the guest.  This prevents uncontrolled
  26  * guest updates to the pagetable.  Xen strictly enforces this, and
  27  * will disallow any pagetable update which will end up mapping a
  28  * pagetable page RW, and will disallow using any writable page as a
  29  * pagetable.
  30  *
  31  * Naively, when loading %cr3 with the base of a new pagetable, Xen
  32  * would need to validate the whole pagetable before going on.
  33  * Naturally, this is quite slow.  The solution is to "pin" a
  34  * pagetable, which enforces all the constraints on the pagetable even
  35  * when it is not actively in use.  This means that Xen can be assured
  36  * that it is still valid when you do load it into %cr3, and doesn't
  37  * need to revalidate it.
  38  *
  39  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  40  */
  41 #include <linux/sched.h>
  42 #include <linux/highmem.h>
  43 #include <linux/debugfs.h>
  44 #include <linux/bug.h>
  45 #include <linux/vmalloc.h>
  46 #include <linux/module.h>
  47 #include <linux/gfp.h>
  48 #include <linux/memblock.h>
  49 #include <linux/seq_file.h>
  50
  51 #include <asm/pgtable.h>
  52 #include <asm/tlbflush.h>
  53 #include <asm/fixmap.h>
  54 #include <asm/mmu_context.h>
  55 #include <asm/setup.h>
  56 #include <asm/paravirt.h>
  57 #include <asm/e820.h>
  58 #include <asm/linkage.h>
  59 #include <asm/page.h>
  60 #include <asm/init.h>
  61 #include <asm/pat.h>
  62
  63 #include <asm/xen/hypercall.h>
  64 #include <asm/xen/hypervisor.h>
  65
  66 #include <xen/xen.h>
  67 #include <xen/page.h>
  68 #include <xen/interface/xen.h>
  69 #include <xen/interface/hvm/hvm_op.h>
  70 #include <xen/interface/version.h>
  71 #include <xen/interface/memory.h>
  72 #include <xen/hvc-console.h>
  73
  74 #include "multicalls.h"
  75 #include "mmu.h"
  76 #include "debugfs.h"
  77
  78 #define MMU_UPDATE_HISTO        30      /* number of slots in mmu_stats.mmu_update_histo[] */
  79
  80 /*
  81  * Protects atomic reservation decrease/increase against concurrent increases.
  82  * Also protects non-atomic updates of current_pages and balloon lists.
  83  */
  84 DEFINE_SPINLOCK(xen_reservation_lock);
  85
  86 #ifdef CONFIG_XEN_DEBUG_FS
  87
  88 static struct {
  89         u32 pgd_update;
  90         u32 pgd_update_pinned;
  91         u32 pgd_update_batched;
  92
  93         u32 pud_update;
  94         u32 pud_update_pinned;
  95         u32 pud_update_batched;
  96
  97         u32 pmd_update;
  98         u32 pmd_update_pinned;
  99         u32 pmd_update_batched;
 100
 101         u32 pte_update;
 102         u32 pte_update_pinned;
 103         u32 pte_update_batched;
 104
 105         u32 mmu_update;
 106         u32 mmu_update_extended;
 107         u32 mmu_update_histo[MMU_UPDATE_HISTO];
 108
 109         u32 prot_commit;
 110         u32 prot_commit_batched;
 111 } mmu_stats;
 112
 113 static u8 zero_stats;
 114
 115 static inline void check_zero(void)
 116 {
 117         if (unlikely(zero_stats)) {
 118                 memset(&mmu_stats, 0, sizeof(mmu_stats));
 119                 zero_stats = 0;
 120         }
 121 }
 122
 123 #define ADD_STATS(elem, val)                    \
 124         do { check_zero(); mmu_stats.elem += (val); } while(0)
 125
 126 #else  /* !CONFIG_XEN_DEBUG_FS */
 127
 128 #define ADD_STATS(elem, val)    do { (void)(val); } while(0)
 129
 130 #endif /* CONFIG_XEN_DEBUG_FS */
 131
 132
 133 /*
 134  * Identity map, in addition to plain kernel map.  This needs to be
 135  * large enough to cover the page table pages needed to map the rest.
 136  * Each pte page can map 2MB; room for four pages of entries is reserved.
 137  */
 138 #define LEVEL1_IDENT_ENTRIES    (PTRS_PER_PTE * 4)
 139 static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
 140
 141 #ifdef CONFIG_X86_64
 142 /* l3 pud for userspace vsyscall mapping (one full, page-aligned pud table) */
 143 static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
 144 #endif /* CONFIG_X86_64 */
 145
 146 /*
 147  * Note about cr3 (pagetable base) values:
 148  *
 149  * xen_cr3 contains the current logical cr3 value; it contains the
 150  * last set cr3.  This may not be the current effective cr3, because
 151  * its update may be being lazily deferred.  However, a vcpu looking
 152  * at its own cr3 can use this value knowing that everything will
 153  * be self-consistent.
 154  *
 155  * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 156  * hypercall to set the vcpu cr3 is complete (so it may be a little
 157  * out of date, but it will never be set early).  If one vcpu is
 158  * looking at another vcpu's cr3 value, it should use this variable.
 159  */
 160 DEFINE_PER_CPU(unsigned long, xen_cr3);  /* cr3 stored as physaddr */
 161 DEFINE_PER_CPU(unsigned long, xen_current_cr3);  /* actual vcpu cr3 */
 162
 163
 164 /*
 165  * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 166  * redzone above it, so round it up to a PGD boundary.  pgd_index() of
 167  * this value is used below as the end of the user part of a pgd. */
 168 #define USER_LIMIT      ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
 169
 170 unsigned long arbitrary_virt_to_mfn(void *vaddr)
 171 {
 172         xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
 173
 174         return PFN_DOWN(maddr.maddr);
 175 }
 176
 177 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 178 {
 179         unsigned long address = (unsigned long)vaddr;
 180         unsigned int level;
 181         pte_t *pte;
 182         unsigned offset;
 183
 184         /*
 185          * if the PFN is in the linear mapped vaddr range, we can just use
 186          * the (quick) virt_to_machine() p2m lookup
 187          */
 188         if (virt_addr_valid(vaddr))
 189                 return virt_to_machine(vaddr);
 190
 191         /* otherwise we have to do a (slower) full page-table walk */
 192
 193         pte = lookup_address(address, &level);
 194         BUG_ON(pte == NULL);
 195         offset = address & ~PAGE_MASK;
 196         return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
 197 }
 198 EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
 199
 200 void make_lowmem_page_readonly(void *vaddr)
 201 {
 202         pte_t *pte, ptev;
 203         unsigned long address = (unsigned long)vaddr;
 204         unsigned int level;
 205
 206         pte = lookup_address(address, &level);
 207         if (pte == NULL)
 208                 return;         /* vaddr missing */
 209
 210         ptev = pte_wrprotect(*pte);
 211
 212         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 213                 BUG();
 214 }
 215
 216 void make_lowmem_page_readwrite(void *vaddr)
 217 {
 218         pte_t *pte, ptev;
 219         unsigned long address = (unsigned long)vaddr;
 220         unsigned int level;
 221
 222         pte = lookup_address(address, &level);
 223         if (pte == NULL)
 224                 return;         /* vaddr missing */
 225
 226         ptev = pte_mkwrite(*pte);
 227
 228         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 229                 BUG();
 230 }
 231
 232
 233 static bool xen_page_pinned(void *ptr)
 234 {
 235         struct page *page = virt_to_page(ptr);
 236
 237         return PagePinned(page);
 238 }
 239
 240 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
 241 {
 242         struct multicall_space mcs;
 243         struct mmu_update *u;
 244
 245         mcs = xen_mc_entry(sizeof(*u));
 246         u = mcs.args;
 247
 248         /* ptep might be kmapped when using 32-bit HIGHPTE */
 249         u->ptr = virt_to_machine(ptep).maddr;
 250         u->val = pte_val_ma(pteval);
 251
 252         MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
 253
 254         xen_mc_issue(PARAVIRT_LAZY_MMU);
 255 }
 256 EXPORT_SYMBOL_GPL(xen_set_domain_pte);
 257
 258 static void xen_extend_mmu_update(const struct mmu_update *update)
 259 {
 260         struct multicall_space mcs;
 261         struct mmu_update *u;
 262
 263         mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
 264
 265         if (mcs.mc != NULL) {
 266                 ADD_STATS(mmu_update_extended, 1);
 267                 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
 268
 269                 mcs.mc->args[1]++;
 270
 271                 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
 272                         ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
 273                 else
 274                         ADD_STATS(mmu_update_histo[0], 1);
 275         } else {
 276                 ADD_STATS(mmu_update, 1);
 277                 mcs = __xen_mc_entry(sizeof(*u));
 278                 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 279                 ADD_STATS(mmu_update_histo[1], 1);
 280         }
 281
 282         u = mcs.args;
 283         *u = *update;
 284 }
 285
 286 static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 287 {
 288         struct mmu_update u;
 289
 290         preempt_disable();
 291
 292         xen_mc_batch();
 293
 294         /* ptr may be ioremapped for 64-bit pagetable setup */
 295         u.ptr = virt_to_machine(ptr).maddr;
 296         u.val = pmd_val_ma(val);
 297         xen_extend_mmu_update(&u);
 298
 299         ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 300
 301         xen_mc_issue(PARAVIRT_LAZY_MMU);
 302
 303         preempt_enable();
 304 }
 305
 306 static void xen_set_pmd(pmd_t *ptr, pmd_t val)
 307 {
 308         ADD_STATS(pmd_update, 1);
 309
 310         /* If page is not pinned, we can just update the entry
 311            directly */
 312         if (!xen_page_pinned(ptr)) {
 313                 *ptr = val;
 314                 return;
 315         }
 316
 317         ADD_STATS(pmd_update_pinned, 1);
 318
 319         xen_set_pmd_hyper(ptr, val);
 320 }
 321
 322 /*
 323  * Associate a virtual page frame with a given physical page frame
 324  * and protection flags for that frame.
 325  */
 326 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 327 {
 328         set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
 329 }
 330
 331 static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
 332 {
 333         struct mmu_update u;
 334
 335         if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
 336                 return false;
 337
 338         xen_mc_batch();
 339
 340         u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
 341         u.val = pte_val_ma(pteval);
 342         xen_extend_mmu_update(&u);
 343
 344         xen_mc_issue(PARAVIRT_LAZY_MMU);
 345
 346         return true;
 347 }
 348
 349 static void xen_set_pte(pte_t *ptep, pte_t pteval)
 350 {
 351         ADD_STATS(pte_update, 1);
 352 //      ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
 353
 354         if (!xen_batched_set_pte(ptep, pteval))
 355                 native_set_pte(ptep, pteval);
 356 }
 357
 358 static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 359                     pte_t *ptep, pte_t pteval)
 360 {
 361         xen_set_pte(ptep, pteval);
 362 }
 363
 364 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
 365                                  unsigned long addr, pte_t *ptep)
 366 {
 367         /* Just return the pte as-is.  We preserve the bits on commit */
 368         return *ptep;
 369 }
 370
 371 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 372                                  pte_t *ptep, pte_t pte)
 373 {
 374         struct mmu_update u;
 375
 376         xen_mc_batch();
 377
 378         u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
 379         u.val = pte_val_ma(pte);
 380         xen_extend_mmu_update(&u);
 381
 382         ADD_STATS(prot_commit, 1);
 383         ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 384
 385         xen_mc_issue(PARAVIRT_LAZY_MMU);
 386 }
 387
 388 /* Assume pteval_t is equivalent to all the other *val_t types. */
 389 static pteval_t pte_mfn_to_pfn(pteval_t val)
 390 {
 391         if (val & _PAGE_PRESENT) {
 392                 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 393                 pteval_t flags = val & PTE_FLAGS_MASK;
 394                 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
 395         }
 396
 397         return val;
 398 }
 399
 400 static pteval_t pte_pfn_to_mfn(pteval_t val)
 401 {
 402         if (val & _PAGE_PRESENT) {
 403                 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 404                 pteval_t flags = val & PTE_FLAGS_MASK;
 405                 unsigned long mfn;
 406
 407                 if (!xen_feature(XENFEAT_auto_translated_physmap))
 408                         mfn = get_phys_to_machine(pfn);
 409                 else
 410                         mfn = pfn;
 411                 /*
 412                  * If there's no mfn for the pfn, then just create an
 413                  * empty non-present pte.  Unfortunately this loses
 414                  * information about the original pfn, so
 415                  * pte_mfn_to_pfn is asymmetric.
 416                  */
 417                 if (unlikely(mfn == INVALID_P2M_ENTRY)) {
 418                         mfn = 0;
 419                         flags = 0;
 420                 } else {
 421                         /*
 422                          * Paramount to do this test _after_ the
 423                          * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
 424                          * IDENTITY_FRAME_BIT resolves to true.
 425                          */
 426                         mfn &= ~FOREIGN_FRAME_BIT;
 427                         if (mfn & IDENTITY_FRAME_BIT) {
 428                                 mfn &= ~IDENTITY_FRAME_BIT;
 429                                 flags |= _PAGE_IOMAP;
 430                         }
 431                 }
 432                 val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
 433         }
 434
 435         return val;
 436 }
 437
 438 static pteval_t iomap_pte(pteval_t val)
 439 {
 440         if (val & _PAGE_PRESENT) {
 441                 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 442                 pteval_t flags = val & PTE_FLAGS_MASK;
 443
 444                 /* We assume the pte frame number is a MFN, so
 445                    just use it as-is. */
 446                 val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
 447         }
 448
 449         return val;
 450 }
 451
 452 static pteval_t xen_pte_val(pte_t pte)
 453 {
 454         pteval_t pteval = pte.pte;
 455
 456         /* If this is a WC pte, convert back from Xen WC to Linux WC */
 457         if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
 458                 WARN_ON(!pat_enabled);
 459                 pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
 460         }
 461
 462         if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
 463                 return pteval;
 464
 465         return pte_mfn_to_pfn(pteval);
 466 }
 467 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
 468
 469 static pgdval_t xen_pgd_val(pgd_t pgd)
 470 {
 471         return pte_mfn_to_pfn(pgd.pgd);
 472 }
 473 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
 474
 475 /*
 476  * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
 477  * are reserved for now, to correspond to the Intel-reserved PAT
 478  * types.
 479  *
 480  * We expect Linux's PAT set as follows:
 481  *
 482  * Idx  PTE flags        Linux    Xen    Default
 483  * 0                     WB       WB     WB
 484  * 1            PWT      WC       WT     WT
 485  * 2        PCD          UC-      UC-    UC-
 486  * 3        PCD PWT      UC       UC     UC
 487  * 4    PAT              WB       WC     WB
 488  * 5    PAT     PWT      WC       WP     WT
 489  * 6    PAT PCD          UC-      UC     UC-
 490  * 7    PAT PCD PWT      UC       UC     UC
 491  */
 492
 493 void xen_set_pat(u64 pat)
 494 {
 495         /* We expect Linux to use a PAT setting of
 496          * UC UC- WC WB (ignoring the PAT flag) */
 497         WARN_ON(pat != 0x0007010600070106ull);
 498 }
 499
 500 static pte_t xen_make_pte(pteval_t pte)
 501 {
 502         phys_addr_t addr = (pte & PTE_PFN_MASK);
 503
 504         /* If Linux is trying to set a WC pte, then map to the Xen WC.
 505          * If _PAGE_PAT is set, then it probably means it is really
 506          * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
 507          * things work out OK...
 508          *
 509          * (We should never see kernel mappings with _PAGE_PSE set,
 510          * but we could see hugetlbfs mappings, I think.).
 511          */
 512         if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
 513                 if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
 514                         pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
 515         }
 516
 517         /*
 518          * Unprivileged domains are allowed to do IOMAPpings for
 519          * PCI passthrough, but not map ISA space.  The ISA
 520          * mappings are just dummy local mappings to keep other
 521          * parts of the kernel happy.
 522          */
 523         if (unlikely(pte & _PAGE_IOMAP) &&
 524             (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
 525                 pte = iomap_pte(pte);
 526         } else {
 527                 pte &= ~_PAGE_IOMAP;
 528                 pte = pte_pfn_to_mfn(pte);
 529         }
 530
 531         return native_make_pte(pte);
 532 }
 533 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 534
 535 #ifdef CONFIG_XEN_DEBUG
 536 pte_t xen_make_pte_debug(pteval_t pte)
 537 {
 538         phys_addr_t addr = (pte & PTE_PFN_MASK);
 539         phys_addr_t other_addr;
 540         bool io_page = false;
 541         pte_t _pte;
 542
 543         if (pte & _PAGE_IOMAP)
 544                 io_page = true;
 545
 546         _pte = xen_make_pte(pte);
 547
 548         if (!addr)
 549                 return _pte;
 550
 551         if (io_page &&
 552             (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
 553                 other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
 554                 WARN_ONCE(addr != other_addr,
 555                         "0x%lx is using VM_IO, but it is 0x%lx!\n",
 556                         (unsigned long)addr, (unsigned long)other_addr);
 557         } else {
 558                 pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
 559                 other_addr = (_pte.pte & PTE_PFN_MASK);
 560                 WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
 561                         "0x%lx is missing VM_IO (and wasn't fixed)!\n",
 562                         (unsigned long)addr);
 563         }
 564
 565         return _pte;
 566 }
 567 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
 568 #endif
 569
 570 static pgd_t xen_make_pgd(pgdval_t pgd)
 571 {
 572         pgd = pte_pfn_to_mfn(pgd);
 573         return native_make_pgd(pgd);
 574 }
 575 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
 576
 577 static pmdval_t xen_pmd_val(pmd_t pmd)
 578 {
 579         return pte_mfn_to_pfn(pmd.pmd);
 580 }
 581 PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
 582
 583 static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 584 {
 585         struct mmu_update u;
 586
 587         preempt_disable();
 588
 589         xen_mc_batch();
 590
 591         /* ptr may be ioremapped for 64-bit pagetable setup */
 592         u.ptr = virt_to_machine(ptr).maddr;
 593         u.val = pud_val_ma(val);
 594         xen_extend_mmu_update(&u);
 595
 596         ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 597
 598         xen_mc_issue(PARAVIRT_LAZY_MMU);
 599
 600         preempt_enable();
 601 }
 602
 603 static void xen_set_pud(pud_t *ptr, pud_t val)
 604 {
 605         ADD_STATS(pud_update, 1);
 606
 607         /* If page is not pinned, we can just update the entry
 608            directly */
 609         if (!xen_page_pinned(ptr)) {
 610                 *ptr = val;
 611                 return;
 612         }
 613
 614         ADD_STATS(pud_update_pinned, 1);
 615
 616         xen_set_pud_hyper(ptr, val);
 617 }
 618
 619 #ifdef CONFIG_X86_PAE
 620 static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 621 {
 622         set_64bit((u64 *)ptep, native_pte_val(pte));
 623 }
 624
 625 static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 626 {
 627         if (!xen_batched_set_pte(ptep, native_make_pte(0)))
 628                 native_pte_clear(mm, addr, ptep);
 629 }
 630
 631 static void xen_pmd_clear(pmd_t *pmdp)
 632 {
 633         set_pmd(pmdp, __pmd(0));
 634 }
 635 #endif  /* CONFIG_X86_PAE */
 636
 637 static pmd_t xen_make_pmd(pmdval_t pmd)
 638 {
 639         pmd = pte_pfn_to_mfn(pmd);
 640         return native_make_pmd(pmd);
 641 }
 642 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 643
 644 #if PAGETABLE_LEVELS == 4
 645 static pudval_t xen_pud_val(pud_t pud)
 646 {
 647         return pte_mfn_to_pfn(pud.pud);
 648 }
 649 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
 650
 651 static pud_t xen_make_pud(pudval_t pud)
 652 {
 653         pud = pte_pfn_to_mfn(pud);
 654
 655         return native_make_pud(pud);
 656 }
 657 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
 658
 659 static pgd_t *xen_get_user_pgd(pgd_t *pgd)
 660 {
 661         pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
 662         unsigned offset = pgd - pgd_page;
 663         pgd_t *user_ptr = NULL;
 664
 665         if (offset < pgd_index(USER_LIMIT)) {
 666                 struct page *page = virt_to_page(pgd_page);
 667                 user_ptr = (pgd_t *)page->private;
 668                 if (user_ptr)
 669                         user_ptr += offset;
 670         }
 671
 672         return user_ptr;
 673 }
 674
 675 static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 676 {
 677         struct mmu_update u;
 678
 679         u.ptr = virt_to_machine(ptr).maddr;
 680         u.val = pgd_val_ma(val);
 681         xen_extend_mmu_update(&u);
 682 }
 683
 684 /*
 685  * Raw hypercall-based set_pgd, intended for in early boot before
 686  * there's a page structure.  This implies:
 687  *  1. The only existing pagetable is the kernel's
 688  *  2. It is always pinned
 689  *  3. It has no user pagetable attached to it
 690  */
 691 static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 692 {
 693         preempt_disable();
 694
 695         xen_mc_batch();
 696
 697         __xen_set_pgd_hyper(ptr, val);
 698
 699         xen_mc_issue(PARAVIRT_LAZY_MMU);
 700
 701         preempt_enable();
 702 }
 703
 704 static void xen_set_pgd(pgd_t *ptr, pgd_t val)
 705 {
 706         pgd_t *user_ptr = xen_get_user_pgd(ptr);
 707
 708         ADD_STATS(pgd_update, 1);
 709
 710         /* If page is not pinned, we can just update the entry
 711            directly */
 712         if (!xen_page_pinned(ptr)) {
 713                 *ptr = val;
 714                 if (user_ptr) {
 715                         WARN_ON(xen_page_pinned(user_ptr));
 716                         *user_ptr = val;
 717                 }
 718                 return;
 719         }
 720
 721         ADD_STATS(pgd_update_pinned, 1);
 722         ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 723
 724         /* If it's pinned, then we can at least batch the kernel and
 725            user updates together. */
 726         xen_mc_batch();
 727
 728         __xen_set_pgd_hyper(ptr, val);
 729         if (user_ptr)
 730                 __xen_set_pgd_hyper(user_ptr, val);
 731
 732         xen_mc_issue(PARAVIRT_LAZY_MMU);
 733 }
 734 #endif  /* PAGETABLE_LEVELS == 4 */
 735
 736 /*
 737  * (Yet another) pagetable walker.  This one is intended for pinning a
 738  * pagetable.  This means that it walks a pagetable and calls the
 739  * callback function on each page it finds making up the page table,
 740  * at every level.  It walks the entire pagetable, but it only bothers
 741  * pinning pte pages which are below limit.  In the normal case this
 742  * will be STACK_TOP_MAX, but at boot we need to pin up to
 743  * FIXADDR_TOP.
 744  *
 745  * For 32-bit the important bit is that we don't pin beyond there,
 746  * because then we start getting into Xen's ptes.
 747  *
 748  * For 64-bit, we must skip the Xen hole in the middle of the address
 749  * space, just after the big x86-64 virtual hole.
 750  */
 751 static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
 752                           int (*func)(struct mm_struct *mm, struct page *,
 753                                       enum pt_level),
 754                           unsigned long limit)
 755 {
 756         int flush = 0;
 757         unsigned hole_low, hole_high;
 758         unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
 759         unsigned pgdidx, pudidx, pmdidx;
 760
 761         /* The limit is the last byte to be touched */
 762         limit--;
 763         BUG_ON(limit >= FIXADDR_TOP);
 764
 765         if (xen_feature(XENFEAT_auto_translated_physmap))
 766                 return 0;
 767
 768         /*
 769          * 64-bit has a great big hole in the middle of the address
 770          * space, which contains the Xen mappings.  On 32-bit these
 771          * will end up making a zero-sized hole and so is a no-op.
 772          */
 773         hole_low = pgd_index(USER_LIMIT);
 774         hole_high = pgd_index(PAGE_OFFSET);
 775
 776         pgdidx_limit = pgd_index(limit);
 777 #if PTRS_PER_PUD > 1
 778         pudidx_limit = pud_index(limit);
 779 #else
 780         pudidx_limit = 0;
 781 #endif
 782 #if PTRS_PER_PMD > 1
 783         pmdidx_limit = pmd_index(limit);
 784 #else
 785         pmdidx_limit = 0;
 786 #endif
 787
 788         for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
 789                 pud_t *pud;
 790
 791                 if (pgdidx >= hole_low && pgdidx < hole_high)
 792                         continue;
 793
 794                 if (!pgd_val(pgd[pgdidx]))
 795                         continue;
 796
 797                 pud = pud_offset(&pgd[pgdidx], 0);
 798
 799                 if (PTRS_PER_PUD > 1) /* not folded */
 800                         flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
 801
 802                 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
 803                         pmd_t *pmd;
 804
 805                         if (pgdidx == pgdidx_limit &&
 806                             pudidx > pudidx_limit)
 807                                 goto out;
 808
 809                         if (pud_none(pud[pudidx]))
 810                                 continue;
 811
 812                         pmd = pmd_offset(&pud[pudidx], 0);
 813
 814                         if (PTRS_PER_PMD > 1) /* not folded */
 815                                 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
 816
 817                         for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
 818                                 struct page *pte;
 819
 820                                 if (pgdidx == pgdidx_limit &&
 821                                     pudidx == pudidx_limit &&
 822                                     pmdidx > pmdidx_limit)
 823                                         goto out;
 824
 825                                 if (pmd_none(pmd[pmdidx]))
 826                                         continue;
 827
 828                                 pte = pmd_page(pmd[pmdidx]);
 829                                 flush |= (*func)(mm, pte, PT_PTE);
 830                         }
 831                 }
 832         }
 833
 834 out:
 835         /* Do the top level last, so that the callbacks can use it as
 836            a cue to do final things like tlb flushes. */
 837         flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
 838
 839         return flush;
 840 }
 841
 842 static int xen_pgd_walk(struct mm_struct *mm,
 843                         int (*func)(struct mm_struct *mm, struct page *,
 844                                     enum pt_level),
 845                         unsigned long limit)
 846 {
 847         return __xen_pgd_walk(mm, mm->pgd, func, limit);
 848 }
 849
 850 /* If we're using split pte locks, then take the page's lock and
 851    return a pointer to it.  Otherwise return NULL. */
 852 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
 853 {
 854         spinlock_t *ptl = NULL;
 855
 856 #if USE_SPLIT_PTLOCKS
 857         ptl = __pte_lockptr(page);
 858         spin_lock_nest_lock(ptl, &mm->page_table_lock);
 859 #endif
 860
 861         return ptl;
 862 }
 863
 864 static void xen_pte_unlock(void *v)
 865 {
 866         spinlock_t *ptl = v;
 867         spin_unlock(ptl);
 868 }
 869
 870 static void xen_do_pin(unsigned level, unsigned long pfn)
 871 {
 872         struct mmuext_op *op;
 873         struct multicall_space mcs;
 874
 875         mcs = __xen_mc_entry(sizeof(*op));
 876         op = mcs.args;
 877         op->cmd = level;
 878         op->arg1.mfn = pfn_to_mfn(pfn);
 879         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 880 }
 881
 882 static int xen_pin_page(struct mm_struct *mm, struct page *page,
 883                         enum pt_level level)
 884 {
 885         unsigned pgfl = TestSetPagePinned(page);
 886         int flush;
 887
 888         if (pgfl)
 889                 flush = 0;              /* already pinned */
 890         else if (PageHighMem(page))
 891                 /* kmaps need flushing if we found an unpinned
 892                    highpage */
 893                 flush = 1;
 894         else {
 895                 void *pt = lowmem_page_address(page);
 896                 unsigned long pfn = page_to_pfn(page);
 897                 struct multicall_space mcs = __xen_mc_entry(0);
 898                 spinlock_t *ptl;
 899
 900                 flush = 0;
 901
 902                 /*
 903                  * We need to hold the pagetable lock between the time
 904                  * we make the pagetable RO and when we actually pin
 905                  * it.  If we don't, then other users may come in and
 906                  * attempt to update the pagetable by writing it,
 907                  * which will fail because the memory is RO but not
 908                  * pinned, so Xen won't do the trap'n'emulate.
 909                  *
 910                  * If we're using split pte locks, we can't hold the
 911                  * entire pagetable's worth of locks during the
 912                  * traverse, because we may wrap the preempt count (8
 913                  * bits).  The solution is to mark RO and pin each PTE
 914                  * page while holding the lock.  This means the number
 915                  * of locks we end up holding is never more than a
 916                  * batch size (~32 entries, at present).
 917                  *
 918                  * If we're not using split pte locks, we needn't pin
 919                  * the PTE pages independently, because we're
 920                  * protected by the overall pagetable lock.
 921                  */
 922                 ptl = NULL;
 923                 if (level == PT_PTE)
 924                         ptl = xen_pte_lock(page, mm);
 925
 926                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 927                                         pfn_pte(pfn, PAGE_KERNEL_RO),
 928                                         level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 929
 930                 if (ptl) {
 931                         xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
 932
 933                         /* Queue a deferred unlock for when this batch
 934                            is completed. */
 935                         xen_mc_callback(xen_pte_unlock, ptl);
 936                 }
 937         }
 938
 939         return flush;
 940 }
 941
 942 /* This is called just after a mm has been created, but it has not
 943    been used yet.  We need to make sure that its pagetable is all
 944    read-only, and can be pinned. */
 945 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
 946 {
 947         xen_mc_batch();
 948
 949         if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
 950                 /* re-enable interrupts for flushing */
 951                 xen_mc_issue(0);
 952
 953                 kmap_flush_unused();
 954
 955                 xen_mc_batch();
 956         }
 957
 958 #ifdef CONFIG_X86_64
 959         {
 960                 pgd_t *user_pgd = xen_get_user_pgd(pgd);
 961
 962                 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
 963
 964                 if (user_pgd) {
 965                         xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
 966                         xen_do_pin(MMUEXT_PIN_L4_TABLE,
 967                                    PFN_DOWN(__pa(user_pgd)));
 968                 }
 969         }
 970 #else /* CONFIG_X86_32 */
 971 #ifdef CONFIG_X86_PAE
 972         /* Need to make sure unshared kernel PMD is pinnable */
 973         xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
 974                      PT_PMD);
 975 #endif
 976         xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
 977 #endif /* CONFIG_X86_64 */
 978         xen_mc_issue(0);
 979 }
 980
 981 static void xen_pgd_pin(struct mm_struct *mm)
 982 {
 983         __xen_pgd_pin(mm, mm->pgd);
 984 }
 985
 986 /*
 987  * On save, we need to pin all pagetables to make sure they get their
 988  * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 989  * them (unpinned pgds are not currently in use, probably because the
 990  * process is under construction or destruction).
 991  *
 992  * Expected to be called in stop_machine() ("equivalent to taking
 993  * every spinlock in the system"), so the locking doesn't really
 994  * matter all that much.
 995  */
 996 void xen_mm_pin_all(void)
 997 {
 998         struct page *page;
 999
1000         spin_lock(&pgd_lock);
1001
1002         list_for_each_entry(page, &pgd_list, lru) {
1003                 if (!PagePinned(page)) {
1004                         __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
1005                         SetPageSavePinned(page);
1006                 }
1007         }
1008
1009         spin_unlock(&pgd_lock);
1010 }
1011
1012 /*
1013  * The init_mm pagetable is really pinned as soon as its created, but
1014  * that's before we have page structures to store the bits.  So do all
1015  * the book-keeping now.
1016  */
1017 static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
1018                                   enum pt_level level)
1019 {
1020         SetPagePinned(page);
1021         return 0;
1022 }
1023
1024 static void __init xen_mark_init_mm_pinned(void)
1025 {
1026         xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
1027 }
1028
1029 static int xen_unpin_page(struct mm_struct *mm, struct page *page,
1030                           enum pt_level level)
1031 {
1032         unsigned pgfl = TestClearPagePinned(page);
1033
1034         if (pgfl && !PageHighMem(page)) {
1035                 void *pt = lowmem_page_address(page);
1036                 unsigned long pfn = page_to_pfn(page);
1037                 spinlock_t *ptl = NULL;
1038                 struct multicall_space mcs;
1039
1040                 /*
1041                  * Do the converse to pin_page.  If we're using split
1042                  * pte locks, we must be holding the lock for while
1043                  * the pte page is unpinned but still RO to prevent
1044                  * concurrent updates from seeing it in this
1045                  * partially-pinned state.
1046                  */
1047                 if (level == PT_PTE) {
1048                         ptl = xen_pte_lock(page, mm);
1049
1050                         if (ptl)
1051                                 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
1052                 }
1053
1054                 mcs = __xen_mc_entry(0);
1055
1056                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1057                                         pfn_pte(pfn, PAGE_KERNEL),
1058                                         level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1059
1060                 if (ptl) {
1061                         /* unlock when batch completed */
1062                         xen_mc_callback(xen_pte_unlock, ptl);
1063                 }
1064         }
1065
1066         return 0;               /* never need to flush on unpin */
1067 }
1068
1069 /* Release a pagetables pages back as normal RW */
1070 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
1071 {
1072         xen_mc_batch();
1073
1074         xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1075
1076 #ifdef CONFIG_X86_64
1077         {
1078                 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1079
1080                 if (user_pgd) {
1081                         xen_do_pin(MMUEXT_UNPIN_TABLE,
1082                                    PFN_DOWN(__pa(user_pgd)));
1083                         xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
1084                 }
1085         }
1086 #endif
1087
1088 #ifdef CONFIG_X86_PAE
1089         /* Need to make sure unshared kernel PMD is unpinned */
1090         xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
1091                        PT_PMD);
1092 #endif
1093
1094         __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
1095
1096         xen_mc_issue(0);
1097 }
1098
1099 static void xen_pgd_unpin(struct mm_struct *mm)
1100 {
1101         __xen_pgd_unpin(mm, mm->pgd);
1102 }
1103
1104 /*
1105  * On resume, undo any pinning done at save, so that the rest of the
1106  * kernel doesn't see any unexpected pinned pagetables.
1107  */
1108 void xen_mm_unpin_all(void)
1109 {
1110         struct page *page;
1111
1112         spin_lock(&pgd_lock);
1113
1114         list_for_each_entry(page, &pgd_list, lru) {
1115                 if (PageSavePinned(page)) {
1116                         BUG_ON(!PagePinned(page));
1117                         __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
1118                         ClearPageSavePinned(page);
1119                 }
1120         }
1121
1122         spin_unlock(&pgd_lock);
1123 }
1124
1125 static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1126 {
1127         spin_lock(&next->page_table_lock);
1128         xen_pgd_pin(next);
1129         spin_unlock(&next->page_table_lock);
1130 }
1131
1132 static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1133 {
1134         spin_lock(&mm->page_table_lock);
1135         xen_pgd_pin(mm);
1136         spin_unlock(&mm->page_table_lock);
1137 }
1138
1139
1140 #ifdef CONFIG_SMP
1141 /* Another cpu may still have their %cr3 pointing at the pagetable, so
1142    we need to repoint it somewhere else before we can unpin it. */
1143 static void drop_other_mm_ref(void *info)
1144 {
1145         struct mm_struct *mm = info;
1146         struct mm_struct *active_mm;
1147
1148         active_mm = percpu_read(cpu_tlbstate.active_mm);
1149
1150         if (active_mm == mm)
1151                 leave_mm(smp_processor_id());
1152
1153         /* If this cpu still has a stale cr3 reference, then make sure
1154            it has been flushed. */
1155         if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
1156                 load_cr3(swapper_pg_dir);
1157 }
1158
1159 static void xen_drop_mm_ref(struct mm_struct *mm)
1160 {
1161         cpumask_var_t mask;
1162         unsigned cpu;
1163
1164         if (current->active_mm == mm) {
1165                 if (current->mm == mm)
1166                         load_cr3(swapper_pg_dir);
1167                 else
1168                         leave_mm(smp_processor_id());
1169         }
1170
1171         /* Get the "official" set of cpus referring to our pagetable. */
1172         if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1173                 for_each_online_cpu(cpu) {
1174                         if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1175                             && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1176                                 continue;
1177                         smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1178                 }
1179                 return;
1180         }
1181         cpumask_copy(mask, mm_cpumask(mm));
1182
1183         /* It's possible that a vcpu may have a stale reference to our
1184            cr3, because its in lazy mode, and it hasn't yet flushed
1185            its set of pending hypercalls yet.  In this case, we can
1186            look at its actual current cr3 value, and force it to flush
1187            if needed. */
1188         for_each_online_cpu(cpu) {
1189                 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1190                         cpumask_set_cpu(cpu, mask);
1191         }
1192
1193         if (!cpumask_empty(mask))
1194                 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1195         free_cpumask_var(mask);
1196 }
1197 #else
1198 static void xen_drop_mm_ref(struct mm_struct *mm)
1199 {
1200         if (current->active_mm == mm)
1201                 load_cr3(swapper_pg_dir);
1202 }
1203 #endif
1204
1205 /*
1206  * While a process runs, Xen pins its pagetables, which means that the
1207  * hypervisor forces it to be read-only, and it controls all updates
1208  * to it.  This means that all pagetable updates have to go via the
1209  * hypervisor, which is moderately expensive.
1210  *
1211  * Since we're pulling the pagetable down, we switch to use init_mm,
1212  * unpin old process pagetable and mark it all read-write, which
1213  * allows further operations on it to be simple memory accesses.
1214  *
1215  * The only subtle point is that another CPU may be still using the
1216  * pagetable because of lazy tlb flushing.  This means we need need to
1217  * switch all CPUs off this pagetable before we can unpin it.
1218  */
1219 static void xen_exit_mmap(struct mm_struct *mm)
1220 {
1221         get_cpu();              /* make sure we don't move around */
1222         xen_drop_mm_ref(mm);
1223         put_cpu();
1224
1225         spin_lock(&mm->page_table_lock);
1226
1227         /* pgd may not be pinned in the error exit path of execve */
1228         if (xen_page_pinned(mm->pgd))
1229                 xen_pgd_unpin(mm);
1230
1231         spin_unlock(&mm->page_table_lock);
1232 }
1233
1234 static __init void xen_pagetable_setup_start(pgd_t *base)
1235 {
1236 }
1237
1238 static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
1239 {
1240         /* reserve the range used */
1241         native_pagetable_reserve(start, end);
1242
1243         /* set as RW the rest */
1244         printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
1245                         PFN_PHYS(pgt_buf_top));
1246         while (end < PFN_PHYS(pgt_buf_top)) {
1247                 make_lowmem_page_readwrite(__va(end));
1248                 end += PAGE_SIZE;
1249         }
1250 }
1251
1252 static void xen_post_allocator_init(void);
1253
1254 static __init void xen_pagetable_setup_done(pgd_t *base)
1255 {
1256         xen_setup_shared_info();
1257         xen_post_allocator_init();
1258 }
1259
1260 static void xen_write_cr2(unsigned long cr2)
1261 {
1262         percpu_read(xen_vcpu)->arch.cr2 = cr2;
1263 }
1264
1265 static unsigned long xen_read_cr2(void)
1266 {
1267         return percpu_read(xen_vcpu)->arch.cr2;
1268 }
1269
1270 unsigned long xen_read_cr2_direct(void)
1271 {
1272         return percpu_read(xen_vcpu_info.arch.cr2);
1273 }
1274
1275 static void xen_flush_tlb(void)
1276 {
1277         struct mmuext_op *op;
1278         struct multicall_space mcs;
1279
1280         preempt_disable();
1281
1282         mcs = xen_mc_entry(sizeof(*op));
1283
1284         op = mcs.args;
1285         op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1286         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1287
1288         xen_mc_issue(PARAVIRT_LAZY_MMU);
1289
1290         preempt_enable();
1291 }
1292
1293 static void xen_flush_tlb_single(unsigned long addr)
1294 {
1295         struct mmuext_op *op;
1296         struct multicall_space mcs;
1297
1298         preempt_disable();
1299
1300         mcs = xen_mc_entry(sizeof(*op));
1301         op = mcs.args;
1302         op->cmd = MMUEXT_INVLPG_LOCAL;
1303         op->arg1.linear_addr = addr & PAGE_MASK;
1304         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1305
1306         xen_mc_issue(PARAVIRT_LAZY_MMU);
1307
1308         preempt_enable();
1309 }
1310
1311 static void xen_flush_tlb_others(const struct cpumask *cpus,
1312                                  struct mm_struct *mm, unsigned long va)
1313 {
1314         struct {
1315                 struct mmuext_op op;
1316                 DECLARE_BITMAP(mask, NR_CPUS);
1317         } *args;
1318         struct multicall_space mcs;
1319
1320         if (cpumask_empty(cpus))
1321                 return;         /* nothing to do */
1322
1323         mcs = xen_mc_entry(sizeof(*args));
1324         args = mcs.args;
1325         args->op.arg2.vcpumask = to_cpumask(args->mask);
1326
1327         /* Remove us, and any offline CPUS. */
1328         cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1329         cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1330
1331         if (va == TLB_FLUSH_ALL) {
1332                 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1333         } else {
1334                 args->op.cmd = MMUEXT_INVLPG_MULTI;
1335                 args->op.arg1.linear_addr = va;
1336         }
1337
1338         MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1339
1340         xen_mc_issue(PARAVIRT_LAZY_MMU);
1341 }
1342
1343 static unsigned long xen_read_cr3(void)
1344 {
1345         return percpu_read(xen_cr3);
1346 }
1347
1348 static void set_current_cr3(void *v)
1349 {
1350         percpu_write(xen_current_cr3, (unsigned long)v);
1351 }
1352
1353 static void __xen_write_cr3(bool kernel, unsigned long cr3)
1354 {
1355         struct mmuext_op *op;
1356         struct multicall_space mcs;
1357         unsigned long mfn;
1358
1359         if (cr3)
1360                 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1361         else
1362                 mfn = 0;
1363
1364         WARN_ON(mfn == 0 && kernel);
1365
1366         mcs = __xen_mc_entry(sizeof(*op));
1367
1368         op = mcs.args;
1369         op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1370         op->arg1.mfn = mfn;
1371
1372         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1373
1374         if (kernel) {
1375                 percpu_write(xen_cr3, cr3);
1376
1377                 /* Update xen_current_cr3 once the batch has actually
1378                    been submitted. */
1379                 xen_mc_callback(set_current_cr3, (void *)cr3);
1380         }
1381 }
1382
1383 static void xen_write_cr3(unsigned long cr3)
1384 {
1385         BUG_ON(preemptible());
1386
1387         xen_mc_batch();  /* disables interrupts */
1388
1389         /* Update while interrupts are disabled, so its atomic with
1390            respect to ipis */
1391         percpu_write(xen_cr3, cr3);
1392
1393         __xen_write_cr3(true, cr3);
1394
1395 #ifdef CONFIG_X86_64
1396         {
1397                 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1398                 if (user_pgd)
1399                         __xen_write_cr3(false, __pa(user_pgd));
1400                 else
1401                         __xen_write_cr3(false, 0);
1402         }
1403 #endif
1404
1405         xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1406 }
1407
1408 static int xen_pgd_alloc(struct mm_struct *mm)
1409 {
1410         pgd_t *pgd = mm->pgd;
1411         int ret = 0;
1412
1413         BUG_ON(PagePinned(virt_to_page(pgd)));
1414
1415 #ifdef CONFIG_X86_64
1416         {
1417                 struct page *page = virt_to_page(pgd);
1418                 pgd_t *user_pgd;
1419
1420                 BUG_ON(page->private != 0);
1421
1422                 ret = -ENOMEM;
1423
1424                 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1425                 page->private = (unsigned long)user_pgd;
1426
1427                 if (user_pgd != NULL) {
1428                         user_pgd[pgd_index(VSYSCALL_START)] =
1429                                 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1430                         ret = 0;
1431                 }
1432
1433                 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1434         }
1435 #endif
1436
1437         return ret;
1438 }
1439
1440 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1441 {
1442 #ifdef CONFIG_X86_64
1443         pgd_t *user_pgd = xen_get_user_pgd(pgd);
1444
1445         if (user_pgd)
1446                 free_page((unsigned long)user_pgd);
1447 #endif
1448 }
1449
1450 #ifdef CONFIG_X86_32
1451 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1452 {
1453         /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1454         if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1455                 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1456                                pte_val_ma(pte));
1457
1458         return pte;
1459 }
1460 #else /* CONFIG_X86_64 */
1461 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1462 {
1463         unsigned long pfn = pte_pfn(pte);
1464
1465         /*
1466          * If the new pfn is within the range of the newly allocated
1467          * kernel pagetable, and it isn't being mapped into an
1468          * early_ioremap fixmap slot as a freshly allocated page, make sure
1469          * it is RO.
1470          */
1471         if (((!is_early_ioremap_ptep(ptep) &&
1472                         pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
1473                         (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
1474                 pte = pte_wrprotect(pte);
1475
1476         return pte;
1477 }
1478 #endif /* CONFIG_X86_64 */
1479
1480 /* Init-time set_pte while constructing initial pagetables, which
1481    doesn't allow RO pagetable pages to be remapped RW */
1482 static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1483 {
1484         pte = mask_rw_pte(ptep, pte);
1485
1486         xen_set_pte(ptep, pte);
1487 }
1488
1489 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1490 {
1491         struct mmuext_op op;
1492         op.cmd = cmd;
1493         op.arg1.mfn = pfn_to_mfn(pfn);
1494         if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1495                 BUG();
1496 }
1497
1498 /* Early in boot, while setting up the initial pagetable, assume
1499    everything is pinned. */
1500 static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1501 {
1502 #ifdef CONFIG_FLATMEM
1503         BUG_ON(mem_map);        /* should only be used early */
1504 #endif
1505         make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1506         pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1507 }
1508
1509 /* Used for pmd and pud */
1510 static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1511 {
1512 #ifdef CONFIG_FLATMEM
1513         BUG_ON(mem_map);        /* should only be used early */
1514 #endif
1515         make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1516 }
1517
1518 /* Early release_pte assumes that all pts are pinned, since there's
1519    only init_mm and anything attached to that is pinned. */
1520 static __init void xen_release_pte_init(unsigned long pfn)
1521 {
1522         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1523         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1524 }
1525
1526 static __init void xen_release_pmd_init(unsigned long pfn)
1527 {
1528         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1529 }
1530
1531 /* This needs to make sure the new pte page is pinned iff its being
1532    attached to a pinned pagetable. */
1533 static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1534 {
1535         struct page *page = pfn_to_page(pfn);
1536
1537         if (PagePinned(virt_to_page(mm->pgd))) {
1538                 SetPagePinned(page);
1539
1540                 if (!PageHighMem(page)) {
1541                         make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1542                         if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1543                                 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1544                 } else {
1545                         /* make sure there are no stray mappings of
1546                            this page */
1547                         kmap_flush_unused();
1548                 }
1549         }
1550 }
1551
1552 static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1553 {
1554         xen_alloc_ptpage(mm, pfn, PT_PTE);
1555 }
1556
1557 static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1558 {
1559         xen_alloc_ptpage(mm, pfn, PT_PMD);
1560 }
1561
1562 /* This should never happen until we're OK to use struct page */
1563 static void xen_release_ptpage(unsigned long pfn, unsigned level)
1564 {
1565         struct page *page = pfn_to_page(pfn);
1566
1567         if (PagePinned(page)) {
1568                 if (!PageHighMem(page)) {
1569                         if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1570                                 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1571                         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1572                 }
1573                 ClearPagePinned(page);
1574         }
1575 }
1576
1577 static void xen_release_pte(unsigned long pfn)
1578 {
1579         xen_release_ptpage(pfn, PT_PTE);
1580 }
1581
1582 static void xen_release_pmd(unsigned long pfn)
1583 {
1584         xen_release_ptpage(pfn, PT_PMD);
1585 }
1586
1587 #if PAGETABLE_LEVELS == 4
1588 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1589 {
1590         xen_alloc_ptpage(mm, pfn, PT_PUD);
1591 }
1592
1593 static void xen_release_pud(unsigned long pfn)
1594 {
1595         xen_release_ptpage(pfn, PT_PUD);
1596 }
1597 #endif
1598
1599 void __init xen_reserve_top(void)
1600 {
1601 #ifdef CONFIG_X86_32
1602         unsigned long top = HYPERVISOR_VIRT_START;
1603         struct xen_platform_parameters pp;
1604
1605         if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1606                 top = pp.virt_start;
1607
1608         reserve_top_address(-top);
1609 #endif  /* CONFIG_X86_32 */
1610 }
1611
1612 /*
1613  * Like __va(), but returns address in the kernel mapping (which is
1614  * all we have until the physical memory mapping has been set up.
1615  */
1616 static void *__ka(phys_addr_t paddr)
1617 {
1618 #ifdef CONFIG_X86_64
1619         return (void *)(paddr + __START_KERNEL_map);
1620 #else
1621         return __va(paddr);
1622 #endif
1623 }
1624
1625 /* Convert a machine address to physical address */
1626 static unsigned long m2p(phys_addr_t maddr)
1627 {
1628         phys_addr_t paddr;
1629
1630         maddr &= PTE_PFN_MASK;
1631         paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1632
1633         return paddr;
1634 }
1635
1636 /* Convert a machine address to kernel virtual */
1637 static void *m2v(phys_addr_t maddr)
1638 {
1639         return __ka(m2p(maddr));
1640 }
1641
1642 /* Set the page permissions on an identity-mapped pages */
1643 static void set_page_prot(void *addr, pgprot_t prot)
1644 {
1645         unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1646         pte_t pte = pfn_pte(pfn, prot);
1647
1648         if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1649                 BUG();
1650 }
1651
1652 static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1653 {
1654         unsigned pmdidx, pteidx;
1655         unsigned ident_pte;
1656         unsigned long pfn;
1657
1658         level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1659                                       PAGE_SIZE);
1660
1661         ident_pte = 0;
1662         pfn = 0;
1663         for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1664                 pte_t *pte_page;
1665
1666                 /* Reuse or allocate a page of ptes */
1667                 if (pmd_present(pmd[pmdidx]))
1668                         pte_page = m2v(pmd[pmdidx].pmd);
1669                 else {
1670                         /* Check for free pte pages */
1671                         if (ident_pte == LEVEL1_IDENT_ENTRIES)
1672                                 break;
1673
1674                         pte_page = &level1_ident_pgt[ident_pte];
1675                         ident_pte += PTRS_PER_PTE;
1676
1677                         pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1678                 }
1679
1680                 /* Install mappings */
1681                 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1682                         pte_t pte;
1683
1684                         if (!pte_none(pte_page[pteidx]))
1685                                 continue;
1686
1687                         pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1688                         pte_page[pteidx] = pte;
1689                 }
1690         }
1691
1692         for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1693                 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1694
1695         set_page_prot(pmd, PAGE_KERNEL_RO);
1696 }
1697
1698 void __init xen_setup_machphys_mapping(void)
1699 {
1700         struct xen_machphys_mapping mapping;
1701         unsigned long machine_to_phys_nr_ents;
1702
1703         if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1704                 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1705                 machine_to_phys_nr_ents = mapping.max_mfn + 1;
1706         } else {
1707                 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
1708         }
1709         machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
1710 }
1711
1712 #ifdef CONFIG_X86_64
1713 static void convert_pfn_mfn(void *v)
1714 {
1715         pte_t *pte = v;
1716         int i;
1717
1718         /* All levels are converted the same way, so just treat them
1719            as ptes. */
1720         for (i = 0; i < PTRS_PER_PTE; i++)
1721                 pte[i] = xen_make_pte(pte[i].pte);
1722 }
1723
1724 /*
1725  * Set up the initial kernel pagetable.
1726  *
1727  * We can construct this by grafting the Xen provided pagetable into
1728  * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
1729  * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
1730  * means that only the kernel has a physical mapping to start with -
1731  * but that's enough to get __va working.  We need to fill in the rest
1732  * of the physical mapping once some sort of allocator has been set
1733  * up.
1734  */
1735 __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1736                                          unsigned long max_pfn)
1737 {
1738         pud_t *l3;
1739         pmd_t *l2;
1740
1741         /* max_pfn_mapped is the last pfn mapped in the initial memory
1742          * mappings. Considering that on Xen after the kernel mappings we
1743          * have the mappings of some pages that don't exist in pfn space, we
1744          * set max_pfn_mapped to the last real pfn mapped. */
1745         max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1746
1747         /* Zap identity mapping */
1748         init_level4_pgt[0] = __pgd(0);
1749
1750         /* Pre-constructed entries are in pfn, so convert to mfn */
1751         convert_pfn_mfn(init_level4_pgt);
1752         convert_pfn_mfn(level3_ident_pgt);
1753         convert_pfn_mfn(level3_kernel_pgt);
1754
1755         l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1756         l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1757
1758         memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1759         memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1760
1761         l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1762         l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1763         memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1764
1765         /* Set up identity map */
1766         xen_map_identity_early(level2_ident_pgt, max_pfn);
1767
1768         /* Make pagetable pieces RO */
1769         set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1770         set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1771         set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1772         set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1773         set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1774         set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1775
1776         /* Pin down new L4 */
1777         pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1778                           PFN_DOWN(__pa_symbol(init_level4_pgt)));
1779
1780         /* Unpin Xen-provided one */
1781         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1782
1783         /* Switch over */
1784         pgd = init_level4_pgt;
1785
1786         /*
1787          * At this stage there can be no user pgd, and no page
1788          * structure to attach it to, so make sure we just set kernel
1789          * pgd.
1790          */
1791         xen_mc_batch();
1792         __xen_write_cr3(true, __pa(pgd));
1793         xen_mc_issue(PARAVIRT_LAZY_CPU);
1794
1795         memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1796                       __pa(xen_start_info->pt_base +
1797                            xen_start_info->nr_pt_frames * PAGE_SIZE),
1798                       "XEN PAGETABLES");
1799
1800         return pgd;
1801 }
1802 #else   /* !CONFIG_X86_64 */
1803 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1804 static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1805
1806 static __init void xen_write_cr3_init(unsigned long cr3)
1807 {
1808         unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
1809
1810         BUG_ON(read_cr3() != __pa(initial_page_table));
1811         BUG_ON(cr3 != __pa(swapper_pg_dir));
1812
1813         /*
1814          * We are switching to swapper_pg_dir for the first time (from
1815          * initial_page_table) and therefore need to mark that page
1816          * read-only and then pin it.
1817          *
1818          * Xen disallows sharing of kernel PMDs for PAE
1819          * guests. Therefore we must copy the kernel PMD from
1820          * initial_page_table into a new kernel PMD to be used in
1821          * swapper_pg_dir.
1822          */
1823         swapper_kernel_pmd =
1824                 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1825         memcpy(swapper_kernel_pmd, initial_kernel_pmd,
1826                sizeof(pmd_t) * PTRS_PER_PMD);
1827         swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
1828                 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
1829         set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
1830
1831         set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1832         xen_write_cr3(cr3);
1833         pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
1834
1835         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
1836                           PFN_DOWN(__pa(initial_page_table)));
1837         set_page_prot(initial_page_table, PAGE_KERNEL);
1838         set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
1839
1840         pv_mmu_ops.write_cr3 = &xen_write_cr3;
1841 }
1842
1843 __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1844                                          unsigned long max_pfn)
1845 {
1846         pmd_t *kernel_pmd;
1847
1848         initial_kernel_pmd =
1849                 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1850
1851         max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1852
1853         kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1854         memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1855
1856         xen_map_identity_early(initial_kernel_pmd, max_pfn);
1857
1858         memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1859         initial_page_table[KERNEL_PGD_BOUNDARY] =
1860                 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
1861
1862         set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
1863         set_page_prot(initial_page_table, PAGE_KERNEL_RO);
1864         set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1865
1866         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1867
1868         pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
1869                           PFN_DOWN(__pa(initial_page_table)));
1870         xen_write_cr3(__pa(initial_page_table));
1871
1872         memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1873                       __pa(xen_start_info->pt_base +
1874                            xen_start_info->nr_pt_frames * PAGE_SIZE),
1875                       "XEN PAGETABLES");
1876
1877         return initial_page_table;
1878 }
1879 #endif  /* CONFIG_X86_64 */
1880
1881 static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
1882
1883 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1884 {
1885         pte_t pte;
1886
1887         phys >>= PAGE_SHIFT;
1888
1889         switch (idx) {
1890         case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1891 #ifdef CONFIG_X86_F00F_BUG
1892         case FIX_F00F_IDT:
1893 #endif
1894 #ifdef CONFIG_X86_32
1895         case FIX_WP_TEST:
1896         case FIX_VDSO:
1897 # ifdef CONFIG_HIGHMEM
1898         case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1899 # endif
1900 #else
1901         case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1902 #endif
1903         case FIX_TEXT_POKE0:
1904         case FIX_TEXT_POKE1:
1905                 /* All local page mappings */
1906                 pte = pfn_pte(phys, prot);
1907                 break;
1908
1909 #ifdef CONFIG_X86_LOCAL_APIC
1910         case FIX_APIC_BASE:     /* maps dummy local APIC */
1911                 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1912                 break;
1913 #endif
1914
1915 #ifdef CONFIG_X86_IO_APIC
1916         case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
1917                 /*
1918                  * We just don't map the IO APIC - all access is via
1919                  * hypercalls.  Keep the address in the pte for reference.
1920                  */
1921                 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1922                 break;
1923 #endif
1924
1925         case FIX_PARAVIRT_BOOTMAP:
1926                 /* This is an MFN, but it isn't an IO mapping from the
1927                    IO domain */
1928                 pte = mfn_pte(phys, prot);
1929                 break;
1930
1931         default:
1932                 /* By default, set_fixmap is used for hardware mappings */
1933                 pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
1934                 break;
1935         }
1936
1937         __native_set_fixmap(idx, pte);
1938
1939 #ifdef CONFIG_X86_64
1940         /* Replicate changes to map the vsyscall page into the user
1941            pagetable vsyscall mapping. */
1942         if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1943                 unsigned long vaddr = __fix_to_virt(idx);
1944                 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1945         }
1946 #endif
1947 }
1948
1949 __init void xen_ident_map_ISA(void)
1950 {
1951         unsigned long pa;
1952
1953         /*
1954          * If we're dom0, then linear map the ISA machine addresses into
1955          * the kernel's address space.
1956          */
1957         if (!xen_initial_domain())
1958                 return;
1959
1960         xen_raw_printk("Xen: setup ISA identity maps\n");
1961
1962         for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
1963                 pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
1964
1965                 if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
1966                         BUG();
1967         }
1968
1969         xen_flush_tlb();
1970 }
1971
1972 static __init void xen_post_allocator_init(void)
1973 {
1974 #ifdef CONFIG_XEN_DEBUG
1975         pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
1976 #endif
1977         pv_mmu_ops.set_pte = xen_set_pte;
1978         pv_mmu_ops.set_pmd = xen_set_pmd;
1979         pv_mmu_ops.set_pud = xen_set_pud;
1980 #if PAGETABLE_LEVELS == 4
1981         pv_mmu_ops.set_pgd = xen_set_pgd;
1982 #endif
1983
1984         /* This will work as long as patching hasn't happened yet
1985            (which it hasn't) */
1986         pv_mmu_ops.alloc_pte = xen_alloc_pte;
1987         pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1988         pv_mmu_ops.release_pte = xen_release_pte;
1989         pv_mmu_ops.release_pmd = xen_release_pmd;
1990 #if PAGETABLE_LEVELS == 4
1991         pv_mmu_ops.alloc_pud = xen_alloc_pud;
1992         pv_mmu_ops.release_pud = xen_release_pud;
1993 #endif
1994
1995 #ifdef CONFIG_X86_64
1996         SetPagePinned(virt_to_page(level3_user_vsyscall));
1997 #endif
1998         xen_mark_init_mm_pinned();
1999 }
2000
/*
 * lazy_mode.leave hook: flush any queued multicalls before leaving
 * lazy MMU mode.  Preemption stays disabled across both steps.
 */
static void xen_leave_lazy_mmu(void)
{
        preempt_disable();

        xen_mc_flush();
        paravirt_leave_lazy_mmu();

        preempt_enable();
}
2008
2009 static const struct pv_mmu_ops xen_mmu_ops __initdata = {
2010         .read_cr2 = xen_read_cr2,
2011         .write_cr2 = xen_write_cr2,
2012
2013         .read_cr3 = xen_read_cr3,
2014 #ifdef CONFIG_X86_32
2015         .write_cr3 = xen_write_cr3_init,
2016 #else
2017         .write_cr3 = xen_write_cr3,
2018 #endif
2019
2020         .flush_tlb_user = xen_flush_tlb,
2021         .flush_tlb_kernel = xen_flush_tlb,
2022         .flush_tlb_single = xen_flush_tlb_single,
2023         .flush_tlb_others = xen_flush_tlb_others,
2024
2025         .pte_update = paravirt_nop,
2026         .pte_update_defer = paravirt_nop,
2027
2028         .pgd_alloc = xen_pgd_alloc,
2029         .pgd_free = xen_pgd_free,
2030
2031         .alloc_pte = xen_alloc_pte_init,
2032         .release_pte = xen_release_pte_init,
2033         .alloc_pmd = xen_alloc_pmd_init,
2034         .release_pmd = xen_release_pmd_init,
2035
2036         .set_pte = xen_set_pte_init,
2037         .set_pte_at = xen_set_pte_at,
2038         .set_pmd = xen_set_pmd_hyper,
2039
2040         .ptep_modify_prot_start = __ptep_modify_prot_start,
2041         .ptep_modify_prot_commit = __ptep_modify_prot_commit,
2042
2043         .pte_val = PV_CALLEE_SAVE(xen_pte_val),
2044         .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
2045
2046         .make_pte = PV_CALLEE_SAVE(xen_make_pte),
2047         .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
2048
2049 #ifdef CONFIG_X86_PAE
2050         .set_pte_atomic = xen_set_pte_atomic,
2051         .pte_clear = xen_pte_clear,
2052         .pmd_clear = xen_pmd_clear,
2053 #endif  /* CONFIG_X86_PAE */
2054         .set_pud = xen_set_pud_hyper,
2055
2056         .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2057         .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2058
2059 #if PAGETABLE_LEVELS == 4
2060         .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2061         .make_pud = PV_CALLEE_SAVE(xen_make_pud),
2062         .set_pgd = xen_set_pgd_hyper,
2063
2064         .alloc_pud = xen_alloc_pmd_init,
2065         .release_pud = xen_release_pmd_init,
2066 #endif  /* PAGETABLE_LEVELS == 4 */
2067
2068         .activate_mm = xen_activate_mm,
2069         .dup_mmap = xen_dup_mmap,
2070         .exit_mmap = xen_exit_mmap,
2071
2072         .lazy_mode = {
2073                 .enter = paravirt_enter_lazy_mmu,
2074                 .leave = xen_leave_lazy_mmu,
2075         },
2076
2077         .set_fixmap = xen_set_fixmap,
2078 };
2079
2080 void __init xen_init_mmu_ops(void)
2081 {
2082         x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
2083         x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2084         x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2085         pv_mmu_ops = xen_mmu_ops;
2086
2087         memset(dummy_mapping, 0xff, PAGE_SIZE);
2088 }
2089
2090 /* Protected by xen_reservation_lock. */
2091 #define MAX_CONTIG_ORDER 9 /* 2MB */
2092 static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2093
2094 #define VOID_PTE (mfn_pte(0, __pgprot(0)))
2095 static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2096                                 unsigned long *in_frames,
2097                                 unsigned long *out_frames)
2098 {
2099         int i;
2100         struct multicall_space mcs;
2101
2102         xen_mc_batch();
2103         for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2104                 mcs = __xen_mc_entry(0);
2105
2106                 if (in_frames)
2107                         in_frames[i] = virt_to_mfn(vaddr);
2108
2109                 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2110                 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2111
2112                 if (out_frames)
2113                         out_frames[i] = virt_to_pfn(vaddr);
2114         }
2115         xen_mc_issue(0);
2116 }
2117
2118 /*
2119  * Update the pfn-to-mfn mappings for a virtual address range, either to
2120  * point to an array of mfns, or contiguously from a single starting
2121  * mfn.
2122  */
2123 static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2124                                      unsigned long *mfns,
2125                                      unsigned long first_mfn)
2126 {
2127         unsigned i, limit;
2128         unsigned long mfn;
2129
2130         xen_mc_batch();
2131
2132         limit = 1u << order;
2133         for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2134                 struct multicall_space mcs;
2135                 unsigned flags;
2136
2137                 mcs = __xen_mc_entry(0);
2138                 if (mfns)
2139                         mfn = mfns[i];
2140                 else
2141                         mfn = first_mfn + i;
2142
2143                 if (i < (limit - 1))
2144                         flags = 0;
2145                 else {
2146                         if (order == 0)
2147                                 flags = UVMF_INVLPG | UVMF_ALL;
2148                         else
2149                                 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2150                 }
2151
2152                 MULTI_update_va_mapping(mcs.mc, vaddr,
2153                                 mfn_pte(mfn, PAGE_KERNEL), flags);
2154
2155                 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2156         }
2157
2158         xen_mc_issue(0);
2159 }
2160
2161 /*
2162  * Perform the hypercall to exchange a region of our pfns to point to
2163  * memory with the required contiguous alignment.  Takes the pfns as
2164  * input, and populates mfns as output.
2165  *
2166  * Returns a success code indicating whether the hypervisor was able to
2167  * satisfy the request or not.
2168  */
2169 static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2170                                unsigned long *pfns_in,
2171                                unsigned long extents_out,
2172                                unsigned int order_out,
2173                                unsigned long *mfns_out,
2174                                unsigned int address_bits)
2175 {
2176         long rc;
2177         int success;
2178
2179         struct xen_memory_exchange exchange = {
2180                 .in = {
2181                         .nr_extents   = extents_in,
2182                         .extent_order = order_in,
2183                         .extent_start = pfns_in,
2184                         .domid        = DOMID_SELF
2185                 },
2186                 .out = {
2187                         .nr_extents   = extents_out,
2188                         .extent_order = order_out,
2189                         .extent_start = mfns_out,
2190                         .address_bits = address_bits,
2191                         .domid        = DOMID_SELF
2192                 }
2193         };
2194
2195         BUG_ON(extents_in << order_in != extents_out << order_out);
2196
2197         rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2198         success = (exchange.nr_exchanged == extents_in);
2199
2200         BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2201         BUG_ON(success && (rc != 0));
2202
2203         return success;
2204 }
2205
2206 int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2207                                  unsigned int address_bits)
2208 {
2209         unsigned long *in_frames = discontig_frames, out_frame;
2210         unsigned long  flags;
2211         int            success;
2212
2213         /*
2214          * Currently an auto-translated guest will not perform I/O, nor will
2215          * it require PAE page directories below 4GB. Therefore any calls to
2216          * this function are redundant and can be ignored.
2217          */
2218
2219         if (xen_feature(XENFEAT_auto_translated_physmap))
2220                 return 0;
2221
2222         if (unlikely(order > MAX_CONTIG_ORDER))
2223                 return -ENOMEM;
2224
2225         memset((void *) vstart, 0, PAGE_SIZE << order);
2226
2227         spin_lock_irqsave(&xen_reservation_lock, flags);
2228
2229         /* 1. Zap current PTEs, remembering MFNs. */
2230         xen_zap_pfn_range(vstart, order, in_frames, NULL);
2231
2232         /* 2. Get a new contiguous memory extent. */
2233         out_frame = virt_to_pfn(vstart);
2234         success = xen_exchange_memory(1UL << order, 0, in_frames,
2235                                       1, order, &out_frame,
2236                                       address_bits);
2237
2238         /* 3. Map the new extent in place of old pages. */
2239         if (success)
2240                 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2241         else
2242                 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2243
2244         spin_unlock_irqrestore(&xen_reservation_lock, flags);
2245
2246         return success ? 0 : -ENOMEM;
2247 }
2248 EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2249
2250 void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2251 {
2252         unsigned long *out_frames = discontig_frames, in_frame;
2253         unsigned long  flags;
2254         int success;
2255
2256         if (xen_feature(XENFEAT_auto_translated_physmap))
2257                 return;
2258
2259         if (unlikely(order > MAX_CONTIG_ORDER))
2260                 return;
2261
2262         memset((void *) vstart, 0, PAGE_SIZE << order);
2263
2264         spin_lock_irqsave(&xen_reservation_lock, flags);
2265
2266         /* 1. Find start MFN of contiguous extent. */
2267         in_frame = virt_to_mfn(vstart);
2268
2269         /* 2. Zap current PTEs. */
2270         xen_zap_pfn_range(vstart, order, NULL, out_frames);
2271
2272         /* 3. Do the exchange for non-contiguous MFNs. */
2273         success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2274                                         0, out_frames, 0);
2275
2276         /* 4. Map new pages in place of old pages. */
2277         if (success)
2278                 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2279         else
2280                 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2281
2282         spin_unlock_irqrestore(&xen_reservation_lock, flags);
2283 }
2284 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2285
2286 #ifdef CONFIG_XEN_PVHVM
2287 static void xen_hvm_exit_mmap(struct mm_struct *mm)
2288 {
2289         struct xen_hvm_pagetable_dying a;
2290         int rc;
2291
2292         a.domid = DOMID_SELF;
2293         a.gpa = __pa(mm->pgd);
2294         rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2295         WARN_ON_ONCE(rc < 0);
2296 }
2297
2298 static int is_pagetable_dying_supported(void)
2299 {
2300         struct xen_hvm_pagetable_dying a;
2301         int rc = 0;
2302
2303         a.domid = DOMID_SELF;
2304         a.gpa = 0x00;
2305         rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2306         if (rc < 0) {
2307                 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2308                 return 0;
2309         }
2310         return 1;
2311 }
2312
2313 void __init xen_hvm_init_mmu_ops(void)
2314 {
2315         if (is_pagetable_dying_supported())
2316                 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2317 }
2318 #endif
2319
2320 #define REMAP_BATCH_SIZE 16
2321
2322 struct remap_data {
2323         unsigned long mfn;
2324         pgprot_t prot;
2325         struct mmu_update *mmu_update;
2326 };
2327
2328 static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2329                                  unsigned long addr, void *data)
2330 {
2331         struct remap_data *rmd = data;
2332         pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
2333
2334         rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
2335         rmd->mmu_update->val = pte_val_ma(pte);
2336         rmd->mmu_update++;
2337
2338         return 0;
2339 }
2340
2341 int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2342                                unsigned long addr,
2343                                unsigned long mfn, int nr,
2344                                pgprot_t prot, unsigned domid)
2345 {
2346         struct remap_data rmd;
2347         struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2348         int batch;
2349         unsigned long range;
2350         int err = 0;
2351
2352         prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2353
2354         BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
2355                                 (VM_PFNMAP | VM_RESERVED | VM_IO)));
2356
2357         rmd.mfn = mfn;
2358         rmd.prot = prot;
2359
2360         while (nr) {
2361                 batch = min(REMAP_BATCH_SIZE, nr);
2362                 range = (unsigned long)batch << PAGE_SHIFT;
2363
2364                 rmd.mmu_update = mmu_update;
2365                 err = apply_to_page_range(vma->vm_mm, addr, range,
2366                                           remap_area_mfn_pte_fn, &rmd);
2367                 if (err)
2368                         goto out;
2369
2370                 err = -EFAULT;
2371                 if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
2372                         goto out;
2373
2374                 nr -= batch;
2375                 addr += range;
2376         }
2377
2378         err = 0;
2379 out:
2380
2381         flush_tlb_all();
2382
2383         return err;
2384 }
2385 EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
2386
2387 #ifdef CONFIG_XEN_DEBUG_FS
2388
2389 static int p2m_dump_open(struct inode *inode, struct file *filp)
2390 {
2391         return single_open(filp, p2m_dump_show, NULL);
2392 }
2393
2394 static const struct file_operations p2m_dump_fops = {
2395         .open           = p2m_dump_open,
2396         .read           = seq_read,
2397         .llseek         = seq_lseek,
2398         .release        = single_release,
2399 };
2400
2401 static struct dentry *d_mmu_debug;
2402
2403 static int __init xen_mmu_debugfs(void)
2404 {
2405         struct dentry *d_xen = xen_init_debugfs();
2406
2407         if (d_xen == NULL)
2408                 return -ENOMEM;
2409
2410         d_mmu_debug = debugfs_create_dir("mmu", d_xen);
2411
2412         debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
2413
2414         debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
2415         debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
2416                            &mmu_stats.pgd_update_pinned);
2417         debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
2418                            &mmu_stats.pgd_update_pinned);
2419
2420         debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2421         debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2422                            &mmu_stats.pud_update_pinned);
2423         debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2424                            &mmu_stats.pud_update_pinned);
2425
2426         debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2427         debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2428                            &mmu_stats.pmd_update_pinned);
2429         debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2430                            &mmu_stats.pmd_update_pinned);
2431
2432         debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2433 //      debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2434 //                         &mmu_stats.pte_update_pinned);
2435         debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2436                            &mmu_stats.pte_update_pinned);
2437
2438         debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2439         debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2440                            &mmu_stats.mmu_update_extended);
2441         xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2442                                      mmu_stats.mmu_update_histo, 20);
2443
2444         debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2445         debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2446                            &mmu_stats.prot_commit_batched);
2447
2448         debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
2449         return 0;
2450 }
2451 fs_initcall(xen_mmu_debugfs);
2452
2453 #endif  /* CONFIG_XEN_DEBUG_FS */