arch/powerpc/mm/hugetlbpage.c

   1 /*
   2  * PPC64 (POWER4) Huge TLB Page Support for Kernel.
   3  *
   4  * Copyright (C) 2003 David Gibson, IBM Corporation.
   5  *
   6  * Based on the IA-32 version:
   7  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
   8  */
   9
  10 #include <linux/init.h>
  11 #include <linux/fs.h>
  12 #include <linux/mm.h>
  13 #include <linux/hugetlb.h>
  14 #include <linux/pagemap.h>
  15 #include <linux/slab.h>
  16 #include <linux/err.h>
  17 #include <linux/sysctl.h>
  18 #include <asm/mman.h>
  19 #include <asm/pgalloc.h>
  20 #include <asm/tlb.h>
  21 #include <asm/tlbflush.h>
  22 #include <asm/mmu_context.h>
  23 #include <asm/machdep.h>
  24 #include <asm/cputable.h>
  25 #include <asm/spu.h>
  26
     /* log2 of the huge page sizes this file knows about. */
  27 #define PAGE_SHIFT_64K  16	/* 1 << 16 = 64K */
  28 #define PAGE_SHIFT_16M  24	/* 1 << 24 = 16M */
  29 #define PAGE_SHIFT_16G  34	/* 1 << 34 = 16G */
  30
     /* Number of SID_SHIFT-sized areas in the first 4GB (0x100000000). */
  31 #define NUM_LOW_AREAS   (0x100000000UL >> SID_SHIFT)
     /* Number of HTLB_AREA_SHIFT-sized areas in PGTABLE_RANGE. */
  32 #define NUM_HIGH_AREAS  (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
     /* Capacity of gpage_freearray[] below. */
  33 #define MAX_NUMBER_GPAGES       1024
  34
  35 /* Tracks the 16G pages after the device tree is scanned and before the
  36  * huge_boot_pages list is ready.  */
     /* Filled by add_gpage(), drained by alloc_bootmem_huge_page(). */
  37 static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
     /* Number of valid entries currently held in gpage_freearray[]. */
  38 static unsigned nr_gpages;
  39
  40 /* Array of valid huge page sizes - non-zero value(hugepte_shift) is
  41  * stored for the huge page sizes that are valid.
  42  */
  43 unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
  44
     /*
      * hugepte_shift is only another name for mmu_huge_psizes: for a valid
      * huge page size, the stored value is log2 of the number of entries in
      * one hugepte table (see PTRS_PER_HUGEPTE below).
      */
  45 #define hugepte_shift                   mmu_huge_psizes
     /* Entries in, and byte size of, one hugepte table for @psize. */
  46 #define PTRS_PER_HUGEPTE(psize)         (1 << hugepte_shift[psize])
  47 #define HUGEPTE_TABLE_SIZE(psize)       (sizeof(pte_t) << hugepte_shift[psize])
  48
     /*
      * Shift, size and mask of the address range covered by one hugepte
      * table: the page shift of @psize plus the table's index bits.
      */
  49 #define HUGEPD_SHIFT(psize)             (mmu_psize_to_shift(psize) \
  50                                                 + hugepte_shift[psize])
  51 #define HUGEPD_SIZE(psize)              (1UL << HUGEPD_SHIFT(psize))
  52 #define HUGEPD_MASK(psize)              (~(HUGEPD_SIZE(psize)-1))
  53
  54 /* Subtract one from array size because we don't need a cache for 4K since
  55  * is not a huge page size */
  56 #define HUGE_PGTABLE_INDEX(psize)       (HUGEPTE_CACHE_NUM + psize - 1)
  57 #define HUGEPTE_CACHE_NAME(psize)       (huge_pgtable_cache_name[psize])
  58
  59 static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
  60         "unused_4K", "hugepte_cache_64K", "unused_64K_AP",
  61         "hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G"
  62 };
  63
  64 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  65  * will choke on pointers to hugepte tables, which is handy for
  66  * catching screwups early. */
  67 #define HUGEPD_OK       0x1
  68
     /*
      * A page-directory slot that refers to a hugepte table: the table's
      * address with HUGEPD_OK or'ed in, or 0 when the slot is empty.
      */
  69 typedef struct { unsigned long pd; } hugepd_t;
  70
     /* True when the slot does not refer to any hugepte table. */
  71 #define hugepd_none(hpd)        ((hpd).pd == 0)
  72
  73 static inline int shift_to_mmu_psize(unsigned int shift)
  74 {
  75         switch (shift) {
  76 #ifndef CONFIG_PPC_64K_PAGES
  77         case PAGE_SHIFT_64K:
  78             return MMU_PAGE_64K;
  79 #endif
  80         case PAGE_SHIFT_16M:
  81             return MMU_PAGE_16M;
  82         case PAGE_SHIFT_16G:
  83             return MMU_PAGE_16G;
  84         }
  85         return -1;
  86 }
  87
  88 static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
  89 {
  90         if (mmu_psize_defs[mmu_psize].shift)
  91                 return mmu_psize_defs[mmu_psize].shift;
  92         BUG();
  93 }
  94
  95 static inline pte_t *hugepd_page(hugepd_t hpd)
  96 {
  97         BUG_ON(!(hpd.pd & HUGEPD_OK));
  98         return (pte_t *)(hpd.pd & ~HUGEPD_OK);
  99 }
 100
 101 static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
 102                                     struct hstate *hstate)
 103 {
 104         unsigned int shift = huge_page_shift(hstate);
 105         int psize = shift_to_mmu_psize(shift);
 106         unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
 107         pte_t *dir = hugepd_page(*hpdp);
 108
 109         return dir + idx;
 110 }
 111
 112 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 113                            unsigned long address, unsigned int psize)
 114 {
 115         pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)],
 116                                       GFP_KERNEL|__GFP_REPEAT);
 117
 118         if (! new)
 119                 return -ENOMEM;
 120
 121         spin_lock(&mm->page_table_lock);
 122         if (!hugepd_none(*hpdp))
 123                 kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new);
 124         else
 125                 hpdp->pd = (unsigned long)new | HUGEPD_OK;
 126         spin_unlock(&mm->page_table_lock);
 127         return 0;
 128 }
 129
 130
 131 static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
 132 {
 133         if (huge_page_shift(hstate) < PUD_SHIFT)
 134                 return pud_offset(pgd, addr);
 135         else
 136                 return (pud_t *) pgd;
 137 }
 138 static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr,
 139                          struct hstate *hstate)
 140 {
 141         if (huge_page_shift(hstate) < PUD_SHIFT)
 142                 return pud_alloc(mm, pgd, addr);
 143         else
 144                 return (pud_t *) pgd;
 145 }
 146 static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
 147 {
 148         if (huge_page_shift(hstate) < PMD_SHIFT)
 149                 return pmd_offset(pud, addr);
 150         else
 151                 return (pmd_t *) pud;
 152 }
 153 static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
 154                          struct hstate *hstate)
 155 {
 156         if (huge_page_shift(hstate) < PMD_SHIFT)
 157                 return pmd_alloc(mm, pud, addr);
 158         else
 159                 return (pmd_t *) pud;
 160 }
 161
 162 /* Build list of addresses of gigantic pages.  This function is used in early
 163  * boot before the buddy or bootmem allocator is setup.
 164  */
 165 void add_gpage(unsigned long addr, unsigned long page_size,
 166         unsigned long number_of_pages)
 167 {
 168         if (!addr)
 169                 return;
 170         while (number_of_pages > 0) {
 171                 gpage_freearray[nr_gpages] = addr;
 172                 nr_gpages++;
 173                 number_of_pages--;
 174                 addr += page_size;
 175         }
 176 }
 177
 178 /* Moves the gigantic page addresses from the temporary list to the
 179  * huge_boot_pages list.
 180  */
 181 int alloc_bootmem_huge_page(struct hstate *hstate)
 182 {
 183         struct huge_bootmem_page *m;
 184         if (nr_gpages == 0)
 185                 return 0;
 186         m = phys_to_virt(gpage_freearray[--nr_gpages]);
 187         gpage_freearray[nr_gpages] = 0;
 188         list_add(&m->list, &huge_boot_pages);
 189         m->hstate = hstate;
 190         return 1;
 191 }
 192
 193
 194 /* Modelled after find_linux_pte() */
 195 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 196 {
 197         pgd_t *pg;
 198         pud_t *pu;
 199         pmd_t *pm;
 200
 201         unsigned int psize;
 202         unsigned int shift;
 203         unsigned long sz;
 204         struct hstate *hstate;
 205         psize = get_slice_psize(mm, addr);
 206         shift = mmu_psize_to_shift(psize);
 207         sz = ((1UL) << shift);
 208         hstate = size_to_hstate(sz);
 209
 210         addr &= hstate->mask;
 211
 212         pg = pgd_offset(mm, addr);
 213         if (!pgd_none(*pg)) {
 214                 pu = hpud_offset(pg, addr, hstate);
 215                 if (!pud_none(*pu)) {
 216                         pm = hpmd_offset(pu, addr, hstate);
 217                         if (!pmd_none(*pm))
 218                                 return hugepte_offset((hugepd_t *)pm, addr,
 219                                                       hstate);
 220                 }
 221         }
 222
 223         return NULL;
 224 }
 225
 226 pte_t *huge_pte_alloc(struct mm_struct *mm,
 227                         unsigned long addr, unsigned long sz)
 228 {
 229         pgd_t *pg;
 230         pud_t *pu;
 231         pmd_t *pm;
 232         hugepd_t *hpdp = NULL;
 233         struct hstate *hstate;
 234         unsigned int psize;
 235         hstate = size_to_hstate(sz);
 236
 237         psize = get_slice_psize(mm, addr);
 238         BUG_ON(!mmu_huge_psizes[psize]);
 239
 240         addr &= hstate->mask;
 241
 242         pg = pgd_offset(mm, addr);
 243         pu = hpud_alloc(mm, pg, addr, hstate);
 244
 245         if (pu) {
 246                 pm = hpmd_alloc(mm, pu, addr, hstate);
 247                 if (pm)
 248                         hpdp = (hugepd_t *)pm;
 249         }
 250
 251         if (! hpdp)
 252                 return NULL;
 253
 254         if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
 255                 return NULL;
 256
 257         return hugepte_offset(hpdp, addr, hstate);
 258 }
 259
 260 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 261 {
 262         return 0;
 263 }
 264
 265 static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
 266                                unsigned int psize)
 267 {
 268         pte_t *hugepte = hugepd_page(*hpdp);
 269
 270         hpdp->pd = 0;
 271         tlb->need_flush = 1;
 272         pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
 273                                                  HUGEPTE_CACHE_NUM+psize-1,
 274                                                  PGF_CACHENUM_MASK));
 275 }
 276
 277 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 278                                    unsigned long addr, unsigned long end,
 279                                    unsigned long floor, unsigned long ceiling,
 280                                    unsigned int psize)
 281 {
 282         pmd_t *pmd;
 283         unsigned long next;
 284         unsigned long start;
 285
 286         start = addr;
 287         pmd = pmd_offset(pud, addr);
 288         do {
 289                 next = pmd_addr_end(addr, end);
 290                 if (pmd_none(*pmd))
 291                         continue;
 292                 free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
 293         } while (pmd++, addr = next, addr != end);
 294
 295         start &= PUD_MASK;
 296         if (start < floor)
 297                 return;
 298         if (ceiling) {
 299                 ceiling &= PUD_MASK;
 300                 if (!ceiling)
 301                         return;
 302         }
 303         if (end - 1 > ceiling - 1)
 304                 return;
 305
 306         pmd = pmd_offset(pud, start);
 307         pud_clear(pud);
 308         pmd_free_tlb(tlb, pmd, start);
 309 }
 310
 311 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 312                                    unsigned long addr, unsigned long end,
 313                                    unsigned long floor, unsigned long ceiling)
 314 {
 315         pud_t *pud;
 316         unsigned long next;
 317         unsigned long start;
 318         unsigned int shift;
 319         unsigned int psize = get_slice_psize(tlb->mm, addr);
 320         shift = mmu_psize_to_shift(psize);
 321
 322         start = addr;
 323         pud = pud_offset(pgd, addr);
 324         do {
 325                 next = pud_addr_end(addr, end);
 326                 if (shift < PMD_SHIFT) {
 327                         if (pud_none_or_clear_bad(pud))
 328                                 continue;
 329                         hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
 330                                                ceiling, psize);
 331                 } else {
 332                         if (pud_none(*pud))
 333                                 continue;
 334                         free_hugepte_range(tlb, (hugepd_t *)pud, psize);
 335                 }
 336         } while (pud++, addr = next, addr != end);
 337
 338         start &= PGDIR_MASK;
 339         if (start < floor)
 340                 return;
 341         if (ceiling) {
 342                 ceiling &= PGDIR_MASK;
 343                 if (!ceiling)
 344                         return;
 345         }
 346         if (end - 1 > ceiling - 1)
 347                 return;
 348
 349         pud = pud_offset(pgd, start);
 350         pgd_clear(pgd);
 351         pud_free_tlb(tlb, pud, start);
 352 }
 353
 354 /*
 355  * This function frees user-level page tables of a process.
 356  *
 357  * Must be called with pagetable lock held.
      *
      * @tlb:            gather the freed tables are queued on; tlb->mm is the
      *                  address space that is walked
      * @addr, @end:     range whose huge page tables are to be freed
      * @floor, @ceiling: limits outside of which tables must be kept; see
      *                  the long comment in the body for the meaning of 0
 358  */
 359 void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 360                             unsigned long addr, unsigned long end,
 361                             unsigned long floor, unsigned long ceiling)
 362 {
 363         pgd_t *pgd;
 364         unsigned long next;
 365         unsigned long start;
 366
 367         /*
 368          * Comments below taken from the normal free_pgd_range().  They
 369          * apply here too.  The tests against HUGEPD_MASK below are
 370          * essential, because we *don't* test for this at the bottom
 371          * level.  Without them we'll attempt to free a hugepte table
 372          * when we unmap just part of it, even if there are other
 373          * active mappings using it.
 374          *
 375          * The next few lines have given us lots of grief...
 376          *
 377          * Why are we testing HUGEPD* at this top level?  Because
 378          * often there will be no work to do at all, and we'd prefer
 379          * not to go all the way down to the bottom just to discover
 380          * that.
 381          *
 382          * Why all these "- 1"s?  Because 0 represents both the bottom
 383          * of the address space and the top of it (using -1 for the
 384          * top wouldn't help much: the masks would do the wrong thing).
 385          * The rule is that addr 0 and floor 0 refer to the bottom of
 386          * the address space, but end 0 and ceiling 0 refer to the top.
 387          * Comparisons need to use "end - 1" and "ceiling - 1" (though
 388          * that end 0 case should be mythical).
 389          *
 390          * Wherever addr is brought up or ceiling brought down, we
 391          * must be careful to reject "the opposite 0" before it
 392          * confuses the subsequent tests.  But what about where end is
 393          * brought down by HUGEPD_SIZE below? no, end can't go down to
 394          * 0 there.
 395          *
 396          * Whereas we round start (addr) and ceiling down, by different
 397          * masks at different levels, in order to test whether a table
 398          * now has no other vmas using it, so can be freed, we don't
 399          * bother to round floor or end up - the tests don't need that.
 400          */
 401         unsigned int psize = get_slice_psize(tlb->mm, addr);
 402
             /* Round addr down to a hugepd boundary; step up again if that
              * went below floor (a wrap to 0 means nothing is left). */
 403         addr &= HUGEPD_MASK(psize);
 404         if (addr < floor) {
 405                 addr += HUGEPD_SIZE(psize);
 406                 if (!addr)
 407                         return;
 408         }
 409         if (ceiling) {
 410                 ceiling &= HUGEPD_MASK(psize);
 411                 if (!ceiling)
 412                         return;
 413         }
             /* Pull end back by one hugepd when it lies beyond ceiling. */
 414         if (end - 1 > ceiling - 1)
 415                 end -= HUGEPD_SIZE(psize);
 416         if (addr > end - 1)
 417                 return;
 418
             /* NOTE(review): start is assigned but not read again below. */
 419         start = addr;
 420         pgd = pgd_offset(tlb->mm, addr);
 421         do {
                     /* The page size is looked up again for every PGD entry. */
 422                 psize = get_slice_psize(tlb->mm, addr);
 423                 BUG_ON(!mmu_huge_psizes[psize]);
 424                 next = pgd_addr_end(addr, end);
 425                 if (mmu_psize_to_shift(psize) < PUD_SHIFT) {
 426                         if (pgd_none_or_clear_bad(pgd))
 427                                 continue;
 428                         hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 429                 } else {
                             /* The PGD entry itself refers to the hugepte table. */
 430                         if (pgd_none(*pgd))
 431                                 continue;
 432                         free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
 433                 }
 434         } while (pgd++, addr = next, addr != end);
 435 }
 436
 437 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 438                      pte_t *ptep, pte_t pte)
 439 {
 440         if (pte_present(*ptep)) {
 441                 /* We open-code pte_clear because we need to pass the right
 442                  * argument to hpte_need_flush (huge / !huge). Might not be
 443                  * necessary anymore if we make hpte_need_flush() get the
 444                  * page size from the slices
 445                  */
 446                 unsigned int psize = get_slice_psize(mm, addr);
 447                 unsigned int shift = mmu_psize_to_shift(psize);
 448                 unsigned long sz = ((1UL) << shift);
 449                 struct hstate *hstate = size_to_hstate(sz);
 450                 pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
 451         }
 452         *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 453 }
 454
 455 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 456                               pte_t *ptep)
 457 {
 458         unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
 459         return __pte(old);
 460 }
 461
 462 struct page *
 463 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 464 {
 465         pte_t *ptep;
 466         struct page *page;
 467         unsigned int mmu_psize = get_slice_psize(mm, address);
 468
 469         /* Verify it is a huge page else bail. */
 470         if (!mmu_huge_psizes[mmu_psize])
 471                 return ERR_PTR(-EINVAL);
 472
 473         ptep = huge_pte_offset(mm, address);
 474         page = pte_page(*ptep);
 475         if (page) {
 476                 unsigned int shift = mmu_psize_to_shift(mmu_psize);
 477                 unsigned long sz = ((1UL) << shift);
 478                 page += (address % sz) / PAGE_SIZE;
 479         }
 480
 481         return page;
 482 }
 483
 484 int pmd_huge(pmd_t pmd)
 485 {
 486         return 0;
 487 }
 488
 489 int pud_huge(pud_t pud)
 490 {
 491         return 0;
 492 }
 493
 494 struct page *
 495 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 496                 pmd_t *pmd, int write)
 497 {
 498         BUG();
 499         return NULL;
 500 }
 501
 502
 503 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 504                                         unsigned long len, unsigned long pgoff,
 505                                         unsigned long flags)
 506 {
 507         struct hstate *hstate = hstate_file(file);
 508         int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 509
 510         if (!mmu_huge_psizes[mmu_psize])
 511                 return -EINVAL;
 512         return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
 513 }
 514
 515 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 516 {
 517         unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
 518
 519         return 1UL << mmu_psize_to_shift(psize);
 520 }
 521
 522 /*
 523  * Called by asm hashtable.S for doing lazy icache flush
 524  */
 525 static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
 526                                         pte_t pte, int trap, unsigned long sz)
 527 {
 528         struct page *page;
 529         int i;
 530
 531         if (!pfn_valid(pte_pfn(pte)))
 532                 return rflags;
 533
 534         page = pte_page(pte);
 535
 536         /* page is dirty */
 537         if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
 538                 if (trap == 0x400) {
 539                         for (i = 0; i < (sz / PAGE_SIZE); i++)
 540                                 __flush_dcache_icache(page_address(page+i));
 541                         set_bit(PG_arch_1, &page->flags);
 542                 } else {
 543                         rflags |= HPTE_R_N;
 544                 }
 545         }
 546         return rflags;
 547 }
 548
     /*
      * Build or update the hash page table entry (HPTE) for the huge page
      * that maps @ea, based on the Linux huge PTE found for it.
      *
      * @mm:     address space the PTE is looked up in
      * @access: PTE bits that must all be set in the PTE for the access
      *          to be allowed
      * @ea:     effective address
      * @vsid:   passed to hpt_va() together with @ea
      * @local:  passed through to ppc_md.hpte_updatepp()
      * @trap:   passed to hash_huge_page_do_lazy_icache()
      *
      * Returns 0 when the HPTE was set up and the PTE written back, 1 when
      * nothing was done: the slice is not a huge page slice, there is no
      * PTE or it is empty, @access is not permitted, or the PTE is busy.
      */
 549 int hash_huge_page(struct mm_struct *mm, unsigned long access,
 550                    unsigned long ea, unsigned long vsid, int local,
 551                    unsigned long trap)
 552 {
 553         pte_t *ptep;
 554         unsigned long old_pte, new_pte;
 555         unsigned long va, rflags, pa, sz;
 556         long slot;
 557         int err = 1;
 558         int ssize = user_segment_size(ea);
 559         unsigned int mmu_psize;
 560         int shift;
 561         mmu_psize = get_slice_psize(mm, ea);
 562
 563         if (!mmu_huge_psizes[mmu_psize])
 564                 goto out;
 565         ptep = huge_pte_offset(mm, ea);
 566
 567         /* Search the Linux page table for a match with va */
 568         va = hpt_va(ea, vsid, ssize);
 569
 570         /*
 571          * If no pte found or not present, send the problem up to
 572          * do_page_fault
 573          */
 574         if (unlikely(!ptep || pte_none(*ptep)))
 575                 goto out;
 576
 577         /*
 578          * Check the user's access rights to the page.  If access should be
 579          * prevented then send the problem up to do_page_fault.
 580          */
 581         if (unlikely(access & ~pte_val(*ptep)))
 582                 goto out;
 583         /*
 584          * At this point, we have a pte (old_pte) which can be used to build
 585          * or update an HPTE. There are 2 cases:
 586          *
 587          * 1. There is a valid (present) pte with no associated HPTE (this is
 588          *      the most common case)
 589          * 2. There is a valid (present) pte with an associated HPTE. The
 590          *      current values of the pp bits in the HPTE prevent access
 591          *      because we are doing software DIRTY bit management and the
 592          *      page is currently not DIRTY.
 593          */
 594
 595
             /*
              * Atomically set _PAGE_BUSY and _PAGE_ACCESSED in the PTE.  Give
              * up if the PTE is already busy; retry if it changed under us.
              */
 596         do {
 597                 old_pte = pte_val(*ptep);
 598                 if (old_pte & _PAGE_BUSY)
 599                         goto out;
 600                 new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
 601         } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
 602                                          old_pte, new_pte));
 603
             /* Low bit of rflags is set when the PTE lacks _PAGE_RW. */
 604         rflags = 0x2 | (!(new_pte & _PAGE_RW));
 605         /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
 606         rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
 607         shift = mmu_psize_to_shift(mmu_psize);
 608         sz = ((1UL) << shift);
 609         if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
 610                 /* No CPU has hugepages but lacks no execute, so we
 611                  * don't need to worry about that case */
 612                 rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
 613                                                        trap, sz);
 614
 615         /* Check if pte already has an hpte (case 2) */
 616         if (unlikely(old_pte & _PAGE_HASHPTE)) {
 617                 /* There MIGHT be an HPTE for this pte */
                     /* NOTE: this slot shadows the function-scope "long slot". */
 618                 unsigned long hash, slot;
 619
 620                 hash = hpt_hash(va, shift, ssize);
 621                 if (old_pte & _PAGE_F_SECOND)
 622                         hash = ~hash;
 623                 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 624                 slot += (old_pte & _PAGE_F_GIX) >> 12;
 625
                     /*
                      * When the update reports -1, drop the HPTE flags from
                      * old_pte so that the insert path below runs instead.
                      */
 626                 if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
 627                                          ssize, local) == -1)
 628                         old_pte &= ~_PAGE_HPTEFLAGS;
 629         }
 630
 631         if (likely(!(old_pte & _PAGE_HASHPTE))) {
 632                 unsigned long hash = hpt_hash(va, shift, ssize);
 633                 unsigned long hpte_group;
 634
 635                 pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
 636
 637 repeat:
 638                 hpte_group = ((hash & htab_hash_mask) *
 639                               HPTES_PER_GROUP) & ~0x7UL;
 640
 641                 /* clear HPTE slot informations in new PTE */
 642 #ifdef CONFIG_PPC_64K_PAGES
 643                 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
 644 #else
 645                 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
 646 #endif
 647                 /* Add in WIMG bits */
 648                 rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
 649                                       _PAGE_COHERENT | _PAGE_GUARDED));
 650
 651                 /* Insert into the hash table, primary slot */
 652                 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
 653                                           mmu_psize, ssize);
 654
 655                 /* Primary is full, try the secondary */
 656                 if (unlikely(slot == -1)) {
 657                         hpte_group = ((~hash & htab_hash_mask) *
 658                                       HPTES_PER_GROUP) & ~0x7UL;
 659                         slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
 660                                                   HPTE_V_SECONDARY,
 661                                                   mmu_psize, ssize);
 662                         if (slot == -1) {
                                     /*
                                      * Both groups are full.  Remove an entry
                                      * from the primary group when the low bit
                                      * of mftb() is set, otherwise from the
                                      * secondary group still held in
                                      * hpte_group, then try again.
                                      */
 663                                 if (mftb() & 0x1)
 664                                         hpte_group = ((hash & htab_hash_mask) *
 665                                                       HPTES_PER_GROUP)&~0x7UL;
 666
 667                                 ppc_md.hpte_remove(hpte_group);
 668                                 goto repeat;
 669                         }
 670                 }
 671
 672                 if (unlikely(slot == -2))
 673                         panic("hash_huge_page: pte_insert failed\n");
 674
                     /* Record where the HPTE was placed in the new PTE. */
 675                 new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
 676         }
 677
 678         /*
 679          * No need to use ldarx/stdcx here
 680          */
             /* Write the PTE back with _PAGE_BUSY cleared. */
 681         *ptep = __pte(new_pte & ~_PAGE_BUSY);
 682
 683         err = 0;
 684
 685  out:
 686         return err;
 687 }
 688
 689 static void __init set_huge_psize(int psize)
 690 {
 691         /* Check that it is a page size supported by the hardware and
 692          * that it fits within pagetable limits. */
 693         if (mmu_psize_defs[psize].shift &&
 694                 mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
 695                 (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
 696                  mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
 697                  mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
 698                 /* Return if huge page size has already been setup or is the
 699                  * same as the base page size. */
 700                 if (mmu_huge_psizes[psize] ||
 701                    mmu_psize_defs[psize].shift == PAGE_SHIFT)
 702                         return;
 703                 hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
 704
 705                 switch (mmu_psize_defs[psize].shift) {
 706                 case PAGE_SHIFT_64K:
 707                     /* We only allow 64k hpages with 4k base page,
 708                      * which was checked above, and always put them
 709                      * at the PMD */
 710                     hugepte_shift[psize] = PMD_SHIFT;
 711                     break;
 712                 case PAGE_SHIFT_16M:
 713                     /* 16M pages can be at two different levels
 714                      * of pagestables based on base page size */
 715                     if (PAGE_SHIFT == PAGE_SHIFT_64K)
 716                             hugepte_shift[psize] = PMD_SHIFT;
 717                     else /* 4k base page */
 718                             hugepte_shift[psize] = PUD_SHIFT;
 719                     break;
 720                 case PAGE_SHIFT_16G:
 721                     /* 16G pages are always at PGD level */
 722                     hugepte_shift[psize] = PGDIR_SHIFT;
 723                     break;
 724                 }
 725                 hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
 726         } else
 727                 hugepte_shift[psize] = 0;
 728 }
 729
 730 static int __init hugepage_setup_sz(char *str)
 731 {
 732         unsigned long long size;
 733         int mmu_psize;
 734         int shift;
 735
 736         size = memparse(str, &str);
 737
 738         shift = __ffs(size);
 739         mmu_psize = shift_to_mmu_psize(shift);
 740         if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
 741                 set_huge_psize(mmu_psize);
 742         else
 743                 printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
 744
 745         return 1;
 746 }
 747 __setup("hugepagesz=", hugepage_setup_sz);
 748
 749 static int __init hugetlbpage_init(void)
 750 {
 751         unsigned int psize;
 752
 753         if (!cpu_has_feature(CPU_FTR_16M_PAGE))
 754                 return -ENODEV;
 755
 756         /* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
 757          * and adjust PTE_NONCACHE_NUM if the number of supported huge page
 758          * sizes changes.
 759          */
 760         set_huge_psize(MMU_PAGE_16M);
 761         set_huge_psize(MMU_PAGE_16G);
 762
 763         /* Temporarily disable support for 64K huge pages when 64K SPU local
 764          * store support is enabled as the current implementation conflicts.
 765          */
 766 #ifndef CONFIG_SPU_FS_64K_LS
 767         set_huge_psize(MMU_PAGE_64K);
 768 #endif
 769
 770         for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 771                 if (mmu_huge_psizes[psize]) {
 772                         pgtable_cache[HUGE_PGTABLE_INDEX(psize)] =
 773                                 kmem_cache_create(
 774                                         HUGEPTE_CACHE_NAME(psize),
 775                                         HUGEPTE_TABLE_SIZE(psize),
 776                                         HUGEPTE_TABLE_SIZE(psize),
 777                                         0,
 778                                         NULL);
 779                         if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)])
 780                                 panic("hugetlbpage_init(): could not create %s"\
 781                                       "\n", HUGEPTE_CACHE_NAME(psize));
 782                 }
 783         }
 784
 785         return 0;
 786 }
 787
 788 module_init(hugetlbpage_init);