/*
 * Copyright (C) 1995  Linus Torvalds
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation, version 2.
 *
 *   This program is distributed in the hope that it will be useful, but
 *   WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 *   NON INFRINGEMENT.  See the GNU General Public License for
 *   more details.
 */

#include <linux/module.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/poison.h>
#include <linux/bootmem.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/efi.h>
#include <linux/memory_hotplug.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/processor.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/homecache.h>
#include <hv/hypervisor.h>
#include <arch/chip.h>
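
/* Clearing a pgd entry just stores a zero (i.e. invalid) hypervisor PTE. */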
#define clear_pgd(pmdptr) (*(pmdptr) = hv_pte(0))

#ifndef __tilegx__
unsigned long VMALLOC_RESERVE = CONFIG_VMALLOC_RESERVE;
EXPORT_SYMBOL(VMALLOC_RESERVE);
#endif

/* Create an L2 page table */
static pte_t * __init alloc_pte(void)
{
	return __alloc_bootmem(L2_KERNEL_PGTABLE_SIZE, HV_PAGE_TABLE_ALIGN, 0);
}

/*
 * L2 page tables per controller.  We allocate these all at once from
 * the bootmem allocator and store them here.  This saves on kernel L2
 * page table memory, compared to allocating a full 64K page per L2
 * page table, and also means that in cases where we use huge pages,
 * we are guaranteed to later be able to shatter those huge pages and
 * switch to using these page tables instead, without requiring
 * further allocation.  Each l2_ptes[] entry points to the first page
 * table for the first hugepage-size piece of memory on the
 * controller; other page tables are just indexed directly, i.e. the
 * L2 page tables are contiguous in memory for each controller.
 */
static pte_t *l2_ptes[MAX_NUMNODES];
static int num_l2_ptes[MAX_NUMNODES];
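
/*
 * Preallocate the contiguous array of L2 page tables for one
 * controller (node).  "pages" must be a multiple of PTRS_PER_PTE so
 * the tables exactly cover the node in hugepage-size chunks.
 */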
static void init_prealloc_ptes(int node, int pages)
{
	BUG_ON(pages & (PTRS_PER_PTE - 1));
	if (pages) {
		num_l2_ptes[node] = pages;
		l2_ptes[node] = __alloc_bootmem(pages * sizeof(pte_t),
						HV_PAGE_TABLE_ALIGN, 0);
	}
}
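
/*
 * Return the preallocated PTE slot for a given pfn: mask off the high
 * (controller-select) PA bits to get the pfn's offset within its node,
 * then index that node's contiguous block of L2 page tables.
 */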
pte_t *get_prealloc_pte(unsigned long pfn)
{
	int node = pfn_to_nid(pfn);
	pfn &= ~(-1UL << (NR_PA_HIGHBIT_SHIFT - PAGE_SHIFT));
	BUG_ON(node >= MAX_NUMNODES);
	BUG_ON(pfn >= num_l2_ptes[node]);
	return &l2_ptes[node][pfn];
}

/*
 * What caching do we expect pages from the heap to have when
 * they are allocated during bootup?  (Once we've installed the
 * "real" swapper_pg_dir.)
 */
static int initial_heap_home(void)
{
#if CHIP_HAS_CBOX_HOME_MAP()
	if (hash_default)
		return PAGE_HOME_HASH;
#endif
	return smp_processor_id();
}

/*
 * Place a pointer to an L2 page table in a middle page
 * directory entry.
 */
static void __init assign_pte(pmd_t *pmd, pte_t *page_table)
{
	phys_addr_t pa = __pa(page_table);
	unsigned long l2_ptfn = pa >> HV_LOG2_PAGE_TABLE_ALIGN;
	pte_t pteval = hv_pte_set_ptfn(__pgprot(_PAGE_TABLE), l2_ptfn);
	BUG_ON((pa & (HV_PAGE_TABLE_ALIGN-1)) != 0);
	pteval = pte_set_home(pteval, initial_heap_home());
	*(pte_t *)pmd = pteval;
	if (page_table != (pte_t *)pmd_page_vaddr(*pmd))
		BUG();
}

#ifdef __tilegx__

static inline pmd_t *alloc_pmd(void)
{
	return __alloc_bootmem(L1_KERNEL_PGTABLE_SIZE, HV_PAGE_TABLE_ALIGN, 0);
}

static inline void assign_pmd(pud_t *pud, pmd_t *pmd)
{
	assign_pte((pmd_t *)pud, (pte_t *)pmd);
}

#endif /* __tilegx__ */

/* Replace the given pmd with a full PTE table. */
void __init shatter_pmd(pmd_t *pmd)
{
	pte_t *pte = get_prealloc_pte(pte_pfn(*(pte_t *)pmd));
	assign_pte(pmd, pte);
}

#ifdef CONFIG_HIGHMEM
/*
 * This function initializes a certain range of kernel virtual memory
 * with new bootmem page tables, everywhere page tables are missing in
 * the given range.
 */

/*
 * NOTE: The pagetables are allocated contiguously in physical memory,
 * so we can cache the place of the first one and move around without
 * checking the pgd every time.
 */
static void __init page_table_range_init(unsigned long start,
					 unsigned long end, pgd_t *pgd_base)
{
	pgd_t *pgd;
	int pgd_idx;
	unsigned long vaddr;

	vaddr = start;
	pgd_idx = pgd_index(vaddr);
	pgd = pgd_base + pgd_idx;

	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
		pmd_t *pmd = pmd_offset(pud_offset(pgd, vaddr), vaddr);
		if (pmd_none(*pmd))
			assign_pte(pmd, alloc_pte());
		vaddr += PMD_SIZE;
	}
}
#endif /* CONFIG_HIGHMEM */


#if CHIP_HAS_CBOX_HOME_MAP()

static int __initdata ktext_hash = 1;  /* .text pages */
static int __initdata kdata_hash = 1;  /* .data and .bss pages */
int __write_once hash_default = 1;     /* kernel allocator pages */
EXPORT_SYMBOL(hash_default);
int __write_once kstack_hash = 1;      /* if no homecaching, use h4h */
#endif /* CHIP_HAS_CBOX_HOME_MAP */
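
/*
 * "Hash-for-home" (h4h) homes each cache line on a tile chosen by
 * hashing its physical address, rather than homing whole pages on a
 * single tile; the flags above select which kernel sections use it.
 */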

/*
 * CPUs to use for striping the pages of kernel data.  If hash-for-home
 * is available, this is only relevant if kcache_hash sets up the
 * .data and .bss to be page-homed, and we don't want the default mode
 * of using the full set of kernel cpus for the striping.
 */
static __initdata struct cpumask kdata_mask;
static __initdata int kdata_arg_seen;

int __write_once kdata_huge;       /* if no homecaching, small pages */

/* Combine a generic pgprot_t with cache home to get a cache-aware pgprot. */
static pgprot_t __init construct_pgprot(pgprot_t prot, int home)
{
	prot = pte_set_home(prot, home);
#if CHIP_HAS_CBOX_HOME_MAP()
	if (home == PAGE_HOME_IMMUTABLE) {
		if (ktext_hash)
			prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_HASH_L3);
		else
			prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_NO_L3);
	}
#endif
	return prot;
}

/*
 * For a given kernel data VA, how should it be cached?
 * We return the complete pgprot_t with caching bits set.
 */
static pgprot_t __init init_pgprot(ulong address)
{
	int cpu;
	unsigned long page;
	enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };

#if CHIP_HAS_CBOX_HOME_MAP()
	/* For kdata=huge, everything is just hash-for-home. */
	if (kdata_huge)
		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
#endif

	/* We map the aliased pages of permanent text inaccessible. */
	if (address < (ulong) _sinittext - CODE_DELTA)
		return PAGE_NONE;

	/*
	 * We map read-only data non-coherent for performance.  We could
	 * use neighborhood caching on TILE64, but it's not clear it's a win.
	 */
	if ((address >= (ulong) __start_rodata &&
	     address < (ulong) __end_rodata) ||
	    address == (ulong) empty_zero_page) {
		return construct_pgprot(PAGE_KERNEL_RO, PAGE_HOME_IMMUTABLE);
	}

#ifndef __tilegx__
#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
	/* Force the atomic_locks[] array page to be hash-for-home. */
	if (address == (ulong) atomic_locks)
		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
#endif
#endif

	/*
	 * Everything else that isn't data or bss is heap, so mark it
	 * with the initial heap home (hash-for-home, or this cpu).  This
	 * includes any addresses after the loaded image and any address before
	 * _einitdata, since we already captured the case of text before
	 * _sinittext, and __pa(einittext) is approximately __pa(sinitdata).
	 *
	 * All the LOWMEM pages that we mark this way will get their
	 * struct page homecache properly marked later, in set_page_homes().
	 * The HIGHMEM pages we leave with a default zero for their
	 * homes, but with a zero free_time we don't have to actually
	 * do a flush action the first time we use them, either.
	 */
	if (address >= (ulong) _end || address < (ulong) _einitdata)
		return construct_pgprot(PAGE_KERNEL, initial_heap_home());

#if CHIP_HAS_CBOX_HOME_MAP()
	/* Use hash-for-home if requested for data/bss. */
	if (kdata_hash)
		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
#endif

	/*
	 * Make the w1data homed like heap to start with, to avoid
	 * making it part of the page-striped data area when we're just
	 * going to convert it to read-only soon anyway.
	 */
	if (address >= (ulong)__w1data_begin && address < (ulong)__w1data_end)
		return construct_pgprot(PAGE_KERNEL, initial_heap_home());

	/*
	 * Otherwise we just hand out consecutive cpus.  To avoid
	 * requiring this function to hold state, we just walk forward from
	 * _sdata by PAGE_SIZE, skipping the readonly and init data, to reach
	 * the requested address, while walking cpu home around kdata_mask.
	 * This is typically no more than a dozen or so iterations.
	 */
	page = (((ulong)__w1data_end) + PAGE_SIZE - 1) & PAGE_MASK;
	BUG_ON(address < page || address >= (ulong)_end);
	cpu = cpumask_first(&kdata_mask);
	for (; page < address; page += PAGE_SIZE) {
		if (page >= (ulong)&init_thread_union &&
		    page < (ulong)&init_thread_union + THREAD_SIZE)
			continue;
		if (page == (ulong)empty_zero_page)
			continue;
#ifndef __tilegx__
#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
		if (page == (ulong)atomic_locks)
			continue;
#endif
#endif
		cpu = cpumask_next(cpu, &kdata_mask);
		if (cpu == NR_CPUS)
			cpu = cpumask_first(&kdata_mask);
	}

	return construct_pgprot(PAGE_KERNEL, cpu);
}

/*
 * This function sets up how we cache the kernel text.  If we have
 * hash-for-home support, normally that is used instead (see the
 * kcache_hash boot flag for more information).  But if we end up
 * using a page-based caching technique, this option sets up the
 * details of that.  In addition, the "ktext=nocache" option may
 * always be used to disable local caching of text pages, if desired.
 */

static int __initdata ktext_arg_seen;
static int __initdata ktext_small;
static int __initdata ktext_local;
static int __initdata ktext_all;
static int __initdata ktext_nondataplane;
static int __initdata ktext_nocache;
static struct cpumask __initdata ktext_mask;
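
/*
 * Parse "ktext=": an optional leading "nocache" (alone, or followed
 * by a comma and another value), then one of "huge", "local", "all",
 * or a cpu list giving the caching neighborhood.
 */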
static int __init setup_ktext(char *str)
{
	if (str == NULL)
		return -EINVAL;

	/* If you have a leading "nocache", turn off ktext caching */
	if (strncmp(str, "nocache", 7) == 0) {
		ktext_nocache = 1;
		pr_info("ktext: disabling local caching of kernel text\n");
		str += 7;
		if (*str == ',')
			++str;
		if (*str == '\0')
			return 0;
	}

	ktext_arg_seen = 1;

	/* Default setting on Tile64: use a huge page */
	if (strcmp(str, "huge") == 0)
		pr_info("ktext: using one huge locally cached page\n");

	/* Pay TLB cost but get no cache benefit: cache small pages locally */
	else if (strcmp(str, "local") == 0) {
		ktext_small = 1;
		ktext_local = 1;
		pr_info("ktext: using small pages with local caching\n");
	}

	/* Neighborhood cache ktext pages on all cpus. */
	else if (strcmp(str, "all") == 0) {
		ktext_small = 1;
		ktext_all = 1;
		pr_info("ktext: using maximal caching neighborhood\n");
	}

	/* Neighborhood ktext pages on specified mask */
	else if (cpulist_parse(str, &ktext_mask) == 0) {
		char buf[NR_CPUS * 5];
		cpulist_scnprintf(buf, sizeof(buf), &ktext_mask);
		if (cpumask_weight(&ktext_mask) > 1) {
			ktext_small = 1;
			pr_info("ktext: using caching neighborhood %s "
				"with small pages\n", buf);
		} else {
			pr_info("ktext: caching on cpu %s with one huge page\n",
				buf);
		}
	}

	else
		return -EINVAL;

	return 0;
}

early_param("ktext", setup_ktext);

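/*
 * Unless "ktext=nocache" was given, mark remotely-cached kernel text
 * non-coherent: text is read-only, so it can safely be cached without
 * coherence traffic.  With "nocache", chips that have separate NC and
 * no-alloc bits suppress L2 allocation for text lines instead.
 */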
static inline pgprot_t ktext_set_nocache(pgprot_t prot)
{
	if (!ktext_nocache)
		prot = hv_pte_set_nc(prot);
#if CHIP_HAS_NC_AND_NOALLOC_BITS()
	else
		prot = hv_pte_set_no_alloc_l2(prot);
#endif
	return prot;
}

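/*
 * Look up the pmd slot for a VA in the staging page table.  On
 * tilepro the middle level is folded into the pgd; on tilegx we
 * allocate a middle-level page table on demand.
 */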
#ifndef __tilegx__
static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
{
	return pmd_offset(pud_offset(&pgtables[pgd_index(va)], va), va);
}
#else
static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
{
	pud_t *pud = pud_offset(&pgtables[pgd_index(va)], va);
	if (pud_none(*pud))
		assign_pmd(pud, alloc_pmd());
	return pmd_offset(pud, va);
}
#endif

/* Temporary page table we use for staging. */
static pgd_t pgtables[PTRS_PER_PGD]
 __attribute__((aligned(HV_PAGE_TABLE_ALIGN)));

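/*
 * kernel_physical_mapping_init() below builds the new kernel page
 * table in this staging array, installs it, and only then copies it
 * back into swapper_pg_dir, once the new caching scheme is in effect.
 */
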
/*
 * This maps the physical memory to kernel virtual address space, a total
 * of max_low_pfn pages, by creating page tables starting from address
 * PAGE_OFFSET.
 *
 * This routine transitions us from using a set of compiled-in large
 * pages to using some more precise caching, including removing access
 * to code pages mapped at PAGE_OFFSET (executed only at MEM_SV_INTRPT),
 * marking read-only data as locally cacheable, striping the remaining
 * .data and .bss across all the available tiles, and removing access
 * to pages above the top of RAM (thus ensuring a page fault from a bad
 * virtual address rather than a hypervisor shoot down for accessing
 * memory outside the assigned limits).
 */
static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
{
	unsigned long long irqmask;
	unsigned long address, pfn;
	pmd_t *pmd;
	pte_t *pte;
	int pte_ofs;
	const struct cpumask *my_cpu_mask = cpumask_of(smp_processor_id());
	struct cpumask kstripe_mask;
	int rc, i;

#if CHIP_HAS_CBOX_HOME_MAP()
	if (ktext_arg_seen && ktext_hash) {
		pr_warning("warning: \"ktext\" boot argument ignored"
			   " if \"kcache_hash\" sets up text hash-for-home\n");
		ktext_small = 0;
	}

	if (kdata_arg_seen && kdata_hash) {
		pr_warning("warning: \"kdata\" boot argument ignored"
			   " if \"kcache_hash\" sets up data hash-for-home\n");
	}

	if (kdata_huge && !hash_default) {
		pr_warning("warning: disabling \"kdata=huge\"; requires"
			   " kcache_hash=all or =allbutstack\n");
		kdata_huge = 0;
	}
#endif

	/*
	 * Set up a mask for cpus to use for kernel striping.
	 * This is normally all cpus, but minus dataplane cpus if any.
	 * If the dataplane covers the whole chip, we stripe over
	 * the whole chip too.
	 */
	cpumask_copy(&kstripe_mask, cpu_possible_mask);
	if (!kdata_arg_seen)
		kdata_mask = kstripe_mask;

	/* Allocate and fill in L2 page tables */
	for (i = 0; i < MAX_NUMNODES; ++i) {
#ifdef CONFIG_HIGHMEM
		unsigned long end_pfn = node_lowmem_end_pfn[i];
#else
		unsigned long end_pfn = node_end_pfn[i];
#endif
		unsigned long end_huge_pfn = 0;

		/* Pre-shatter the last huge page to allow per-cpu pages. */
		if (kdata_huge)
			end_huge_pfn = end_pfn - (HPAGE_SIZE >> PAGE_SHIFT);

		pfn = node_start_pfn[i];

		/* Allocate enough memory to hold L2 page tables for node. */
		init_prealloc_ptes(i, end_pfn - pfn);

		address = (unsigned long) pfn_to_kaddr(pfn);
		while (pfn < end_pfn) {
			BUG_ON(address & (HPAGE_SIZE-1));
			pmd = get_pmd(pgtables, address);
			pte = get_prealloc_pte(pfn);
			if (pfn < end_huge_pfn) {
				pgprot_t prot = init_pgprot(address);
				*(pte_t *)pmd = pte_mkhuge(pfn_pte(pfn, prot));
				for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE;
				     pfn++, pte_ofs++, address += PAGE_SIZE)
					pte[pte_ofs] = pfn_pte(pfn, prot);
			} else {
				if (kdata_huge)
					printk(KERN_DEBUG "pre-shattered huge"
					       " page at %#lx\n", address);
				for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE;
				     pfn++, pte_ofs++, address += PAGE_SIZE) {
					pgprot_t prot = init_pgprot(address);
					pte[pte_ofs] = pfn_pte(pfn, prot);
				}
				assign_pte(pmd, pte);
			}
		}
	}

	/*
	 * Set or check ktext_map now that we have cpu_possible_mask
	 * and kstripe_mask to work with.
	 */
	if (ktext_all)
		cpumask_copy(&ktext_mask, cpu_possible_mask);
	else if (ktext_nondataplane)
		ktext_mask = kstripe_mask;
	else if (!cpumask_empty(&ktext_mask)) {
		/* Sanity-check any mask that was requested */
		struct cpumask bad;
		cpumask_andnot(&bad, &ktext_mask, cpu_possible_mask);
		cpumask_and(&ktext_mask, &ktext_mask, cpu_possible_mask);
		if (!cpumask_empty(&bad)) {
			char buf[NR_CPUS * 5];
			cpulist_scnprintf(buf, sizeof(buf), &bad);
			pr_info("ktext: not using unavailable cpus %s\n", buf);
		}
		if (cpumask_empty(&ktext_mask)) {
			pr_warning("ktext: no valid cpus; caching on %d.\n",
				   smp_processor_id());
			cpumask_copy(&ktext_mask,
				     cpumask_of(smp_processor_id()));
		}
	}

	address = MEM_SV_INTRPT;
	pmd = get_pmd(pgtables, address);
	pfn = 0;  /* code starts at PA 0 */
	if (ktext_small) {
		/* Allocate an L2 PTE for the kernel text */
		int cpu = 0;
		pgprot_t prot = construct_pgprot(PAGE_KERNEL_EXEC,
						 PAGE_HOME_IMMUTABLE);

		if (ktext_local) {
			if (ktext_nocache)
				prot = hv_pte_set_mode(prot,
						       HV_PTE_MODE_UNCACHED);
			else
				prot = hv_pte_set_mode(prot,
						       HV_PTE_MODE_CACHE_NO_L3);
		} else {
			prot = hv_pte_set_mode(prot,
					       HV_PTE_MODE_CACHE_TILE_L3);
			cpu = cpumask_first(&ktext_mask);

			prot = ktext_set_nocache(prot);
		}

		BUG_ON(address != (unsigned long)_stext);
		pte = NULL;
		for (; address < (unsigned long)_einittext;
		     pfn++, address += PAGE_SIZE) {
			pte_ofs = pte_index(address);
			if (pte_ofs == 0) {
				if (pte)
					assign_pte(pmd++, pte);
				pte = alloc_pte();
			}
			if (!ktext_local) {
				prot = set_remote_cache_cpu(prot, cpu);
				cpu = cpumask_next(cpu, &ktext_mask);
				if (cpu == NR_CPUS)
					cpu = cpumask_first(&ktext_mask);
			}
			pte[pte_ofs] = pfn_pte(pfn, prot);
		}
		if (pte)
			assign_pte(pmd, pte);
	} else {
		pte_t pteval = pfn_pte(0, PAGE_KERNEL_EXEC);
		pteval = pte_mkhuge(pteval);
#if CHIP_HAS_CBOX_HOME_MAP()
		if (ktext_hash) {
			pteval = hv_pte_set_mode(pteval,
						 HV_PTE_MODE_CACHE_HASH_L3);
			pteval = ktext_set_nocache(pteval);
		} else
#endif /* CHIP_HAS_CBOX_HOME_MAP() */
		if (cpumask_weight(&ktext_mask) == 1) {
			pteval = set_remote_cache_cpu(pteval,
					cpumask_first(&ktext_mask));
			pteval = hv_pte_set_mode(pteval,
						 HV_PTE_MODE_CACHE_TILE_L3);
			pteval = ktext_set_nocache(pteval);
		} else if (ktext_nocache)
			pteval = hv_pte_set_mode(pteval,
						 HV_PTE_MODE_UNCACHED);
		else
			pteval = hv_pte_set_mode(pteval,
						 HV_PTE_MODE_CACHE_NO_L3);
		for (; address < (unsigned long)_einittext;
		     pfn += PFN_DOWN(HPAGE_SIZE), address += HPAGE_SIZE)
			*(pte_t *)(pmd++) = pfn_pte(pfn, pteval);
	}

	/* Set swapper_pgprot here so it is flushed to memory right away. */
	swapper_pgprot = init_pgprot((unsigned long)swapper_pg_dir);

	/*
	 * Since we may be changing the caching of the stack and page
	 * table itself, we invoke an assembly helper to do the
	 * following steps:
	 *
	 *  - flush the cache so we start with an empty slate
	 *  - install pgtables[] as the real page table
	 *  - flush the TLB so the new page table takes effect
	 */
	irqmask = interrupt_mask_save_mask();
	interrupt_mask_set_mask(-1ULL);
	rc = flush_and_install_context(__pa(pgtables),
				       init_pgprot((unsigned long)pgtables),
				       __get_cpu_var(current_asid),
				       cpumask_bits(my_cpu_mask));
	interrupt_mask_restore_mask(irqmask);
	BUG_ON(rc != 0);

	/* Copy the page table back to the normal swapper_pg_dir. */
	memcpy(pgd_base, pgtables, sizeof(pgtables));
	__install_page_table(pgd_base, __get_cpu_var(current_asid),
			     swapper_pgprot);

	/*
	 * We just read swapper_pgprot and thus brought it into the cache,
	 * with its new home & caching mode.  When we start the other CPUs,
	 * they're going to reference swapper_pgprot via their initial fake
	 * VA-is-PA mappings, which cache everything locally.  At that
	 * time, if it's in our cache with a conflicting home, the
	 * simulator's coherence checker will complain.  So, flush it out
	 * of our cache; we're not going to ever use it again anyway.
	 */
	__insn_finv(&swapper_pgprot);
}

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid.  The argument is a physical page number.
 *
 * On Tile, the only valid things for which we can just hand out unchecked
 * PTEs are the kernel code and data.  Anything else might change its
 * homing with time, and we wouldn't know to adjust the /dev/mem PTEs.
 * Note that init_thread_union is released to heap soon after boot,
 * so we include it in the init data.
 *
 * For TILE-Gx, we might want to consider allowing access to PA
 * regions corresponding to PCI space, etc.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	return pagenr < kaddr_to_pfn(_end) &&
		!(pagenr >= kaddr_to_pfn(&init_thread_union) ||
		  pagenr < kaddr_to_pfn(_einitdata)) &&
		!(pagenr >= kaddr_to_pfn(_sinittext) ||
		  pagenr <= kaddr_to_pfn(_einittext-1));
}

#ifdef CONFIG_HIGHMEM
static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	unsigned long vaddr;

	vaddr = PKMAP_BASE;
	page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);

	pgd = swapper_pg_dir + pgd_index(vaddr);
	pud = pud_offset(pgd, vaddr);
	pmd = pmd_offset(pud, vaddr);
	pte = pte_offset_kernel(pmd, vaddr);
	pkmap_page_table = pte;
}
#endif /* CONFIG_HIGHMEM */

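/*
 * Hand ranges of pages that were never managed by the bootmem
 * allocator (highmem, and lowmem beyond the boot allocator's reach)
 * directly back to the buddy allocator.
 */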
#ifndef CONFIG_64BIT
static void __init init_free_pfn_range(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page = pfn_to_page(start);

	for (pfn = start; pfn < end; ) {
		/* Optimize by freeing pages in large batches */
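		/*
		 * __ffs(pfn) is the log2 of the largest naturally aligned
		 * block starting at pfn, i.e. the biggest chunk we can pass
		 * to __free_pages() in one call (clamped to MAX_ORDER and
		 * to the end of the range below).
		 */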
		int order = __ffs(pfn);
		int count, i;
		struct page *p;

		if (order >= MAX_ORDER)
			order = MAX_ORDER-1;
		count = 1 << order;
		while (pfn + count > end) {
			count >>= 1;
			--order;
		}
		for (p = page, i = 0; i < count; ++i, ++p) {
			__ClearPageReserved(p);
			/*
			 * Hacky direct set to avoid unnecessary
			 * lock take/release for EVERY page here.
			 */
			p->_count.counter = 0;
			p->_mapcount.counter = -1;
		}
		init_page_count(page);
		__free_pages(page, order);
		totalram_pages += count;

		page += count;
		pfn += count;
	}
}

static void __init set_non_bootmem_pages_init(void)
{
	struct zone *z;
	for_each_zone(z) {
		unsigned long start, end;
		int nid = z->zone_pgdat->node_id;
		int idx = zone_idx(z);

		start = z->zone_start_pfn;
		if (start == 0)
			continue;  /* bootmem */
		end = start + z->spanned_pages;
		if (idx == ZONE_NORMAL) {
			BUG_ON(start != node_start_pfn[nid]);
			start = node_free_pfn[nid];
		}
#ifdef CONFIG_HIGHMEM
		if (idx == ZONE_HIGHMEM)
			totalhigh_pages += z->spanned_pages;
#endif
		if (kdata_huge) {
			unsigned long percpu_pfn = node_percpu_pfn[nid];
			if (start < percpu_pfn && end > percpu_pfn)
				end = percpu_pfn;
		}
#ifdef CONFIG_PCI
		if (start <= pci_reserve_start_pfn &&
		    end > pci_reserve_start_pfn) {
			if (end > pci_reserve_end_pfn)
				init_free_pfn_range(pci_reserve_end_pfn, end);
			end = pci_reserve_start_pfn;
		}
#endif
		init_free_pfn_range(start, end);
	}
}
#endif

/*
 * paging_init() sets up the page tables - note that all of lowmem is
 * already mapped by head.S.
 */
void __init paging_init(void)
{
#ifdef CONFIG_HIGHMEM
	unsigned long vaddr, end;
#endif
#ifdef __tilegx__
	pud_t *pud;
#endif
	pgd_t *pgd_base = swapper_pg_dir;

	kernel_physical_mapping_init(pgd_base);

#ifdef CONFIG_HIGHMEM
	/*
	 * Fixed mappings, only the page table structure has to be
	 * created - mappings will be set by set_fixmap():
	 */
	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
	page_table_range_init(vaddr, end, pgd_base);
	permanent_kmaps_init(pgd_base);
#endif

#ifdef __tilegx__
	/*
	 * Since GX allocates just one pmd_t array worth of vmalloc space,
	 * we go ahead and allocate it statically here, then share it
	 * globally.  As a result we don't have to worry about any task
	 * changing init_mm once we get up and running, and there's no
	 * need for e.g. vmalloc_sync_all().
	 */
	BUILD_BUG_ON(pgd_index(VMALLOC_START) != pgd_index(VMALLOC_END - 1));
	pud = pud_offset(pgd_base + pgd_index(VMALLOC_START), VMALLOC_START);
	assign_pmd(pud, alloc_pmd());
#endif
}

/*
 * Walk the kernel page tables and derive the page_home() from
 * the PTEs, so that set_pte() can properly validate the caching
 * of all PTEs it sees.
 */
void __init set_page_homes(void)
{
}

static void __init set_max_mapnr_init(void)
{
#ifdef CONFIG_FLATMEM
	max_mapnr = max_low_pfn;
#endif
}

void __init mem_init(void)
{
	int codesize, datasize, initsize;
	int i;
#ifndef __tilegx__
	unsigned long last;
#endif

#ifdef CONFIG_FLATMEM
	BUG_ON(!mem_map);
#endif

#ifdef CONFIG_HIGHMEM
	/* check that fixmap and pkmap do not overlap */
	if (PKMAP_ADDR(LAST_PKMAP-1) >= FIXADDR_START) {
		pr_err("fixmap and kmap areas overlap"
		       " - this will crash\n");
		pr_err("pkstart: %lxh pkend: %lxh fixstart %lxh\n",
		       PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP-1),
		       FIXADDR_START);
		BUG();
	}
#endif

	set_max_mapnr_init();

	/* this will put all bootmem onto the freelists */
	totalram_pages += free_all_bootmem();

#ifndef CONFIG_64BIT
	/* count all remaining LOWMEM and give all HIGHMEM to page allocator */
	set_non_bootmem_pages_init();
#endif

	codesize =  (unsigned long)&_etext - (unsigned long)&_text;
	datasize =  (unsigned long)&_end - (unsigned long)&_sdata;
	initsize =  (unsigned long)&_einittext - (unsigned long)&_sinittext;
	initsize += (unsigned long)&_einitdata - (unsigned long)&_sinitdata;

	pr_info("Memory: %luk/%luk available (%dk kernel code, %dk data, %dk init, %ldk highmem)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		num_physpages << (PAGE_SHIFT-10),
		codesize >> 10,
		datasize >> 10,
		initsize >> 10,
		(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)));

	/*
	 * In debug mode, dump some interesting memory mappings.
	 */
#ifdef CONFIG_HIGHMEM
	printk(KERN_DEBUG "  KMAP    %#lx - %#lx\n",
	       FIXADDR_START, FIXADDR_TOP + PAGE_SIZE - 1);
	printk(KERN_DEBUG "  PKMAP   %#lx - %#lx\n",
	       PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP) - 1);
#endif
#ifdef CONFIG_HUGEVMAP
	printk(KERN_DEBUG "  HUGEMAP %#lx - %#lx\n",
	       HUGE_VMAP_BASE, HUGE_VMAP_END - 1);
#endif
	printk(KERN_DEBUG "  VMALLOC %#lx - %#lx\n",
	       _VMALLOC_START, _VMALLOC_END - 1);
#ifdef __tilegx__
	for (i = MAX_NUMNODES-1; i >= 0; --i) {
		struct pglist_data *node = &node_data[i];
		if (node->node_present_pages) {
			unsigned long start = (unsigned long)
				pfn_to_kaddr(node->node_start_pfn);
			unsigned long end = start +
				(node->node_present_pages << PAGE_SHIFT);
			printk(KERN_DEBUG "  MEM%d    %#lx - %#lx\n",
			       i, start, end - 1);
		}
	}
#else
	last = (unsigned long)high_memory;
	for (i = MAX_NUMNODES-1; i >= 0; --i) {
		if ((unsigned long)vbase_map[i] != -1UL) {
			printk(KERN_DEBUG "  LOWMEM%d %#lx - %#lx\n",
			       i, (unsigned long) (vbase_map[i]),
			       (unsigned long) (last-1));
			last = (unsigned long)vbase_map[i];
		}
	}
#endif

#ifndef __tilegx__
	/*
	 * Convert from using one lock for all atomic operations to
	 * one per cpu.
	 */
	__init_atomic_per_cpu();
#endif
}

/*
 * This is for the non-NUMA, single node SMP system case.
 * Specifically, in the case of x86, we will always add
 * memory to the highmem for now.
 */
#ifndef CONFIG_NEED_MULTIPLE_NODES
int arch_add_memory(u64 start, u64 size)
{
	struct pglist_data *pgdata = &contig_page_data;
	struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	return __add_pages(zone, start_pfn, nr_pages);
}

int remove_memory(u64 start, u64 size)
{
	return -EINVAL;
}
#endif

struct kmem_cache *pgd_cache;

void __init pgtable_cache_init(void)
{
	pgd_cache = kmem_cache_create("pgd", SIZEOF_PGD, SIZEOF_PGD, 0, NULL);
	if (!pgd_cache)
		panic("pgtable_cache_init(): Cannot create pgd cache");
}

#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
/*
 * The __w1data area holds data that is only written during initialization,
 * and is read-only and thus freely cacheable thereafter.  Fix the page
 * table entries that cover that region accordingly.
 */
static void mark_w1data_ro(void)
{
	/* Loop over page table entries */
	unsigned long addr = (unsigned long)__w1data_begin;
	BUG_ON((addr & (PAGE_SIZE-1)) != 0);
	for (; addr <= (unsigned long)__w1data_end - 1; addr += PAGE_SIZE) {
		unsigned long pfn = kaddr_to_pfn((void *)addr);
		pte_t *ptep = virt_to_pte(NULL, addr);
		BUG_ON(pte_huge(*ptep));	/* not relevant for kdata_huge */
		set_pte_at(&init_mm, addr, ptep, pfn_pte(pfn, PAGE_KERNEL_RO));
	}
}
#endif

#ifdef CONFIG_DEBUG_PAGEALLOC
static long __write_once initfree;
#else
static long __write_once initfree = 1;
#endif

/* Select whether to free (1) or mark unusable (0) the __init pages. */
static int __init set_initfree(char *str)
{
	long val;
	if (strict_strtol(str, 0, &val) == 0) {
		initfree = val;
		pr_info("initfree: %s free init pages\n",
			initfree ? "will" : "won't");
	}
	return 1;
}
__setup("initfree=", set_initfree);

static void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr = (unsigned long) begin;

	if (kdata_huge && !initfree) {
		pr_warning("Warning: ignoring initfree=0:"
			   " incompatible with kdata=huge\n");
		initfree = 1;
	}
	end = (end + PAGE_SIZE - 1) & PAGE_MASK;
	local_flush_tlb_pages(NULL, begin, PAGE_SIZE, end - begin);
	for (addr = begin; addr < end; addr += PAGE_SIZE) {
		/*
		 * Note we just reset the home here directly in the
		 * page table.  We know this is safe because our caller
		 * just flushed the caches on all the other cpus,
		 * and they won't be touching any of these pages.
		 */
		int pfn = kaddr_to_pfn((void *)addr);
		struct page *page = pfn_to_page(pfn);
		pte_t *ptep = virt_to_pte(NULL, addr);
		if (!initfree) {
			/*
			 * If debugging page accesses then do not free
			 * this memory but mark them not present - any
			 * buggy init-section access will create a
			 * kernel page fault:
			 */
			pte_clear(&init_mm, addr, ptep);
			continue;
		}
		__ClearPageReserved(page);
		init_page_count(page);
		if (pte_huge(*ptep))
			BUG_ON(!kdata_huge);
		else
			set_pte_at(&init_mm, addr, ptep,
				   pfn_pte(pfn, PAGE_KERNEL));
		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
	pr_info("Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
}

void free_initmem(void)
{
	const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET;

	/*
	 * Evict the dirty initdata on the boot cpu, evict the w1data
	 * wherever it's homed, and evict all the init code everywhere.
	 * We are guaranteed that no one will touch the init pages any
	 * more, and although other cpus may be touching the w1data,
	 * we only actually change the caching on tile64, which won't
	 * be keeping local copies in the other tiles' caches anyway.
	 */
	homecache_evict(&cpu_cacheable_map);

	/* Free the data pages that we won't use again after init. */
	free_init_pages("unused kernel data",
			(unsigned long)_sinitdata,
			(unsigned long)_einitdata);

	/*
	 * Free the pages mapped from 0xc0000000 that correspond to code
	 * pages from MEM_SV_INTRPT that we won't use again after init.
	 */
	free_init_pages("unused kernel text",
			(unsigned long)_sinittext - text_delta,
			(unsigned long)_einittext - text_delta);

#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
	/*
	 * Upgrade the .w1data section to globally cached.
	 * We don't do this on tilepro, since the cache architecture
	 * pretty much makes it irrelevant, and in any case we end
	 * up having racing issues with other tiles that may touch
	 * the data after we flush the cache but before we update
	 * the PTEs and flush the TLBs, causing sharer shootdowns
	 * later.  Even though this is to clean data, it seems like
	 * an unnecessary complication.
	 */
	mark_w1data_ro();
#endif

	/* Do a global TLB flush so everyone sees the changes. */
	flush_tlb_all();
}