/*
 * arch/x86/mm/pageattr.c
 *
 * Copyright 2002 Andi Kleen, SuSE Labs.
 * Thanks to Ben LaHaise for precious feedback.
 */
#include <linux/highmem.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>

#include <asm/e820.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
#include <asm/pat.h>

/*
 * The current flushing context - we pass it instead of 5 arguments:
 */
struct cpa_data {
        unsigned long   vaddr;
        pgprot_t        mask_set;
        pgprot_t        mask_clr;
        int             numpages;
        int             flushtlb;
        unsigned long   pfn;
        unsigned        force_split : 1;
};
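
/*
 * Illustrative sketch (not part of the original file): roughly what
 * change_page_attr_set_clr() below does on behalf of a caller such as
 * set_memory_ro(), with cpa_data carrying the request through the chain:
 *
 *      struct cpa_data cpa = {
 *              .vaddr          = addr,                 // page aligned start
 *              .numpages       = numpages,             // pages left to process
 *              .mask_set       = __pgprot(0),          // bits to set
 *              .mask_clr       = __pgprot(_PAGE_RW),   // bits to clear
 *      };
 *      __change_page_attr_set_clr(&cpa, checkalias);
 *
 * __change_page_attr() stores in cpa->numpages how many pages one iteration
 * actually handled: 1 for a 4k PTE, more when a large page was preserved.
 */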

#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];

void update_page_count(int level, unsigned long pages)
{
        unsigned long flags;

        /* Protect against CPA */
        spin_lock_irqsave(&pgd_lock, flags);
        direct_pages_count[level] += pages;
        spin_unlock_irqrestore(&pgd_lock, flags);
}

static void split_page_count(int level)
{
        direct_pages_count[level]--;
        direct_pages_count[level - 1] += PTRS_PER_PTE;
}

int arch_report_meminfo(char *page)
{
        int n = sprintf(page, "DirectMap4k:  %8lu kB\n",
                        direct_pages_count[PG_LEVEL_4K] << 2);
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
        n += sprintf(page + n, "DirectMap2M:  %8lu kB\n",
                        direct_pages_count[PG_LEVEL_2M] << 11);
#else
        n += sprintf(page + n, "DirectMap4M:  %8lu kB\n",
                        direct_pages_count[PG_LEVEL_2M] << 12);
#endif
#ifdef CONFIG_X86_64
        if (direct_gbpages)
                n += sprintf(page + n, "DirectMap1G:  %8lu kB\n",
                        direct_pages_count[PG_LEVEL_1G] << 20);
#endif
        return n;
}
#else
static inline void split_page_count(int level) { }
#endif

#ifdef CONFIG_X86_64

static inline unsigned long highmap_start_pfn(void)
{
        return __pa(_text) >> PAGE_SHIFT;
}

static inline unsigned long highmap_end_pfn(void)
{
        return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
}

#endif

#ifdef CONFIG_DEBUG_PAGEALLOC
# define debug_pagealloc 1
#else
# define debug_pagealloc 0
#endif

static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
        return addr >= start && addr < end;
}

/*
 * Flushing functions
 */

/**
 * clflush_cache_range - flush a cache range with clflush
 * @vaddr:      virtual start address
 * @size:       number of bytes to flush
 *
 * clflush is an unordered instruction which needs fencing with mfence
 * to avoid ordering issues.
 */
void clflush_cache_range(void *vaddr, unsigned int size)
{
        void *vend = vaddr + size - 1;

        mb();

        for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
                clflush(vaddr);
        /*
         * Flush any possible final partial cacheline:
         */
        clflush(vend);

        mb();
}
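
/*
 * Usage sketch (illustrative only): after modifying a buffer that is also
 * accessed through an uncached alias, a caller could write the dirty lines
 * back with:
 *
 *      memcpy(dst, src, len);
 *      clflush_cache_range(dst, len);
 *
 * The mb() pair above orders the clflushes against surrounding accesses.
 */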

static void __cpa_flush_all(void *arg)
{
        unsigned long cache = (unsigned long)arg;

        /*
         * Flush all to work around errata in early Athlons regarding
         * large page flushing.
         */
        __flush_tlb_all();

        if (cache && boot_cpu_data.x86_model >= 4)
                wbinvd();
}

static void cpa_flush_all(unsigned long cache)
{
        BUG_ON(irqs_disabled());

        on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}

static void __cpa_flush_range(void *arg)
{
        /*
         * We could optimize that further and do individual per page
         * tlb invalidates for a low number of pages. Caveat: we must
         * flush the high aliases on 64bit as well.
         */
        __flush_tlb_all();
}

static void cpa_flush_range(unsigned long start, int numpages, int cache)
{
        unsigned int i, level;
        unsigned long addr;

        BUG_ON(irqs_disabled());
        WARN_ON(PAGE_ALIGN(start) != start);

        on_each_cpu(__cpa_flush_range, NULL, 1);

        if (!cache)
                return;

        /*
         * We only need to flush on one CPU; clflush is a MESI-coherent
         * instruction that will cause all other CPUs to flush the same
         * cachelines:
         */
        for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
                pte_t *pte = lookup_address(addr, &level);

                /*
                 * Only flush present addresses:
                 */
                if (pte && (pte_val(*pte) & _PAGE_PRESENT))
                        clflush_cache_range((void *) addr, PAGE_SIZE);
        }
}

/*
 * Certain areas of memory on x86 require very specific protection flags,
 * for example the BIOS area or kernel text. Callers don't always get this
 * right (again, ioremap() on BIOS memory is not uncommon) so this function
 * checks and fixes these known static required protection bits.
 */
static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
                                   unsigned long pfn)
{
        pgprot_t forbidden = __pgprot(0);

        /*
         * The BIOS area between 640k and 1Mb needs to be executable for
         * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
         */
        if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
                pgprot_val(forbidden) |= _PAGE_NX;

        /*
         * The kernel text needs to be executable for obvious reasons.
         * Does not cover __inittext, since that is gone later on. On
         * 64-bit we do not enforce !NX on the low mapping.
         */
        if (within(address, (unsigned long)_text, (unsigned long)_etext))
                pgprot_val(forbidden) |= _PAGE_NX;

        /*
         * The .rodata section needs to be read-only. Using the pfn
         * catches all aliases.
         */
        if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
                   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
                pgprot_val(forbidden) |= _PAGE_RW;

        prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));

        return prot;
}
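
/*
 * Example of the effect (illustrative only): a request to make a .rodata
 * page writable is silently downgraded, because _PAGE_RW ends up in
 * "forbidden" and is masked out again:
 *
 *      pgprot_t req = __pgprot(_PAGE_PRESENT | _PAGE_RW);
 *      pgprot_t eff = static_protections(req, addr, rodata_pfn);
 *      // pgprot_val(eff) == _PAGE_PRESENT, the RW request was dropped
 *
 * Likewise _PAGE_NX is stripped for the BIOS area and for kernel text.
 */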

/*
 * Lookup the page table entry for a virtual address. Return a pointer
 * to the entry and the level of the mapping.
 *
 * Note: We return pud and pmd either when the entry is marked large
 * or when the present bit is not set. Otherwise we would return a
 * pointer to a non-existent mapping.
 */
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
        pgd_t *pgd = pgd_offset_k(address);
        pud_t *pud;
        pmd_t *pmd;

        *level = PG_LEVEL_NONE;

        if (pgd_none(*pgd))
                return NULL;

        pud = pud_offset(pgd, address);
        if (pud_none(*pud))
                return NULL;

        *level = PG_LEVEL_1G;
        if (pud_large(*pud) || !pud_present(*pud))
                return (pte_t *)pud;

        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd))
                return NULL;

        *level = PG_LEVEL_2M;
        if (pmd_large(*pmd) || !pmd_present(*pmd))
                return (pte_t *)pmd;

        *level = PG_LEVEL_4K;

        return pte_offset_kernel(pmd, address);
}
EXPORT_SYMBOL_GPL(lookup_address);
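
/*
 * Usage sketch for the exported helper (illustrative only), mirroring what
 * cpa_flush_range() does above - check both the returned pointer and the
 * present bit before trusting the mapping:
 *
 *      unsigned int level;
 *      pte_t *pte = lookup_address(addr, &level);
 *
 *      if (pte && (pte_val(*pte) & _PAGE_PRESENT)) {
 *              // mapped; level is PG_LEVEL_4K, PG_LEVEL_2M or PG_LEVEL_1G
 *      }
 */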

/*
 * Set the new pmd in all the pgds we know about:
 */
static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
{
        /* change init_mm */
        set_pte_atomic(kpte, pte);
#ifdef CONFIG_X86_32
        if (!SHARED_KERNEL_PMD) {
                struct page *page;

                list_for_each_entry(page, &pgd_list, lru) {
                        pgd_t *pgd;
                        pud_t *pud;
                        pmd_t *pmd;

                        pgd = (pgd_t *)page_address(page) + pgd_index(address);
                        pud = pud_offset(pgd, address);
                        pmd = pmd_offset(pud, address);
                        set_pte_atomic((pte_t *)pmd, pte);
                }
        }
#endif
}

static int
try_preserve_large_page(pte_t *kpte, unsigned long address,
                        struct cpa_data *cpa)
{
        unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
        pte_t new_pte, old_pte, *tmp;
        pgprot_t old_prot, new_prot;
        int i, do_split = 1;
        unsigned int level;

        if (cpa->force_split)
                return 1;

        spin_lock_irqsave(&pgd_lock, flags);
        /*
         * Check for races, another CPU might have split this page
         * up already:
         */
        tmp = lookup_address(address, &level);
        if (tmp != kpte)
                goto out_unlock;

        switch (level) {
        case PG_LEVEL_2M:
                psize = PMD_PAGE_SIZE;
                pmask = PMD_PAGE_MASK;
                break;
#ifdef CONFIG_X86_64
        case PG_LEVEL_1G:
                psize = PUD_PAGE_SIZE;
                pmask = PUD_PAGE_MASK;
                break;
#endif
        default:
                do_split = -EINVAL;
                goto out_unlock;
        }

        /*
         * Calculate the number of pages, which fit into this large
         * page starting at address:
         */
        nextpage_addr = (address + psize) & pmask;
        numpages = (nextpage_addr - address) >> PAGE_SHIFT;
        if (numpages < cpa->numpages)
                cpa->numpages = numpages;

        /*
         * We are safe now. Check whether the new pgprot is the same:
         */
        old_pte = *kpte;
        old_prot = new_prot = pte_pgprot(old_pte);

        pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
        pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);

        /*
         * old_pte points to the large page base address. So we need
         * to add the offset of the virtual address:
         */
        pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
        cpa->pfn = pfn;

        new_prot = static_protections(new_prot, address, pfn);

        /*
         * We need to check the full range, whether
         * static_protections() requires a different pgprot for one of
         * the pages in the range we try to preserve:
         */
        addr = address + PAGE_SIZE;
        pfn++;
        for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
                pgprot_t chk_prot = static_protections(new_prot, addr, pfn);

                if (pgprot_val(chk_prot) != pgprot_val(new_prot))
                        goto out_unlock;
        }

        /*
         * If there are no changes, return. cpa->numpages has been
         * updated above:
         */
        if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
                do_split = 0;
                goto out_unlock;
        }

        /*
         * We need to change the attributes. Check, whether we can
         * change the large page in one go. We request a split, when
         * the address is not aligned or the number of pages is
         * smaller than the number of pages in the large page. Note
         * that we limited the number of possible pages already to
         * the number of pages in the large page.
         */
        if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
                /*
                 * The address is aligned and the number of pages
                 * covers the full page.
                 */
                new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
                __set_pmd_pte(kpte, address, new_pte);
                cpa->flushtlb = 1;
                do_split = 0;
        }

out_unlock:
        spin_unlock_irqrestore(&pgd_lock, flags);

        return do_split;
}

static LIST_HEAD(page_pool);
static unsigned long pool_size, pool_pages, pool_low;
static unsigned long pool_used, pool_failed;

static void cpa_fill_pool(struct page **ret)
{
        gfp_t gfp = GFP_KERNEL;
        unsigned long flags;
        struct page *p;

        /*
         * Avoid recursion (on debug-pagealloc) and also signal
         * our priority to get to these pagetables:
         */
        if (current->flags & PF_MEMALLOC)
                return;
        current->flags |= PF_MEMALLOC;

        /*
         * Allocate atomically from atomic contexts:
         */
        if (in_atomic() || irqs_disabled() || debug_pagealloc)
                gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;

        while (pool_pages < pool_size || (ret && !*ret)) {
                p = alloc_pages(gfp, 0);
                if (!p) {
                        pool_failed++;
                        break;
                }
                /*
                 * If the call site needs a page right now, provide it:
                 */
                if (ret && !*ret) {
                        *ret = p;
                        continue;
                }
                spin_lock_irqsave(&pgd_lock, flags);
                list_add(&p->lru, &page_pool);
                pool_pages++;
                spin_unlock_irqrestore(&pgd_lock, flags);
        }

        current->flags &= ~PF_MEMALLOC;
}

#define SHIFT_MB                (20 - PAGE_SHIFT)
#define ROUND_MB_GB             ((1 << 10) - 1)
#define SHIFT_MB_GB             10
#define POOL_PAGES_PER_GB       16

void __init cpa_init(void)
{
        struct sysinfo si;
        unsigned long gb;

        si_meminfo(&si);
        /*
         * Calculate the number of pool pages:
         *
         * Convert totalram (nr of pages) to MiB and round to the next
         * GiB. Shift MiB to GiB and multiply the result by
         * POOL_PAGES_PER_GB:
         */
        if (debug_pagealloc) {
                gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
                pool_size = POOL_PAGES_PER_GB * gb;
        } else {
                pool_size = 1;
        }
        pool_low = pool_size;

        cpa_fill_pool(NULL);
        printk(KERN_DEBUG
               "CPA: page pool initialized %lu of %lu pages preallocated\n",
               pool_pages, pool_size);
}

static int split_large_page(pte_t *kpte, unsigned long address)
{
        unsigned long flags, pfn, pfninc = 1;
        unsigned int i, level;
        pte_t *pbase, *tmp;
        pgprot_t ref_prot;
        struct page *base;

        /*
         * Get a page from the pool. The pool list is protected by the
         * pgd_lock, which we have to take anyway for the split
         * operation:
         */
        spin_lock_irqsave(&pgd_lock, flags);
        if (list_empty(&page_pool)) {
                spin_unlock_irqrestore(&pgd_lock, flags);
                base = NULL;
                cpa_fill_pool(&base);
                if (!base)
                        return -ENOMEM;
                spin_lock_irqsave(&pgd_lock, flags);
        } else {
                base = list_first_entry(&page_pool, struct page, lru);
                list_del(&base->lru);
                pool_pages--;

                if (pool_pages < pool_low)
                        pool_low = pool_pages;
        }

        /*
         * Check for races, another CPU might have split this page
         * up for us already:
         */
        tmp = lookup_address(address, &level);
        if (tmp != kpte)
                goto out_unlock;

        pbase = (pte_t *)page_address(base);
        paravirt_alloc_pte(&init_mm, page_to_pfn(base));
        ref_prot = pte_pgprot(pte_clrhuge(*kpte));

#ifdef CONFIG_X86_64
        if (level == PG_LEVEL_1G) {
                pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
                pgprot_val(ref_prot) |= _PAGE_PSE;
        }
#endif

        /*
         * Get the target pfn from the original entry:
         */
        pfn = pte_pfn(*kpte);
        for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
                set_pte(&pbase[i], pfn_pte(pfn, ref_prot));

        if (address >= (unsigned long)__va(0) &&
                address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
                split_page_count(level);

#ifdef CONFIG_X86_64
        if (address >= (unsigned long)__va(1UL<<32) &&
                address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
                split_page_count(level);
#endif

        /*
         * Install the new, split up pagetable. Important details here:
         *
         * On Intel the NX bit of all levels must be cleared to make a
         * page executable (see section 4.13.2 of the Intel 64 and IA-32
         * Architectures Software Developer's Manual).
         *
         * Mark the entry present. The current mapping might be
         * set to not present, which we preserved above.
         */
        ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
        pgprot_val(ref_prot) |= _PAGE_PRESENT;
        __set_pmd_pte(kpte, address, mk_pte(base, ref_prot));
        base = NULL;

out_unlock:
        /*
         * If we dropped out via the lookup_address check under
         * pgd_lock then stick the page back into the pool:
         */
        if (base) {
                list_add(&base->lru, &page_pool);
                pool_pages++;
        } else
                pool_used++;
        spin_unlock_irqrestore(&pgd_lock, flags);

        return 0;
}
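
/*
 * Illustration (not part of the original file): splitting a large mapping
 * at virtual address V with base pfn P replaces the single entry with a
 * page table of PTRS_PER_PTE smaller entries covering the same physical
 * range:
 *
 *      before: PMD/PUD[V] -> large page at pfn P, prot | _PAGE_PSE
 *      after:  PMD/PUD[V] -> page table "base", where for i = 0 .. PTRS_PER_PTE - 1
 *              pbase[i]   -> page at pfn P + i * pfninc, prot ref_prot
 *
 * For a 1G split each new entry is itself a 2M page, hence pfninc and the
 * _PAGE_PSE added back to ref_prot above.
 */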

static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
                               int primary)
{
        /*
         * Ignore all non-primary paths.
         */
        if (!primary)
                return 0;

        /*
         * Ignore the NULL PTE for kernel identity mapping, as it is expected
         * to have holes.
         * Also set numpages to '1' indicating that we processed cpa req for
         * one virtual address page and its pfn. TBD: numpages can be set based
         * on the initial value and the level returned by lookup_address().
         */
        if (within(vaddr, PAGE_OFFSET,
                   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
                cpa->numpages = 1;
                cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
                return 0;
        } else {
                WARN(1, KERN_WARNING "CPA: called for zero pte. "
                        "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
                        cpa->vaddr);

                return -EINVAL;
        }
}

static int __change_page_attr(struct cpa_data *cpa, int primary)
{
        unsigned long address = cpa->vaddr;
        int do_split, err;
        unsigned int level;
        pte_t *kpte, old_pte;

        /*
         * If we're called with lazy mmu updates enabled, the
         * in-memory pte state may be stale.  Flush pending updates to
         * bring them up to date.
         */
        arch_flush_lazy_mmu_mode();

repeat:
        kpte = lookup_address(address, &level);
        if (!kpte)
                return __cpa_process_fault(cpa, address, primary);

        old_pte = *kpte;
        if (!pte_val(old_pte))
                return __cpa_process_fault(cpa, address, primary);

        if (level == PG_LEVEL_4K) {
                pte_t new_pte;
                pgprot_t new_prot = pte_pgprot(old_pte);
                unsigned long pfn = pte_pfn(old_pte);

                pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
                pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);

                new_prot = static_protections(new_prot, address, pfn);

                /*
                 * We need to keep the pfn from the existing PTE,
                 * after all we're only going to change its attributes,
                 * not the memory it points to.
                 */
                new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
                cpa->pfn = pfn;
                /*
                 * Do we really change anything?
                 */
                if (pte_val(old_pte) != pte_val(new_pte)) {
                        set_pte_atomic(kpte, new_pte);
                        cpa->flushtlb = 1;
                }
                cpa->numpages = 1;
                return 0;
        }

        /*
         * Check whether we can keep the large page intact
         * and just change the pte:
         */
        do_split = try_preserve_large_page(kpte, address, cpa);
        /*
         * When the range fits into the existing large page,
         * return. cpa->numpages and cpa->flushtlb have been updated in
         * try_preserve_large_page():
         */
        if (do_split <= 0)
                return do_split;

        /*
         * We have to split the large page:
         */
        err = split_large_page(kpte, address);
        if (!err) {
                cpa->flushtlb = 1;
                goto repeat;
        }

        return err;
}

static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);

static int cpa_process_alias(struct cpa_data *cpa)
{
        struct cpa_data alias_cpa;
        int ret = 0;

        if (cpa->pfn >= max_pfn_mapped)
                return 0;

#ifdef CONFIG_X86_64
        if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
                return 0;
#endif
        /*
         * No need to redo, when the primary call touched the direct
         * mapping already:
         */
        if (!(within(cpa->vaddr, PAGE_OFFSET,
                    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {

                alias_cpa = *cpa;
                alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);

                ret = __change_page_attr_set_clr(&alias_cpa, 0);
        }

#ifdef CONFIG_X86_64
        if (ret)
                return ret;
        /*
         * No need to redo, when the primary call touched the high
         * mapping already:
         */
        if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
                return 0;

        /*
         * If the physical address is inside the kernel map, we need
         * to touch the high mapped kernel as well:
         */
        if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
                return 0;

        alias_cpa = *cpa;
        alias_cpa.vaddr =
                (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;

        /*
         * The high mapping range is imprecise, so ignore the return value.
         */
        __change_page_attr_set_clr(&alias_cpa, 0);
#endif
        return ret;
}

static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
{
        int ret, numpages = cpa->numpages;

        while (numpages) {
                /*
                 * Store the remaining nr of pages for the large page
                 * preservation check.
                 */
                cpa->numpages = numpages;

                ret = __change_page_attr(cpa, checkalias);
                if (ret)
                        return ret;

                if (checkalias) {
                        ret = cpa_process_alias(cpa);
                        if (ret)
                                return ret;
                }

                /*
                 * Adjust the number of pages with the result of the
                 * CPA operation. Either a large page has been
                 * preserved or a single page update happened.
                 */
                BUG_ON(cpa->numpages > numpages);
                numpages -= cpa->numpages;
                cpa->vaddr += cpa->numpages * PAGE_SIZE;
        }
        return 0;
}
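
/*
 * Worked example (illustrative only, assuming 4k pages and 2M large pages,
 * i.e. 512 PTEs per PMD): changing 1024 pages starting at a 2M aligned
 * address in the direct mapping. The first iteration hits a 2M entry,
 * try_preserve_large_page() rewrites it in one go and sets
 * cpa->numpages = 512, so the loop advances by 2M. The second iteration
 * does the same for the next 512 pages and the loop finishes after two
 * passes instead of 1024.
 */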

static inline int cache_attr(pgprot_t attr)
{
        return pgprot_val(attr) &
                (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
}

static int change_page_attr_set_clr(unsigned long addr, int numpages,
                                    pgprot_t mask_set, pgprot_t mask_clr,
                                    int force_split)
{
        struct cpa_data cpa;
        int ret, cache, checkalias;

        /*
         * Check whether we are requested to change a feature that is
         * not supported:
         */
        mask_set = canon_pgprot(mask_set);
        mask_clr = canon_pgprot(mask_clr);
        if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
                return 0;

        /* Ensure we are PAGE_SIZE aligned */
        if (addr & ~PAGE_MASK) {
                addr &= PAGE_MASK;
                /*
                 * People should not be passing in unaligned addresses:
                 */
                WARN_ON_ONCE(1);
        }

        cpa.vaddr = addr;
        cpa.numpages = numpages;
        cpa.mask_set = mask_set;
        cpa.mask_clr = mask_clr;
        cpa.flushtlb = 0;
        cpa.force_split = force_split;

        /* No alias checking for _NX bit modifications */
        checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;

        ret = __change_page_attr_set_clr(&cpa, checkalias);

        /*
         * Check whether we really changed something:
         */
        if (!cpa.flushtlb)
                goto out;

        /*
         * No need to flush, when we did not set any of the caching
         * attributes:
         */
        cache = cache_attr(mask_set);

        /*
         * On success we use clflush, when the CPU supports it, to
         * avoid the wbinvd. If the CPU does not support it, and in
         * the error case, we fall back to cpa_flush_all (which uses
         * wbinvd):
         */
        if (!ret && cpu_has_clflush)
                cpa_flush_range(addr, numpages, cache);
        else
                cpa_flush_all(cache);

        /*
         * If we've been called with lazy mmu updates enabled, then
         * make sure that everything gets flushed out before we
         * return.
         */
        arch_flush_lazy_mmu_mode();

out:
        cpa_fill_pool(NULL);

        return ret;
}

static inline int change_page_attr_set(unsigned long addr, int numpages,
                                       pgprot_t mask)
{
        return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0);
}

static inline int change_page_attr_clear(unsigned long addr, int numpages,
                                         pgprot_t mask)
{
        return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
}

int _set_memory_uc(unsigned long addr, int numpages)
{
        /*
         * for now UC MINUS. see comments in ioremap_nocache()
         */
        return change_page_attr_set(addr, numpages,
                                    __pgprot(_PAGE_CACHE_UC_MINUS));
}

int set_memory_uc(unsigned long addr, int numpages)
{
        /*
         * for now UC MINUS. see comments in ioremap_nocache()
         */
        if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
                            _PAGE_CACHE_UC_MINUS, NULL))
                return -EINVAL;

        return _set_memory_uc(addr, numpages);
}
EXPORT_SYMBOL(set_memory_uc);
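
/*
 * Usage sketch for drivers (illustrative only): a buffer in the direct
 * mapping can be switched to uncached and restored later. set_memory_uc()
 * reserves the memtype itself, set_memory_wb() frees it again:
 *
 *      unsigned long addr = (unsigned long)page_address(pg);
 *
 *      if (set_memory_uc(addr, nrpages))
 *              return -EINVAL;
 *      // ... access the buffer uncached ...
 *      set_memory_wb(addr, nrpages);
 *
 * "pg" and "nrpages" are hypothetical caller variables.
 */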

int _set_memory_wc(unsigned long addr, int numpages)
{
        return change_page_attr_set(addr, numpages,
                                    __pgprot(_PAGE_CACHE_WC));
}

int set_memory_wc(unsigned long addr, int numpages)
{
        if (!pat_enabled)
                return set_memory_uc(addr, numpages);

        if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
                _PAGE_CACHE_WC, NULL))
                return -EINVAL;

        return _set_memory_wc(addr, numpages);
}
EXPORT_SYMBOL(set_memory_wc);

int _set_memory_wb(unsigned long addr, int numpages)
{
        return change_page_attr_clear(addr, numpages,
                                      __pgprot(_PAGE_CACHE_MASK));
}

int set_memory_wb(unsigned long addr, int numpages)
{
        free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);

        return _set_memory_wb(addr, numpages);
}
EXPORT_SYMBOL(set_memory_wb);

int set_memory_x(unsigned long addr, int numpages)
{
        return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
}
EXPORT_SYMBOL(set_memory_x);

int set_memory_nx(unsigned long addr, int numpages)
{
        return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
}
EXPORT_SYMBOL(set_memory_nx);

int set_memory_ro(unsigned long addr, int numpages)
{
        return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
}

int set_memory_rw(unsigned long addr, int numpages)
{
        return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
}
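
/*
 * Usage sketch (illustrative only): write protecting a kernel data page and
 * lifting the protection briefly to patch it. "table" is a hypothetical
 * page aligned object:
 *
 *      set_memory_ro((unsigned long)table, 1);         // clear _PAGE_RW
 *      ...
 *      set_memory_rw((unsigned long)table, 1);         // set _PAGE_RW again
 *      table->entry = new_value;
 *      set_memory_ro((unsigned long)table, 1);
 *
 * Note that static_protections() keeps .rodata read-only even if a caller
 * asks for RW on one of its aliases.
 */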

int set_memory_np(unsigned long addr, int numpages)
{
        return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
}

int set_memory_4k(unsigned long addr, int numpages)
{
        return change_page_attr_set_clr(addr, numpages, __pgprot(0),
                                        __pgprot(0), 1);
}

int set_pages_uc(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_uc(addr, numpages);
}
EXPORT_SYMBOL(set_pages_uc);

int set_pages_wb(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_wb(addr, numpages);
}
EXPORT_SYMBOL(set_pages_wb);

int set_pages_x(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_x(addr, numpages);
}
EXPORT_SYMBOL(set_pages_x);

int set_pages_nx(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_nx(addr, numpages);
}
EXPORT_SYMBOL(set_pages_nx);

int set_pages_ro(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_ro(addr, numpages);
}

int set_pages_rw(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_rw(addr, numpages);
}

#ifdef CONFIG_DEBUG_PAGEALLOC

static int __set_pages_p(struct page *page, int numpages)
{
        struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
                                .numpages = numpages,
                                .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
                                .mask_clr = __pgprot(0)};

        return __change_page_attr_set_clr(&cpa, 1);
}

static int __set_pages_np(struct page *page, int numpages)
{
        struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
                                .numpages = numpages,
                                .mask_set = __pgprot(0),
                                .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};

        return __change_page_attr_set_clr(&cpa, 1);
}

void kernel_map_pages(struct page *page, int numpages, int enable)
{
        if (PageHighMem(page))
                return;
        if (!enable) {
                debug_check_no_locks_freed(page_address(page),
                                           numpages * PAGE_SIZE);
        }

        /*
         * If the page allocator is not up yet then do not call c_p_a():
         */
        if (!debug_pagealloc_enabled)
                return;

        /*
         * The return value is ignored as the calls cannot fail.
         * Large pages are kept enabled at boot time, and are
         * split up quickly with DEBUG_PAGEALLOC. If a splitup
         * fails here (due to temporary memory shortage) no damage
         * is done because we just keep the largepage intact up
         * to the next attempt when it will likely be split up:
         */
        if (enable)
                __set_pages_p(page, numpages);
        else
                __set_pages_np(page, numpages);

        /*
         * We should perform an IPI and flush all tlbs,
         * but that can deadlock, so we flush only the current cpu:
         */
        __flush_tlb_all();

        /*
         * Try to refill the page pool here. We can do this only after
         * the tlb flush.
         */
        cpa_fill_pool(NULL);
}

#ifdef CONFIG_DEBUG_FS
static int dpa_show(struct seq_file *m, void *v)
{
        seq_puts(m, "DEBUG_PAGEALLOC\n");
        seq_printf(m, "pool_size     : %lu\n", pool_size);
        seq_printf(m, "pool_pages    : %lu\n", pool_pages);
        seq_printf(m, "pool_low      : %lu\n", pool_low);
        seq_printf(m, "pool_used     : %lu\n", pool_used);
        seq_printf(m, "pool_failed   : %lu\n", pool_failed);

        return 0;
}

static int dpa_open(struct inode *inode, struct file *filp)
{
        return single_open(filp, dpa_show, NULL);
}

static const struct file_operations dpa_fops = {
        .open           = dpa_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};

static int __init debug_pagealloc_proc_init(void)
{
        struct dentry *de;

        de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
                                 &dpa_fops);
        if (!de)
                return -ENOMEM;

        return 0;
}
__initcall(debug_pagealloc_proc_init);
#endif

#ifdef CONFIG_HIBERNATION

bool kernel_page_present(struct page *page)
{
        unsigned int level;
        pte_t *pte;

        if (PageHighMem(page))
                return false;

        pte = lookup_address((unsigned long)page_address(page), &level);
        return (pte_val(*pte) & _PAGE_PRESENT);
}

#endif /* CONFIG_HIBERNATION */

#endif /* CONFIG_DEBUG_PAGEALLOC */

/*
 * The testcases use internal knowledge of the implementation that shouldn't
 * be exposed to the rest of the kernel. Include these directly here.
 */
#ifdef CONFIG_CPA_DEBUG
#include "pageattr-test.c"
#endif