arch/x86/mm/pageattr.c
1 /*
2  * Copyright 2002 Andi Kleen, SuSE Labs.
3  * Thanks to Ben LaHaise for precious feedback.
4  */
5 #include <linux/highmem.h>
6 #include <linux/bootmem.h>
7 #include <linux/module.h>
8 #include <linux/sched.h>
9 #include <linux/slab.h>
10 #include <linux/mm.h>
11 #include <linux/interrupt.h>
12 #include <linux/seq_file.h>
13 #include <linux/debugfs.h>
14
15 #include <asm/e820.h>
16 #include <asm/processor.h>
17 #include <asm/tlbflush.h>
18 #include <asm/sections.h>
19 #include <asm/uaccess.h>
20 #include <asm/pgalloc.h>
21 #include <asm/proto.h>
22 #include <asm/pat.h>
23
24 /*
25  * The current flushing context - we pass it instead of 5 arguments:
26  */
27 struct cpa_data {
28         unsigned long   vaddr;
29         pgprot_t        mask_set;
30         pgprot_t        mask_clr;
31         int             numpages;
32         int             flushtlb;
33         unsigned long   pfn;
34         unsigned        force_split : 1;
35 };
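/*
 * A minimal sketch of how this struct is typically filled in (values are
 * illustrative; see change_page_attr_set_clr() below for the real code).
 * For a 16-page _PAGE_NX set operation it would roughly be:
 *
 *	struct cpa_data cpa = {
 *		.vaddr       = addr & PAGE_MASK,
 *		.numpages    = 16,
 *		.mask_set    = __pgprot(_PAGE_NX),
 *		.mask_clr    = __pgprot(0),
 *		.flushtlb    = 0,
 *		.force_split = 0,
 *	};
 *
 * The pfn field is filled in later, once __change_page_attr() has looked
 * up the backing page table entry.
 */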
36
37 #ifdef CONFIG_PROC_FS
38 static unsigned long direct_pages_count[PG_LEVEL_NUM];
39
40 void update_page_count(int level, unsigned long pages)
41 {
42         unsigned long flags;
43
44         /* Protect against CPA */
45         spin_lock_irqsave(&pgd_lock, flags);
46         direct_pages_count[level] += pages;
47         spin_unlock_irqrestore(&pgd_lock, flags);
48 }
49
50 static void split_page_count(int level)
51 {
52         direct_pages_count[level]--;
53         direct_pages_count[level - 1] += PTRS_PER_PTE;
54 }
55
56 int arch_report_meminfo(char *page)
57 {
58         int n = sprintf(page, "DirectMap4k:  %8lu kB\n",
59                         direct_pages_count[PG_LEVEL_4K] << 2);
60 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
61         n += sprintf(page + n, "DirectMap2M:  %8lu kB\n",
62                         direct_pages_count[PG_LEVEL_2M] << 11);
63 #else
64         n += sprintf(page + n, "DirectMap4M:  %8lu kB\n",
65                         direct_pages_count[PG_LEVEL_2M] << 12);
66 #endif
67 #ifdef CONFIG_X86_64
68         if (direct_gbpages)
69                 n += sprintf(page + n, "DirectMap1G:  %8lu kB\n",
70                         direct_pages_count[PG_LEVEL_1G] << 20);
71 #endif
72         return n;
73 }
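/*
 * The shifts above convert page counts to kB (illustration): a 4k page is
 * 2^2 kB, a 2M page 2^11 kB, a 4M page 2^12 kB and a 1G page 2^20 kB, so
 * e.g. 512 mappings at PG_LEVEL_2M are reported as 512 << 11 == 1048576 kB.
 */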
74 #else
75 static inline void split_page_count(int level) { }
76 #endif
77
78 #ifdef CONFIG_X86_64
79
80 static inline unsigned long highmap_start_pfn(void)
81 {
82         return __pa(_text) >> PAGE_SHIFT;
83 }
84
85 static inline unsigned long highmap_end_pfn(void)
86 {
87         return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
88 }
89
90 #endif
91
92 #ifdef CONFIG_DEBUG_PAGEALLOC
93 # define debug_pagealloc 1
94 #else
95 # define debug_pagealloc 0
96 #endif
97
98 static inline int
99 within(unsigned long addr, unsigned long start, unsigned long end)
100 {
101         return addr >= start && addr < end;
102 }
103
104 /*
105  * Flushing functions
106  */
107
108 /**
109  * clflush_cache_range - flush a cache range with clflush
110  * @vaddr:      virtual start address
111  * @size:       number of bytes to flush
112  *
113  * clflush is an unordered instruction which needs fencing with mfence
114  * to avoid ordering issues.
115  */
116 void clflush_cache_range(void *vaddr, unsigned int size)
117 {
118         void *vend = vaddr + size - 1;
119
120         mb();
121
122         for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
123                 clflush(vaddr);
124         /*
125          * Flush any possible final partial cacheline:
126          */
127         clflush(vend);
128
129         mb();
130 }
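/*
 * A minimal usage sketch (desc_ring and ring_bytes are hypothetical, not
 * from this file): after the CPU has written a buffer that a non-snooping
 * device will read, the dirty lines can be pushed out explicitly:
 *
 *	memset(desc_ring, 0, ring_bytes);
 *	clflush_cache_range(desc_ring, ring_bytes);
 */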
131
132 static void __cpa_flush_all(void *arg)
133 {
134         unsigned long cache = (unsigned long)arg;
135
136         /*
137          * Flush all to work around an erratum in early Athlons regarding
138          * large page flushing.
139          */
140         __flush_tlb_all();
141
142         if (cache && boot_cpu_data.x86_model >= 4)
143                 wbinvd();
144 }
145
146 static void cpa_flush_all(unsigned long cache)
147 {
148         BUG_ON(irqs_disabled());
149
150         on_each_cpu(__cpa_flush_all, (void *) cache, 1);
151 }
152
153 static void __cpa_flush_range(void *arg)
154 {
155         /*
156          * We could optimize this further and do individual per-page
157          * TLB invalidates for a low number of pages. Caveat: we must
158          * flush the high aliases on 64bit as well.
159          */
160         __flush_tlb_all();
161 }
162
163 static void cpa_flush_range(unsigned long start, int numpages, int cache)
164 {
165         unsigned int i, level;
166         unsigned long addr;
167
168         BUG_ON(irqs_disabled());
169         WARN_ON(PAGE_ALIGN(start) != start);
170
171         on_each_cpu(__cpa_flush_range, NULL, 1);
172
173         if (!cache)
174                 return;
175
176         /*
177          * We only need to flush on one CPU;
178          * clflush is a MESI-coherent instruction that
179          * will cause all other CPUs to flush the same
180          * cachelines:
181          */
182         for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
183                 pte_t *pte = lookup_address(addr, &level);
184
185                 /*
186                  * Only flush present addresses:
187                  */
188                 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
189                         clflush_cache_range((void *) addr, PAGE_SIZE);
190         }
191 }
192
193 /*
194  * Certain areas of memory on x86 require very specific protection flags,
195  * for example the BIOS area or kernel text. Callers don't always get this
196  * right (again, ioremap() on BIOS memory is not uncommon) so this function
197  * checks and fixes these known static required protection bits.
198  */
199 static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
200                                    unsigned long pfn)
201 {
202         pgprot_t forbidden = __pgprot(0);
203
204         /*
205          * The BIOS area between 640k and 1MB needs to be executable for
206          * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
207          */
208         if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
209                 pgprot_val(forbidden) |= _PAGE_NX;
210
211         /*
212          * The kernel text needs to be executable for obvious reasons.
213          * Does not cover __inittext since that is gone later on. On
214          * 64bit we do not enforce !NX on the low mapping.
215          */
216         if (within(address, (unsigned long)_text, (unsigned long)_etext))
217                 pgprot_val(forbidden) |= _PAGE_NX;
218
219         /*
220          * The .rodata section needs to be read-only. Using the pfn
221          * catches all aliases.
222          */
223         if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
224                    __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
225                 pgprot_val(forbidden) |= _PAGE_RW;
226
227         prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
228
229         return prot;
230 }
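/*
 * Illustration with hypothetical addr/pfn: requesting a writable mapping
 * of a page whose pfn lies in .rodata gets the RW bit filtered out again:
 *
 *	pgprot_t eff = static_protections(PAGE_KERNEL, addr, pfn);
 *
 * For a pfn inside [__start_rodata, __end_rodata) the returned pgprot has
 * _PAGE_RW cleared; for an address inside [_text, _etext) it has _PAGE_NX
 * cleared, keeping the kernel text executable.
 */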
231
232 /*
233  * Lookup the page table entry for a virtual address. Return a pointer
234  * to the entry and the level of the mapping.
235  *
236  * Note: We return pud and pmd either when the entry is marked large
237  * or when the present bit is not set. Otherwise we would return a
238  * pointer to a nonexistent mapping.
239  */
240 pte_t *lookup_address(unsigned long address, unsigned int *level)
241 {
242         pgd_t *pgd = pgd_offset_k(address);
243         pud_t *pud;
244         pmd_t *pmd;
245
246         *level = PG_LEVEL_NONE;
247
248         if (pgd_none(*pgd))
249                 return NULL;
250
251         pud = pud_offset(pgd, address);
252         if (pud_none(*pud))
253                 return NULL;
254
255         *level = PG_LEVEL_1G;
256         if (pud_large(*pud) || !pud_present(*pud))
257                 return (pte_t *)pud;
258
259         pmd = pmd_offset(pud, address);
260         if (pmd_none(*pmd))
261                 return NULL;
262
263         *level = PG_LEVEL_2M;
264         if (pmd_large(*pmd) || !pmd_present(*pmd))
265                 return (pte_t *)pmd;
266
267         *level = PG_LEVEL_4K;
268
269         return pte_offset_kernel(pmd, address);
270 }
271 EXPORT_SYMBOL_GPL(lookup_address);
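/*
 * A rough sketch of how callers typically consume lookup_address()
 * (illustrative only):
 *
 *	unsigned int level;
 *	pte_t *pte = lookup_address(addr, &level);
 *
 *	if (pte && (pte_val(*pte) & _PAGE_PRESENT) && level == PG_LEVEL_4K) {
 *		... addr is backed by a present 4k mapping ...
 *	}
 *
 * Both the returned pointer and, where it matters, the present bit must be
 * checked, since a non-NULL pte may still be not-present (see the comment
 * above the function).
 */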
272
273 /*
274  * Set the new pmd in all the pgds we know about:
275  */
276 static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
277 {
278         /* change init_mm */
279         set_pte_atomic(kpte, pte);
280 #ifdef CONFIG_X86_32
281         if (!SHARED_KERNEL_PMD) {
282                 struct page *page;
283
284                 list_for_each_entry(page, &pgd_list, lru) {
285                         pgd_t *pgd;
286                         pud_t *pud;
287                         pmd_t *pmd;
288
289                         pgd = (pgd_t *)page_address(page) + pgd_index(address);
290                         pud = pud_offset(pgd, address);
291                         pmd = pmd_offset(pud, address);
292                         set_pte_atomic((pte_t *)pmd, pte);
293                 }
294         }
295 #endif
296 }
297
298 static int
299 try_preserve_large_page(pte_t *kpte, unsigned long address,
300                         struct cpa_data *cpa)
301 {
302         unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
303         pte_t new_pte, old_pte, *tmp;
304         pgprot_t old_prot, new_prot;
305         int i, do_split = 1;
306         unsigned int level;
307
308         if (cpa->force_split)
309                 return 1;
310
311         spin_lock_irqsave(&pgd_lock, flags);
312         /*
313          * Check for races; another CPU might have split this page
314          * up already:
315          */
316         tmp = lookup_address(address, &level);
317         if (tmp != kpte)
318                 goto out_unlock;
319
320         switch (level) {
321         case PG_LEVEL_2M:
322                 psize = PMD_PAGE_SIZE;
323                 pmask = PMD_PAGE_MASK;
324                 break;
325 #ifdef CONFIG_X86_64
326         case PG_LEVEL_1G:
327                 psize = PUD_PAGE_SIZE;
328                 pmask = PUD_PAGE_MASK;
329                 break;
330 #endif
331         default:
332                 do_split = -EINVAL;
333                 goto out_unlock;
334         }
335
336         /*
337          * Calculate the number of pages that fit into this large
338          * page starting at address:
339          */
340         nextpage_addr = (address + psize) & pmask;
341         numpages = (nextpage_addr - address) >> PAGE_SHIFT;
342         if (numpages < cpa->numpages)
343                 cpa->numpages = numpages;
344
345         /*
346          * We are safe now. Check whether the new pgprot is the same:
347          */
348         old_pte = *kpte;
349         old_prot = new_prot = pte_pgprot(old_pte);
350
351         pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
352         pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
353
354         /*
355          * old_pte points to the large page base address. So we need
356          * to add the offset of the virtual address:
357          */
358         pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
359         cpa->pfn = pfn;
360
361         new_prot = static_protections(new_prot, address, pfn);
362
363         /*
364          * We need to check the full range, whether
365          * static_protections() requires a different pgprot for one of
366          * the pages in the range we try to preserve:
367          */
368         addr = address + PAGE_SIZE;
369         pfn++;
370         for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
371                 pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
372
373                 if (pgprot_val(chk_prot) != pgprot_val(new_prot))
374                         goto out_unlock;
375         }
376
377         /*
378          * If there are no changes, return. cpa->numpages has been updated
379          * above:
380          */
381         if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
382                 do_split = 0;
383                 goto out_unlock;
384         }
385
386         /*
386          * We need to change the attributes. Check whether we can
387          * change the large page in one go. We request a split when
388          * the address is not aligned or the number of pages is
390          * smaller than the number of pages in the large page. Note
391          * that we limited the number of possible pages already to
392          * the number of pages in the large page.
393          */
394         if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
395                 /*
396                  * The address is aligned and the number of pages
397                  * covers the full page.
398                  */
399                 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
400                 __set_pmd_pte(kpte, address, new_pte);
401                 cpa->flushtlb = 1;
402                 do_split = 0;
403         }
404
405 out_unlock:
406         spin_unlock_irqrestore(&pgd_lock, flags);
407
408         return do_split;
409 }
410
411 static LIST_HEAD(page_pool);
412 static unsigned long pool_size, pool_pages, pool_low;
413 static unsigned long pool_used, pool_failed;
414
415 static void cpa_fill_pool(struct page **ret)
416 {
417         gfp_t gfp = GFP_KERNEL;
418         unsigned long flags;
419         struct page *p;
420
421         /*
422          * Avoid recursion (on debug-pagealloc) and also signal
423          * our priority to get to these pagetables:
424          */
425         if (current->flags & PF_MEMALLOC)
426                 return;
427         current->flags |= PF_MEMALLOC;
428
429         /*
430          * Allocate atomically from atomic contexts:
431          */
432         if (in_atomic() || irqs_disabled() || debug_pagealloc)
433                 gfp =  GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
434
435         while (pool_pages < pool_size || (ret && !*ret)) {
436                 p = alloc_pages(gfp, 0);
437                 if (!p) {
438                         pool_failed++;
439                         break;
440                 }
441                 /*
442                  * If the call site needs a page right now, provide it:
443                  */
444                 if (ret && !*ret) {
445                         *ret = p;
446                         continue;
447                 }
448                 spin_lock_irqsave(&pgd_lock, flags);
449                 list_add(&p->lru, &page_pool);
450                 pool_pages++;
451                 spin_unlock_irqrestore(&pgd_lock, flags);
452         }
453
454         current->flags &= ~PF_MEMALLOC;
455 }
456
457 #define SHIFT_MB                (20 - PAGE_SHIFT)
458 #define ROUND_MB_GB             ((1 << 10) - 1)
459 #define SHIFT_MB_GB             10
460 #define POOL_PAGES_PER_GB       16
461
462 void __init cpa_init(void)
463 {
464         struct sysinfo si;
465         unsigned long gb;
466
467         si_meminfo(&si);
468         /*
469          * Calculate the number of pool pages:
470          *
471          * Convert totalram (nr of pages) to MiB and round to the next
472          * GiB. Shift MiB to GiB and multiply the result by
473          * POOL_PAGES_PER_GB:
474          */
475         if (debug_pagealloc) {
476                 gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
477                 pool_size = POOL_PAGES_PER_GB * gb;
478         } else {
479                 pool_size = 1;
480         }
481         pool_low = pool_size;
482
483         cpa_fill_pool(NULL);
484         printk(KERN_DEBUG
485                "CPA: page pool initialized %lu of %lu pages preallocated\n",
486                pool_pages, pool_size);
487 }
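/*
 * Worked example of the pool sizing above, assuming 4k pages (SHIFT_MB ==
 * 8) and debug_pagealloc: with si.totalram == 524288 pages (2 GiB),
 * 524288 >> 8 == 2048 MiB, (2048 + 1023) >> 10 == 2 GiB (rounded up), so
 * pool_size == 16 * 2 == 32 pages. Without debug_pagealloc the pool is
 * kept at a single page.
 */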
488
489 static int split_large_page(pte_t *kpte, unsigned long address)
490 {
491         unsigned long flags, pfn, pfninc = 1;
492         unsigned int i, level;
493         pte_t *pbase, *tmp;
494         pgprot_t ref_prot;
495         struct page *base;
496
497         /*
498          * Get a page from the pool. The pool list is protected by the
499          * pgd_lock, which we have to take anyway for the split
500          * operation:
501          */
502         spin_lock_irqsave(&pgd_lock, flags);
503         if (list_empty(&page_pool)) {
504                 spin_unlock_irqrestore(&pgd_lock, flags);
505                 base = NULL;
506                 cpa_fill_pool(&base);
507                 if (!base)
508                         return -ENOMEM;
509                 spin_lock_irqsave(&pgd_lock, flags);
510         } else {
511                 base = list_first_entry(&page_pool, struct page, lru);
512                 list_del(&base->lru);
513                 pool_pages--;
514
515                 if (pool_pages < pool_low)
516                         pool_low = pool_pages;
517         }
518
519         /*
520          * Check for races; another CPU might have split this page
521          * up for us already:
522          */
523         tmp = lookup_address(address, &level);
524         if (tmp != kpte)
525                 goto out_unlock;
526
527         pbase = (pte_t *)page_address(base);
528         paravirt_alloc_pte(&init_mm, page_to_pfn(base));
529         ref_prot = pte_pgprot(pte_clrhuge(*kpte));
530
531 #ifdef CONFIG_X86_64
532         if (level == PG_LEVEL_1G) {
533                 pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
534                 pgprot_val(ref_prot) |= _PAGE_PSE;
535         }
536 #endif
537
538         /*
539          * Get the target pfn from the original entry:
540          */
541         pfn = pte_pfn(*kpte);
542         for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
543                 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
544
545         if (address >= (unsigned long)__va(0) &&
546                 address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
547                 split_page_count(level);
548
549 #ifdef CONFIG_X86_64
550         if (address >= (unsigned long)__va(1UL<<32) &&
551                 address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
552                 split_page_count(level);
553 #endif
554
555         /*
556          * Install the new, split up pagetable. Important details here:
557          *
558          * On Intel the NX bit of all levels must be cleared to make a
559          * page executable (see section 4.13.2 of the Intel 64 and IA-32
560          * Architectures Software Developer's Manual).
561          *
562          * Mark the entry present. The current mapping might be
563          * set to not present, which we preserved above.
564          */
565         ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
566         pgprot_val(ref_prot) |= _PAGE_PRESENT;
567         __set_pmd_pte(kpte, address, mk_pte(base, ref_prot));
568         base = NULL;
569
570 out_unlock:
571         /*
572          * If we dropped out via the lookup_address check under
573          * pgd_lock then stick the page back into the pool:
574          */
575         if (base) {
576                 list_add(&base->lru, &page_pool);
577                 pool_pages++;
578         } else
579                 pool_used++;
580         spin_unlock_irqrestore(&pgd_lock, flags);
581
582         return 0;
583 }
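/*
 * Worked example of the split above (x86_64 numbers): splitting a 2M
 * mapping fills all 512 new PTEs with consecutive pfns (pfninc == 1);
 * splitting a 1G mapping fills 512 PMD entries that are themselves 2M
 * pages, so pfninc == PMD_PAGE_SIZE >> PAGE_SHIFT == 512 and _PAGE_PSE
 * stays set in ref_prot.
 */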
584
585 static int __change_page_attr(struct cpa_data *cpa, int primary)
586 {
587         unsigned long address = cpa->vaddr;
588         int do_split, err;
589         unsigned int level;
590         pte_t *kpte, old_pte;
591
592 repeat:
593         kpte = lookup_address(address, &level);
594         if (!kpte)
595                 return 0;
596
597         old_pte = *kpte;
598         if (!pte_val(old_pte)) {
599                 if (!primary)
600                         return 0;
601                 WARN(1, KERN_WARNING "CPA: called for zero pte. "
602                        "vaddr = %lx cpa->vaddr = %lx\n", address,
603                        cpa->vaddr);
604                 return -EINVAL;
605         }
606
607         if (level == PG_LEVEL_4K) {
608                 pte_t new_pte;
609                 pgprot_t new_prot = pte_pgprot(old_pte);
610                 unsigned long pfn = pte_pfn(old_pte);
611
612                 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
613                 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
614
615                 new_prot = static_protections(new_prot, address, pfn);
616
617                 /*
618                  * We need to keep the pfn from the existing PTE;
619                  * after all, we're only going to change its attributes,
620                  * not the memory it points to.
621                  */
622                 new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
623                 cpa->pfn = pfn;
624                 /*
625                  * Do we really change anything?
626                  */
627                 if (pte_val(old_pte) != pte_val(new_pte)) {
628                         set_pte_atomic(kpte, new_pte);
629                         cpa->flushtlb = 1;
630                 }
631                 cpa->numpages = 1;
632                 return 0;
633         }
634
635         /*
636          * Check whether we can keep the large page intact
637          * and just change the pte:
638          */
639         do_split = try_preserve_large_page(kpte, address, cpa);
640         /*
641          * When the range fits into the existing large page,
642          * return. cpa->numpages and cpa->flushtlb have been updated in
643          * try_preserve_large_page():
644          */
645         if (do_split <= 0)
646                 return do_split;
647
648         /*
649          * We have to split the large page:
650          */
651         err = split_large_page(kpte, address);
652         if (!err) {
653                 cpa->flushtlb = 1;
654                 goto repeat;
655         }
656
657         return err;
658 }
659
660 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
661
662 static int cpa_process_alias(struct cpa_data *cpa)
663 {
664         struct cpa_data alias_cpa;
665         int ret = 0;
666
667         if (cpa->pfn >= max_pfn_mapped)
668                 return 0;
669
670 #ifdef CONFIG_X86_64
671         if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
672                 return 0;
673 #endif
674         /*
675          * No need to redo, when the primary call touched the direct
676          * mapping already:
677          */
678         if (!(within(cpa->vaddr, PAGE_OFFSET,
679                     PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
680 #ifdef CONFIG_X86_64
681                 || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
682                     PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
683 #endif
684         )) {
685
686                 alias_cpa = *cpa;
687                 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
688
689                 ret = __change_page_attr_set_clr(&alias_cpa, 0);
690         }
691
692 #ifdef CONFIG_X86_64
693         if (ret)
694                 return ret;
695         /*
696          * No need to redo, when the primary call touched the high
697          * mapping already:
698          */
699         if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
700                 return 0;
701
702         /*
703          * If the physical address is inside the kernel map, we need
704          * to touch the high mapped kernel as well:
705          */
706         if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
707                 return 0;
708
709         alias_cpa = *cpa;
710         alias_cpa.vaddr =
711                 (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
712
713         /*
714          * The high mapping range is imprecise, so ignore the return value.
715          */
716         __change_page_attr_set_clr(&alias_cpa, 0);
717 #endif
718         return ret;
719 }
720
721 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
722 {
723         int ret, numpages = cpa->numpages;
724
725         while (numpages) {
726                 /*
727                  * Store the remaining nr of pages for the large page
728                  * preservation check.
729                  */
730                 cpa->numpages = numpages;
731
732                 ret = __change_page_attr(cpa, checkalias);
733                 if (ret)
734                         return ret;
735
736                 if (checkalias) {
737                         ret = cpa_process_alias(cpa);
738                         if (ret)
739                                 return ret;
740                 }
741
742                 /*
743                  * Adjust the number of pages with the result of the
744                  * CPA operation. Either a large page has been
745                  * preserved or a single page update happened.
746                  */
747                 BUG_ON(cpa->numpages > numpages);
748                 numpages -= cpa->numpages;
749                 cpa->vaddr += cpa->numpages * PAGE_SIZE;
750         }
751         return 0;
752 }
753
754 static inline int cache_attr(pgprot_t attr)
755 {
756         return pgprot_val(attr) &
757                 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
758 }
759
760 static int change_page_attr_set_clr(unsigned long addr, int numpages,
761                                     pgprot_t mask_set, pgprot_t mask_clr,
762                                     int force_split)
763 {
764         struct cpa_data cpa;
765         int ret, cache, checkalias;
766
767         /*
768          * Check whether we are being asked to change a feature that is
769          * not supported:
770          */
771         mask_set = canon_pgprot(mask_set);
772         mask_clr = canon_pgprot(mask_clr);
773         if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
774                 return 0;
775
776         /* Ensure we are PAGE_SIZE aligned */
777         if (addr & ~PAGE_MASK) {
778                 addr &= PAGE_MASK;
779                 /*
780                  * People should not be passing in unaligned addresses:
781                  */
782                 WARN_ON_ONCE(1);
783         }
784
785         cpa.vaddr = addr;
786         cpa.numpages = numpages;
787         cpa.mask_set = mask_set;
788         cpa.mask_clr = mask_clr;
789         cpa.flushtlb = 0;
790         cpa.force_split = force_split;
791
792         /* No alias checking for _NX bit modifications */
793         checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
794
795         ret = __change_page_attr_set_clr(&cpa, checkalias);
796
797         /*
798          * Check whether we really changed something:
799          */
800         if (!cpa.flushtlb)
801                 goto out;
802
803         /*
804          * No need to flush, when we did not set any of the caching
805          * attributes:
806          */
807         cache = cache_attr(mask_set);
808
809         /*
810          * On success we use clflush, when the CPU supports it, to
811          * avoid the wbinvd. If the CPU does not support clflush, or in
812          * the error case, we fall back to cpa_flush_all() (which uses
813          * wbinvd):
814          */
815         if (!ret && cpu_has_clflush)
816                 cpa_flush_range(addr, numpages, cache);
817         else
818                 cpa_flush_all(cache);
819
820 out:
821         cpa_fill_pool(NULL);
822
823         return ret;
824 }
825
826 static inline int change_page_attr_set(unsigned long addr, int numpages,
827                                        pgprot_t mask)
828 {
829         return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0);
830 }
831
832 static inline int change_page_attr_clear(unsigned long addr, int numpages,
833                                          pgprot_t mask)
834 {
835         return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
836 }
837
838 int _set_memory_uc(unsigned long addr, int numpages)
839 {
840         /*
841          * For now UC MINUS. See comments in ioremap_nocache().
842          */
843         return change_page_attr_set(addr, numpages,
844                                     __pgprot(_PAGE_CACHE_UC_MINUS));
845 }
846
847 int set_memory_uc(unsigned long addr, int numpages)
848 {
849         /*
850          * For now UC MINUS. See comments in ioremap_nocache().
851          */
852         if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
853                             _PAGE_CACHE_UC_MINUS, NULL))
854                 return -EINVAL;
855
856         return _set_memory_uc(addr, numpages);
857 }
858 EXPORT_SYMBOL(set_memory_uc);
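/*
 * A minimal usage sketch (buf and nr_pages are placeholders): a driver
 * that wants an uncached view of pages in the kernel direct mapping sets
 * the attribute, uses the buffer, and restores write-back before freeing:
 *
 *	if (set_memory_uc((unsigned long)buf, nr_pages))
 *		goto err;
 *	... uncached accesses to buf ...
 *	set_memory_wb((unsigned long)buf, nr_pages);
 *
 * set_memory_uc() registers the region via reserve_memtype() and
 * set_memory_wb() releases it again via free_memtype().
 */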
859
860 int _set_memory_wc(unsigned long addr, int numpages)
861 {
862         return change_page_attr_set(addr, numpages,
863                                     __pgprot(_PAGE_CACHE_WC));
864 }
865
866 int set_memory_wc(unsigned long addr, int numpages)
867 {
868         if (!pat_enabled)
869                 return set_memory_uc(addr, numpages);
870
871         if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
872                 _PAGE_CACHE_WC, NULL))
873                 return -EINVAL;
874
875         return _set_memory_wc(addr, numpages);
876 }
877 EXPORT_SYMBOL(set_memory_wc);
878
879 int _set_memory_wb(unsigned long addr, int numpages)
880 {
881         return change_page_attr_clear(addr, numpages,
882                                       __pgprot(_PAGE_CACHE_MASK));
883 }
884
885 int set_memory_wb(unsigned long addr, int numpages)
886 {
887         free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
888
889         return _set_memory_wb(addr, numpages);
890 }
891 EXPORT_SYMBOL(set_memory_wb);
892
893 int set_memory_x(unsigned long addr, int numpages)
894 {
895         return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
896 }
897 EXPORT_SYMBOL(set_memory_x);
898
899 int set_memory_nx(unsigned long addr, int numpages)
900 {
901         return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
902 }
903 EXPORT_SYMBOL(set_memory_nx);
904
905 int set_memory_ro(unsigned long addr, int numpages)
906 {
907         return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
908 }
909 EXPORT_SYMBOL_GPL(set_memory_ro);
910
911 int set_memory_rw(unsigned long addr, int numpages)
912 {
913         return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
914 }
915 EXPORT_SYMBOL_GPL(set_memory_rw);
916
917 int set_memory_np(unsigned long addr, int numpages)
918 {
919         return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
920 }
921
922 int set_memory_4k(unsigned long addr, int numpages)
923 {
924         return change_page_attr_set_clr(addr, numpages, __pgprot(0),
925                                         __pgprot(0), 1);
926 }
927
928 int set_pages_uc(struct page *page, int numpages)
929 {
930         unsigned long addr = (unsigned long)page_address(page);
931
932         return set_memory_uc(addr, numpages);
933 }
934 EXPORT_SYMBOL(set_pages_uc);
935
936 int set_pages_wb(struct page *page, int numpages)
937 {
938         unsigned long addr = (unsigned long)page_address(page);
939
940         return set_memory_wb(addr, numpages);
941 }
942 EXPORT_SYMBOL(set_pages_wb);
943
944 int set_pages_x(struct page *page, int numpages)
945 {
946         unsigned long addr = (unsigned long)page_address(page);
947
948         return set_memory_x(addr, numpages);
949 }
950 EXPORT_SYMBOL(set_pages_x);
951
952 int set_pages_nx(struct page *page, int numpages)
953 {
954         unsigned long addr = (unsigned long)page_address(page);
955
956         return set_memory_nx(addr, numpages);
957 }
958 EXPORT_SYMBOL(set_pages_nx);
959
960 int set_pages_ro(struct page *page, int numpages)
961 {
962         unsigned long addr = (unsigned long)page_address(page);
963
964         return set_memory_ro(addr, numpages);
965 }
966
967 int set_pages_rw(struct page *page, int numpages)
968 {
969         unsigned long addr = (unsigned long)page_address(page);
970
971         return set_memory_rw(addr, numpages);
972 }
973
974 #ifdef CONFIG_DEBUG_PAGEALLOC
975
976 static int __set_pages_p(struct page *page, int numpages)
977 {
978         struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
979                                 .numpages = numpages,
980                                 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
981                                 .mask_clr = __pgprot(0)};
982
983         return __change_page_attr_set_clr(&cpa, 1);
984 }
985
986 static int __set_pages_np(struct page *page, int numpages)
987 {
988         struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
989                                 .numpages = numpages,
990                                 .mask_set = __pgprot(0),
991                                 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
992
993         return __change_page_attr_set_clr(&cpa, 1);
994 }
995
996 void kernel_map_pages(struct page *page, int numpages, int enable)
997 {
998         if (PageHighMem(page))
999                 return;
1000         if (!enable) {
1001                 debug_check_no_locks_freed(page_address(page),
1002                                            numpages * PAGE_SIZE);
1003         }
1004
1005         /*
1006          * If page allocator is not up yet then do not call c_p_a():
1007          */
1008         if (!debug_pagealloc_enabled)
1009                 return;
1010
1011         /*
1012          * The return value is ignored as the calls cannot fail.
1013          * Large pages are kept enabled at boot time, and are
1014          * split up quickly with DEBUG_PAGEALLOC. If a split
1015          * fails here (due to temporary memory shortage), no damage
1016          * is done because we just keep the large page intact up
1017          * to the next attempt, when it will likely be split up:
1018          */
1019         if (enable)
1020                 __set_pages_p(page, numpages);
1021         else
1022                 __set_pages_np(page, numpages);
1023
1024         /*
1025          * We should perform an IPI and flush all TLBs,
1026          * but that can deadlock, so flush only the current CPU:
1027          */
1028         __flush_tlb_all();
1029
1030         /*
1031          * Try to refill the page pool here. We can do this only after
1032          * the tlb flush.
1033          */
1034         cpa_fill_pool(NULL);
1035 }
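/*
 * With CONFIG_DEBUG_PAGEALLOC the page allocator calls this as pages
 * change hands, roughly (sketch, not the exact call sites):
 *
 *	kernel_map_pages(page, 1 << order, 0);	(on free:  unmap)
 *	kernel_map_pages(page, 1 << order, 1);	(on alloc: map again)
 *
 * so use-after-free accesses through the direct mapping fault instead of
 * silently hitting stale data.
 */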
1036
1037 #ifdef CONFIG_DEBUG_FS
1038 static int dpa_show(struct seq_file *m, void *v)
1039 {
1040         seq_puts(m, "DEBUG_PAGEALLOC\n");
1041         seq_printf(m, "pool_size     : %lu\n", pool_size);
1042         seq_printf(m, "pool_pages    : %lu\n", pool_pages);
1043         seq_printf(m, "pool_low      : %lu\n", pool_low);
1044         seq_printf(m, "pool_used     : %lu\n", pool_used);
1045         seq_printf(m, "pool_failed   : %lu\n", pool_failed);
1046
1047         return 0;
1048 }
1049
1050 static int dpa_open(struct inode *inode, struct file *filp)
1051 {
1052         return single_open(filp, dpa_show, NULL);
1053 }
1054
1055 static const struct file_operations dpa_fops = {
1056         .open           = dpa_open,
1057         .read           = seq_read,
1058         .llseek         = seq_lseek,
1059         .release        = single_release,
1060 };
1061
1062 static int __init debug_pagealloc_proc_init(void)
1063 {
1064         struct dentry *de;
1065
1066         de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
1067                                  &dpa_fops);
1068         if (!de)
1069                 return -ENOMEM;
1070
1071         return 0;
1072 }
1073 __initcall(debug_pagealloc_proc_init);
1074 #endif
1075
1076 #ifdef CONFIG_HIBERNATION
1077
1078 bool kernel_page_present(struct page *page)
1079 {
1080         unsigned int level;
1081         pte_t *pte;
1082
1083         if (PageHighMem(page))
1084                 return false;
1085
1086         pte = lookup_address((unsigned long)page_address(page), &level);
1087         return (pte_val(*pte) & _PAGE_PRESENT);
1088 }
1089
1090 #endif /* CONFIG_HIBERNATION */
1091
1092 #endif /* CONFIG_DEBUG_PAGEALLOC */
1093
1094 /*
1095  * The testcases use internal knowledge of the implementation that shouldn't
1096  * be exposed to the rest of the kernel. Include these directly here.
1097  */
1098 #ifdef CONFIG_CPA_DEBUG
1099 #include "pageattr-test.c"
1100 #endif