/*
 * Copyright (c) 2006, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 * Copyright (C) 2006-2008 Intel Corporation
 * Author: Ashok Raj <ashok.raj@intel.com>
 * Author: Shaohua Li <shaohua.li@intel.com>
 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
 * Author: Fenghua Yu <fenghua.yu@intel.com>
 */

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#define ROOT_SIZE               VTD_PAGE_SIZE
#define CONTEXT_SIZE            VTD_PAGE_SIZE

#define IS_BRIDGE_HOST_DEVICE(pdev) \
                            ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START      (0xfee00000)
#define IOAPIC_RANGE_END        (0xfeefffff)
#define IOVA_START_ADDR         (0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define MAX_AGAW_WIDTH 64

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
#define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
#define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))

/* page table handling */
#define LEVEL_STRIDE            (9)
#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)

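/*
 * AGAW ("adjusted guest address width") encodes the page-table depth:
 * each level resolves LEVEL_STRIDE (9) bits and agaw 0 corresponds to a
 * 2-level, 30-bit table.  For example, with the helpers below a 48-bit
 * width gives agaw (48 - 30) / 9 = 2, which agaw_to_level() turns into
 * a 4-level page table.
 */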
static inline int agaw_to_level(int agaw)
{
        return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
        return 30 + agaw * LEVEL_STRIDE;
}

static inline int width_to_agaw(int width)
{
        return (width - 30) / LEVEL_STRIDE;
}

static inline unsigned int level_to_offset_bits(int level)
{
        return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(unsigned long pfn, int level)
{
        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline unsigned long level_mask(int level)
{
        return -1UL << level_to_offset_bits(level);
}

static inline unsigned long level_size(int level)
{
        return 1UL << level_to_offset_bits(level);
}

static inline unsigned long align_to_level(unsigned long pfn, int level)
{
        return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
        return  1 << ((lvl - 1) * LEVEL_STRIDE);
}

/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
        return mm_to_dma_pfn(page_to_pfn(pg));
}
static inline unsigned long virt_to_dma_pfn(void *p)
{
        return page_to_dma_pfn(virt_to_page(p));
}

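/*
 * Note: VTD_PAGE_SHIFT is 12, so on configurations where the MM page
 * size is also 4KiB the dma_pfn/mm_pfn conversions above are identity
 * operations; they only shift when PAGE_SHIFT is larger than
 * VTD_PAGE_SHIFT.
 */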
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;

/*
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
        u64     val;
        u64     rsvd1;
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
static inline bool root_present(struct root_entry *root)
{
        return (root->val & 1);
}
static inline void set_root_present(struct root_entry *root)
{
        root->val |= 1;
}
static inline void set_root_value(struct root_entry *root, unsigned long value)
{
        root->val |= value & VTD_PAGE_MASK;
}

static inline struct context_entry *
get_context_addr_from_root(struct root_entry *root)
{
        return (struct context_entry *)
                (root_present(root)?phys_to_virt(
                root->val & VTD_PAGE_MASK) :
                NULL);
}

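/*
 * The root table holds one root_entry per bus (ROOT_ENTRY_NR == 256),
 * and each present entry points to a context table indexed by devfn.
 * device_to_context_entry() further down allocates the per-bus context
 * table lazily, the first time a device on that bus is mapped.
 */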
/*
 * low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 3-6: aval
 * 8-23: domain id
 */
struct context_entry {
        u64 lo;
        u64 hi;
};

static inline bool context_present(struct context_entry *context)
{
        return (context->lo & 1);
}
static inline void context_set_present(struct context_entry *context)
{
        context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
        context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
                                                unsigned long value)
{
        context->lo &= (((u64)-1) << 4) | 3;
        context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
                                            unsigned long value)
{
        context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
                                             unsigned long value)
{
        context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
                                         unsigned long value)
{
        context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline void context_clear_entry(struct context_entry *context)
{
        context->lo = 0;
        context->hi = 0;
}

/*
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
struct dma_pte {
        u64 val;
};

static inline void dma_clear_pte(struct dma_pte *pte)
{
        pte->val = 0;
}

static inline void dma_set_pte_readable(struct dma_pte *pte)
{
        pte->val |= DMA_PTE_READ;
}

static inline void dma_set_pte_writable(struct dma_pte *pte)
{
        pte->val |= DMA_PTE_WRITE;
}

static inline void dma_set_pte_snp(struct dma_pte *pte)
{
        pte->val |= DMA_PTE_SNP;
}

static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
{
        pte->val = (pte->val & ~3) | (prot & 3);
}

static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
        return pte->val & VTD_PAGE_MASK;
#else
        /* Must have a full atomic 64-bit read */
        return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}

static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
{
        pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
}

static inline bool dma_pte_present(struct dma_pte *pte)
{
        return (pte->val & 3) != 0;
}

static inline bool dma_pte_superpage(struct dma_pte *pte)
{
        return (pte->val & (1 << 7));
}

static inline int first_pte_in_page(struct dma_pte *pte)
{
        return !((unsigned long)pte & ~VTD_PAGE_MASK);
}

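/*
 * first_pte_in_page() tests whether a pte pointer sits at the start of
 * its page-table page; the clear/free loops below use it to stop at
 * page boundaries so that each page-table page is flushed as a whole.
 */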
/*
 * This domain is a statically identity mapping domain.
 *      1. This domain creates a static 1:1 mapping to all usable memory.
 *      2. It maps to each iommu if successful.
 *      3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/* devices under the same p2p bridge are owned in one domain */
#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)

/* domain represents a virtual machine; more than one device
 * across iommus may be owned by one domain, e.g. kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)

struct dmar_domain {
        int     id;                     /* domain id */
        int     nid;                    /* node id */
        unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/

        struct list_head devices;       /* all devices' list */
        struct iova_domain iovad;       /* iova's that belong to this domain */

        struct dma_pte  *pgd;           /* virtual address */
        int             gaw;            /* max guest address width */

        /* adjusted guest address width, 0 is level 2 30-bit */
        int             agaw;

        int             flags;          /* flags to find out type of domain */

        int             iommu_coherency;/* indicate coherency of iommu access */
        int             iommu_snooping; /* indicate snooping control feature*/
        int             iommu_count;    /* reference count of iommu */
        int             iommu_superpage;/* Level of superpages supported:
                                           0 == 4KiB (no superpages), 1 == 2MiB,
                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
        spinlock_t      iommu_lock;     /* protect iommu set in domain */
        u64             max_addr;       /* maximum mapped address */
};

/* PCI domain-device relationship */
struct device_domain_info {
        struct list_head link;  /* link to domain siblings */
        struct list_head global; /* link to global list */
        int segment;            /* PCI domain */
        u8 bus;                 /* PCI bus number */
        u8 devfn;               /* PCI devfn number */
        struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
        struct intel_iommu *iommu; /* IOMMU used by this device */
        struct dmar_domain *domain; /* pointer to domain */
};

static void flush_unmaps_timeout(unsigned long data);

DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);

#define HIGH_WATER_MARK 250
struct deferred_flush_tables {
        int next;
        struct iova *iova[HIGH_WATER_MARK];
        struct dmar_domain *domain[HIGH_WATER_MARK];
};

static struct deferred_flush_tables *deferred_flush;

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static DEFINE_SPINLOCK(async_umap_flush_lock);
static LIST_HEAD(unmaps_to_do);

static int timer_on;
static long list_size;

static void domain_remove_dev_info(struct dmar_domain *domain);

#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

static struct iommu_ops intel_iommu_ops;

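/*
 * Example usage (the options are comma separated, as parsed below),
 * e.g. on the kernel command line:
 *
 *      intel_iommu=on,strict,igfx_off
 *
 * enables the IOMMU, disables batched IOTLB flushing and skips mapping
 * of the integrated graphics device.
 */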
static int __init intel_iommu_setup(char *str)
{
        if (!str)
                return -EINVAL;
        while (*str) {
                if (!strncmp(str, "on", 2)) {
                        dmar_disabled = 0;
                        printk(KERN_INFO "Intel-IOMMU: enabled\n");
                } else if (!strncmp(str, "off", 3)) {
                        dmar_disabled = 1;
                        printk(KERN_INFO "Intel-IOMMU: disabled\n");
                } else if (!strncmp(str, "igfx_off", 8)) {
                        dmar_map_gfx = 0;
                        printk(KERN_INFO
                                "Intel-IOMMU: disable GFX device mapping\n");
                } else if (!strncmp(str, "forcedac", 8)) {
                        printk(KERN_INFO
                                "Intel-IOMMU: Forcing DAC for PCI devices\n");
                        dmar_forcedac = 1;
                } else if (!strncmp(str, "strict", 6)) {
                        printk(KERN_INFO
                                "Intel-IOMMU: disable batched IOTLB flush\n");
                        intel_iommu_strict = 1;
                } else if (!strncmp(str, "sp_off", 6)) {
                        printk(KERN_INFO
                                "Intel-IOMMU: disable supported super page\n");
                        intel_iommu_superpage = 0;
                }

                str += strcspn(str, ",");
                while (*str == ',')
                        str++;
        }
        return 0;
}
__setup("intel_iommu=", intel_iommu_setup);

static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;

static inline void *alloc_pgtable_page(int node)
{
        struct page *page;
        void *vaddr = NULL;

        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
        if (page)
                vaddr = page_address(page);
        return vaddr;
}

static inline void free_pgtable_page(void *vaddr)
{
        free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
        kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void * alloc_devinfo_mem(void)
{
        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
        kmem_cache_free(iommu_devinfo_cache, vaddr);
}

struct iova *alloc_iova_mem(void)
{
        return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
}

void free_iova_mem(struct iova *iova)
{
        kmem_cache_free(iommu_iova_cache, iova);
}


static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
        unsigned long sagaw;
        int agaw = -1;

        sagaw = cap_sagaw(iommu->cap);
        for (agaw = width_to_agaw(max_gaw);
             agaw >= 0; agaw--) {
                if (test_bit(agaw, &sagaw))
                        break;
        }

        return agaw;
}

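/*
 * Example: with max_gaw == DEFAULT_DOMAIN_ADDRESS_WIDTH (48) the loop
 * above starts at agaw 2 (a 4-level table) and walks downwards until it
 * finds an agaw bit set in the SAGAW capability field, so an iommu that
 * only supports 3-level tables ends up with agaw 1.
 */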
/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * Calculate agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw and fall
 * back to a smaller supported agaw for iommus that don't support the
 * default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}

/* This function only returns a single iommu in a domain */
static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
        int iommu_id;

        /* si_domain and vm domain should not get here. */
        BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
        BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);

        iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
                return NULL;

        return g_iommus[iommu_id];
}

static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
        int i;

        i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);

        domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;

        for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
                if (!ecap_coherent(g_iommus[i]->ecap)) {
                        domain->iommu_coherency = 0;
                        break;
                }
        }
}

static void domain_update_iommu_snooping(struct dmar_domain *domain)
{
        int i;

        domain->iommu_snooping = 1;

        for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
                if (!ecap_sc_support(g_iommus[i]->ecap)) {
                        domain->iommu_snooping = 0;
                        break;
                }
        }
}

static void domain_update_iommu_superpage(struct dmar_domain *domain)
{
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu = NULL;
        int mask = 0xf;

        if (!intel_iommu_superpage) {
                domain->iommu_superpage = 0;
                return;
        }

        /* set iommu_superpage to the smallest common denominator */
        for_each_active_iommu(iommu, drhd) {
                mask &= cap_super_page_val(iommu->cap);
                if (!mask) {
                        break;
                }
        }
        domain->iommu_superpage = fls(mask);
}

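/*
 * Per the VT-d capability register, cap_super_page_val() is a bitmask
 * of supported super-page sizes (bit 0 == 2MiB, bit 1 == 1GiB, ...), so
 * after AND-ing the masks of all active iommus fls(mask) yields the
 * largest super-page level that every iommu supports; see the
 * iommu_superpage comment in struct dmar_domain above.
 */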
/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
        domain_update_iommu_coherency(domain);
        domain_update_iommu_snooping(domain);
        domain_update_iommu_superpage(domain);
}

static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
{
        struct dmar_drhd_unit *drhd = NULL;
        int i;

        for_each_drhd_unit(drhd) {
                if (drhd->ignored)
                        continue;
                if (segment != drhd->segment)
                        continue;

                for (i = 0; i < drhd->devices_cnt; i++) {
                        if (drhd->devices[i] &&
                            drhd->devices[i]->bus->number == bus &&
                            drhd->devices[i]->devfn == devfn)
                                return drhd->iommu;
                        if (drhd->devices[i] &&
                            drhd->devices[i]->subordinate &&
                            drhd->devices[i]->subordinate->number <= bus &&
                            drhd->devices[i]->subordinate->subordinate >= bus)
                                return drhd->iommu;
                }

                if (drhd->include_all)
                        return drhd->iommu;
        }

        return NULL;
}

static void domain_flush_cache(struct dmar_domain *domain,
                               void *addr, int size)
{
        if (!domain->iommu_coherency)
                clflush_cache_range(addr, size);
}

/* Gets context entry for a given bus and devfn */
static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
                u8 bus, u8 devfn)
{
        struct root_entry *root;
        struct context_entry *context;
        unsigned long phy_addr;
        unsigned long flags;

        spin_lock_irqsave(&iommu->lock, flags);
        root = &iommu->root_entry[bus];
        context = get_context_addr_from_root(root);
        if (!context) {
                context = (struct context_entry *)
                                alloc_pgtable_page(iommu->node);
                if (!context) {
                        spin_unlock_irqrestore(&iommu->lock, flags);
                        return NULL;
                }
                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
                phy_addr = virt_to_phys((void *)context);
                set_root_value(root, phy_addr);
                set_root_present(root);
                __iommu_flush_cache(iommu, root, sizeof(*root));
        }
        spin_unlock_irqrestore(&iommu->lock, flags);
        return &context[devfn];
}

static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
        struct root_entry *root;
        struct context_entry *context;
        int ret;
        unsigned long flags;

        spin_lock_irqsave(&iommu->lock, flags);
        root = &iommu->root_entry[bus];
        context = get_context_addr_from_root(root);
        if (!context) {
                ret = 0;
                goto out;
        }
        ret = context_present(&context[devfn]);
out:
        spin_unlock_irqrestore(&iommu->lock, flags);
        return ret;
}

static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
        struct root_entry *root;
        struct context_entry *context;
        unsigned long flags;

        spin_lock_irqsave(&iommu->lock, flags);
        root = &iommu->root_entry[bus];
        context = get_context_addr_from_root(root);
        if (context) {
                context_clear_entry(&context[devfn]);
                __iommu_flush_cache(iommu, &context[devfn], \
                        sizeof(*context));
        }
        spin_unlock_irqrestore(&iommu->lock, flags);
}

static void free_context_table(struct intel_iommu *iommu)
{
        struct root_entry *root;
        int i;
        unsigned long flags;
        struct context_entry *context;

        spin_lock_irqsave(&iommu->lock, flags);
        if (!iommu->root_entry) {
                goto out;
        }
        for (i = 0; i < ROOT_ENTRY_NR; i++) {
                root = &iommu->root_entry[i];
                context = get_context_addr_from_root(root);
                if (context)
                        free_pgtable_page(context);
        }
        free_pgtable_page(iommu->root_entry);
        iommu->root_entry = NULL;
out:
        spin_unlock_irqrestore(&iommu->lock, flags);
}

static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
                                      unsigned long pfn, int target_level)
{
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
        struct dma_pte *parent, *pte = NULL;
        int level = agaw_to_level(domain->agaw);
        int offset;

        BUG_ON(!domain->pgd);
        BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
        parent = domain->pgd;

        while (level > 0) {
                void *tmp_page;

                offset = pfn_level_offset(pfn, level);
                pte = &parent[offset];
                if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
                        break;
                if (level == target_level)
                        break;

                if (!dma_pte_present(pte)) {
                        uint64_t pteval;

                        tmp_page = alloc_pgtable_page(domain->nid);

                        if (!tmp_page)
                                return NULL;

                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
                        if (cmpxchg64(&pte->val, 0ULL, pteval)) {
                                /* Someone else set it while we were thinking; use theirs. */
                                free_pgtable_page(tmp_page);
                        } else {
                                dma_pte_addr(pte);
                                domain_flush_cache(domain, pte, sizeof(*pte));
                        }
                }
                parent = phys_to_virt(dma_pte_addr(pte));
                level--;
        }

        return pte;
}


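/*
 * pfn_to_dma_pte() walks (and, where needed, allocates) the page table
 * down to @target_level.  A target_level of 0 means "stop at whatever
 * leaf is present", including an existing superpage.  The cmpxchg64()
 * above lets two CPUs race to install the same intermediate table
 * without a lock; the loser simply frees its page and uses the
 * winner's.
 */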
/* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
                                         unsigned long pfn,
                                         int level, int *large_page)
{
        struct dma_pte *parent, *pte = NULL;
        int total = agaw_to_level(domain->agaw);
        int offset;

        parent = domain->pgd;
        while (level <= total) {
                offset = pfn_level_offset(pfn, total);
                pte = &parent[offset];
                if (level == total)
                        return pte;

                if (!dma_pte_present(pte)) {
                        *large_page = total;
                        break;
                }

                if (pte->val & DMA_PTE_LARGE_PAGE) {
                        *large_page = total;
                        return pte;
                }

                parent = phys_to_virt(dma_pte_addr(pte));
                total--;
        }
        return NULL;
}

/* clear last level pte; a tlb flush should follow */
static int dma_pte_clear_range(struct dmar_domain *domain,
                                unsigned long start_pfn,
                                unsigned long last_pfn)
{
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
        unsigned int large_page = 1;
        struct dma_pte *first_pte, *pte;
        int order;

        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
        BUG_ON(start_pfn > last_pfn);

        /* we don't need lock here; nobody else touches the iova range */
        do {
                large_page = 1;
                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
                if (!pte) {
                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
                        continue;
                }
                do {
                        dma_clear_pte(pte);
                        start_pfn += lvl_to_nr_pages(large_page);
                        pte++;
                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));

                domain_flush_cache(domain, first_pte,
                                   (void *)pte - (void *)first_pte);

        } while (start_pfn && start_pfn <= last_pfn);

        order = (large_page - 1) * 9;
        return order;
}

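/*
 * The order returned above, (large_page - 1) * 9, reflects the level of
 * the last PTE that was cleared (0 for 4KiB pages, 9 for 2MiB, ...), so
 * a caller can tell how large a (super)page the cleared range actually
 * covered.
 */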
/* free page table pages. last level pte should already be cleared */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
                                   unsigned long start_pfn,
                                   unsigned long last_pfn)
{
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
        struct dma_pte *first_pte, *pte;
        int total = agaw_to_level(domain->agaw);
        int level;
        unsigned long tmp;
        int large_page = 2;

        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
        BUG_ON(start_pfn > last_pfn);

        /* We don't need lock here; nobody else touches the iova range */
        level = 2;
        while (level <= total) {
                tmp = align_to_level(start_pfn, level);

                /* If we can't even clear one PTE at this level, we're done */
                if (tmp + level_size(level) - 1 > last_pfn)
                        return;

                do {
                        large_page = level;
                        first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
                        if (large_page > level)
                                level = large_page + 1;
                        if (!pte) {
                                tmp = align_to_level(tmp + 1, level + 1);
                                continue;
                        }
                        do {
                                if (dma_pte_present(pte)) {
                                        free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
                                        dma_clear_pte(pte);
                                }
                                pte++;
                                tmp += level_size(level);
                        } while (!first_pte_in_page(pte) &&
                                 tmp + level_size(level) - 1 <= last_pfn);

                        domain_flush_cache(domain, first_pte,
                                           (void *)pte - (void *)first_pte);

                } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
                level++;
        }
        /* free pgd */
        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
                free_pgtable_page(domain->pgd);
                domain->pgd = NULL;
        }
}

/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
        struct root_entry *root;
        unsigned long flags;

        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
        if (!root)
                return -ENOMEM;

        __iommu_flush_cache(iommu, root, ROOT_SIZE);

        spin_lock_irqsave(&iommu->lock, flags);
        iommu->root_entry = root;
        spin_unlock_irqrestore(&iommu->lock, flags);

        return 0;
}

static void iommu_set_root_entry(struct intel_iommu *iommu)
{
        void *addr;
        u32 sts;
        unsigned long flag;

        addr = iommu->root_entry;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));

        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

        /* Make sure the hardware completes it */
        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
                      readl, (sts & DMA_GSTS_RTPS), sts);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

static void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
        u32 val;
        unsigned long flag;

        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
                return;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

        /* Make sure the hardware completes it */
        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
                      readl, (!(val & DMA_GSTS_WBFS)), val);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* the return value determines whether we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
                                  u16 did, u16 source_id, u8 function_mask,
                                  u64 type)
{
        u64 val = 0;
        unsigned long flag;

        switch (type) {
        case DMA_CCMD_GLOBAL_INVL:
                val = DMA_CCMD_GLOBAL_INVL;
                break;
        case DMA_CCMD_DOMAIN_INVL:
                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
                break;
        case DMA_CCMD_DEVICE_INVL:
                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
                break;
        default:
                BUG();
        }
        val |= DMA_CCMD_ICC;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

        /* Make sure the hardware completes it */
        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
                dmar_readq, (!(val & DMA_CCMD_ICC)), val);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* the return value determines whether we need a write buffer flush */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
                                u64 addr, unsigned int size_order, u64 type)
{
        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
        u64 val = 0, val_iva = 0;
        unsigned long flag;

        switch (type) {
        case DMA_TLB_GLOBAL_FLUSH:
                /* a global flush doesn't need to set the IVA_REG */
                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
                break;
        case DMA_TLB_DSI_FLUSH:
                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
                break;
        case DMA_TLB_PSI_FLUSH:
                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
                /* Note: always flush non-leaf currently */
                val_iva = size_order | addr;
                break;
        default:
                BUG();
        }
        /* Note: set drain read/write */
#if 0
        /*
         * This is probably just for extra safety. It looks like we can
         * ignore it without any impact.
         */
        if (cap_read_drain(iommu->cap))
                val |= DMA_TLB_READ_DRAIN;
#endif
        if (cap_write_drain(iommu->cap))
                val |= DMA_TLB_WRITE_DRAIN;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        /* Note: Only uses first TLB reg currently */
        if (val_iva)
                dmar_writeq(iommu->reg + tlb_offset, val_iva);
        dmar_writeq(iommu->reg + tlb_offset + 8, val);

        /* Make sure the hardware completes it */
        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
                dmar_readq, (!(val & DMA_TLB_IVT)), val);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

        /* check IOTLB invalidation granularity */
        if (DMA_TLB_IAIG(val) == 0)
                printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
                pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
                        (unsigned long long)DMA_TLB_IIRG(type),
                        (unsigned long long)DMA_TLB_IAIG(val));
}

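/*
 * For DMA_TLB_PSI_FLUSH the IVA value built above is simply
 * "size_order | addr": the address must be aligned to 2^size_order
 * VT-d pages, so the low bits are free to carry the address-mask order.
 * After the invalidation completes, DMA_TLB_IAIG() reports the
 * granularity the hardware actually used, which is why it is compared
 * against the requested granularity above.
 */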
static struct device_domain_info *iommu_support_dev_iotlb(
        struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
{
        int found = 0;
        unsigned long flags;
        struct device_domain_info *info;
        struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);

        if (!ecap_dev_iotlb_support(iommu->ecap))
                return NULL;

        if (!iommu->qi)
                return NULL;

        spin_lock_irqsave(&device_domain_lock, flags);
        list_for_each_entry(info, &domain->devices, link)
                if (info->bus == bus && info->devfn == devfn) {
                        found = 1;
                        break;
                }
        spin_unlock_irqrestore(&device_domain_lock, flags);

        if (!found || !info->dev)
                return NULL;

        if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
                return NULL;

        if (!dmar_find_matched_atsr_unit(info->dev))
                return NULL;

        info->iommu = iommu;

        return info;
}

static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
        if (!info)
                return;

        pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
}

static void iommu_disable_dev_iotlb(struct device_domain_info *info)
{
        if (!info->dev || !pci_ats_enabled(info->dev))
                return;

        pci_disable_ats(info->dev);
}

static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
                                  u64 addr, unsigned mask)
{
        u16 sid, qdep;
        unsigned long flags;
        struct device_domain_info *info;

        spin_lock_irqsave(&device_domain_lock, flags);
        list_for_each_entry(info, &domain->devices, link) {
                if (!info->dev || !pci_ats_enabled(info->dev))
                        continue;

                sid = info->bus << 8 | info->devfn;
                qdep = pci_ats_queue_depth(info->dev);
                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
        }
        spin_unlock_irqrestore(&device_domain_lock, flags);
}

static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
                                  unsigned long pfn, unsigned int pages, int map)
{
        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;

        BUG_ON(pages == 0);

        /*
         * Fall back to a domain-selective flush if there is no PSI support
         * or the size is too big.
         * PSI requires the page size to be 2 ^ x, and the base address is
         * naturally aligned to the size.
         */
        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
                iommu->flush.flush_iotlb(iommu, did, 0, 0,
                                                DMA_TLB_DSI_FLUSH);
        else
                iommu->flush.flush_iotlb(iommu, did, addr, mask,
                                                DMA_TLB_PSI_FLUSH);

        /*
         * In caching mode, changes of pages from non-present to present require
         * flush. However, device IOTLB doesn't need to be flushed in this case.
         */
        if (!cap_caching_mode(iommu->cap) || !map)
                iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
}

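/*
 * Example: flushing 3 pages gives mask = ilog2(roundup_pow_of_two(3))
 * = 2, i.e. a 4-page aligned invalidation, the smallest power-of-two
 * region that PSI can express which still covers the request.
 */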
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
        u32 pmen;
        unsigned long flags;

        raw_spin_lock_irqsave(&iommu->register_lock, flags);
        pmen = readl(iommu->reg + DMAR_PMEN_REG);
        pmen &= ~DMA_PMEN_EPM;
        writel(pmen, iommu->reg + DMAR_PMEN_REG);

        /* wait for the protected region status bit to clear */
        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
                readl, !(pmen & DMA_PMEN_PRS), pmen);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static int iommu_enable_translation(struct intel_iommu *iommu)
{
        u32 sts;
        unsigned long flags;

        raw_spin_lock_irqsave(&iommu->register_lock, flags);
        iommu->gcmd |= DMA_GCMD_TE;
        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

        /* Make sure the hardware completes it */
        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
                      readl, (sts & DMA_GSTS_TES), sts);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
        return 0;
}

static int iommu_disable_translation(struct intel_iommu *iommu)
{
        u32 sts;
        unsigned long flag;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        iommu->gcmd &= ~DMA_GCMD_TE;
        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

        /* Make sure the hardware completes it */
        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
                      readl, (!(sts & DMA_GSTS_TES)), sts);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
        return 0;
}


static int iommu_init_domains(struct intel_iommu *iommu)
{
        unsigned long ndomains;
        unsigned long nlongs;

        ndomains = cap_ndoms(iommu->cap);
        pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
                        ndomains);
        nlongs = BITS_TO_LONGS(ndomains);

        spin_lock_init(&iommu->lock);

        /* TBD: there might be 64K domains,
         * consider other allocation for future chip
         */
        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
        if (!iommu->domain_ids) {
                printk(KERN_ERR "Allocating domain id array failed\n");
                return -ENOMEM;
        }
        iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
                        GFP_KERNEL);
        if (!iommu->domains) {
                printk(KERN_ERR "Allocating domain array failed\n");
                return -ENOMEM;
        }

        /*
         * if Caching mode is set, then invalid translations are tagged
         * with domain id 0. Hence we need to pre-allocate it.
         */
        if (cap_caching_mode(iommu->cap))
                set_bit(0, iommu->domain_ids);
        return 0;
}


static void domain_exit(struct dmar_domain *domain);
static void vm_domain_exit(struct dmar_domain *domain);

void free_dmar_iommu(struct intel_iommu *iommu)
{
        struct dmar_domain *domain;
        int i;
        unsigned long flags;

        if ((iommu->domains) && (iommu->domain_ids)) {
                for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
                        domain = iommu->domains[i];
                        clear_bit(i, iommu->domain_ids);

                        spin_lock_irqsave(&domain->iommu_lock, flags);
                        if (--domain->iommu_count == 0) {
                                if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
                                        vm_domain_exit(domain);
                                else
                                        domain_exit(domain);
                        }
                        spin_unlock_irqrestore(&domain->iommu_lock, flags);
                }
        }

        if (iommu->gcmd & DMA_GCMD_TE)
                iommu_disable_translation(iommu);

        if (iommu->irq) {
                irq_set_handler_data(iommu->irq, NULL);
                /* This will mask the irq */
                free_irq(iommu->irq, iommu);
                destroy_irq(iommu->irq);
        }

        kfree(iommu->domains);
        kfree(iommu->domain_ids);

        g_iommus[iommu->seq_id] = NULL;

        /* if all iommus are freed, free g_iommus */
        for (i = 0; i < g_num_of_iommus; i++) {
                if (g_iommus[i])
                        break;
        }

        if (i == g_num_of_iommus)
                kfree(g_iommus);

        /* free context mapping */
        free_context_table(iommu);
}

static struct dmar_domain *alloc_domain(void)
{
        struct dmar_domain *domain;

        domain = alloc_domain_mem();
        if (!domain)
                return NULL;

        domain->nid = -1;
        memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
        domain->flags = 0;

        return domain;
}

static int iommu_attach_domain(struct dmar_domain *domain,
                               struct intel_iommu *iommu)
{
        int num;
        unsigned long ndomains;
        unsigned long flags;

        ndomains = cap_ndoms(iommu->cap);

        spin_lock_irqsave(&iommu->lock, flags);

        num = find_first_zero_bit(iommu->domain_ids, ndomains);
        if (num >= ndomains) {
                spin_unlock_irqrestore(&iommu->lock, flags);
                printk(KERN_ERR "IOMMU: no free domain ids\n");
                return -ENOMEM;
        }

        domain->id = num;
        set_bit(num, iommu->domain_ids);
        set_bit(iommu->seq_id, &domain->iommu_bmp);
        iommu->domains[num] = domain;
        spin_unlock_irqrestore(&iommu->lock, flags);

        return 0;
}

static void iommu_detach_domain(struct dmar_domain *domain,
                                struct intel_iommu *iommu)
{
        unsigned long flags;
        int num, ndomains;
        int found = 0;

        spin_lock_irqsave(&iommu->lock, flags);
        ndomains = cap_ndoms(iommu->cap);
        for_each_set_bit(num, iommu->domain_ids, ndomains) {
                if (iommu->domains[num] == domain) {
                        found = 1;
                        break;
                }
        }

        if (found) {
                clear_bit(num, iommu->domain_ids);
                clear_bit(iommu->seq_id, &domain->iommu_bmp);
                iommu->domains[num] = NULL;
        }
        spin_unlock_irqrestore(&iommu->lock, flags);
}

static struct iova_domain reserved_iova_list;
static struct lock_class_key reserved_rbtree_key;

static int dmar_init_reserved_ranges(void)
{
        struct pci_dev *pdev = NULL;
        struct iova *iova;
        int i;

        init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);

        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
                &reserved_rbtree_key);

        /* IOAPIC ranges shouldn't be accessed by DMA */
        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
                IOVA_PFN(IOAPIC_RANGE_END));
        if (!iova) {
                printk(KERN_ERR "Reserve IOAPIC range failed\n");
                return -ENODEV;
        }

        /* Reserve all PCI MMIO to avoid peer-to-peer access */
        for_each_pci_dev(pdev) {
                struct resource *r;

                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
                        r = &pdev->resource[i];
                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
                                continue;
                        iova = reserve_iova(&reserved_iova_list,
                                            IOVA_PFN(r->start),
                                            IOVA_PFN(r->end));
                        if (!iova) {
                                printk(KERN_ERR "Reserve iova failed\n");
                                return -ENODEV;
                        }
                }
        }
        return 0;
}

static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}

static inline int guestwidth_to_adjustwidth(int gaw)
{
        int agaw;
        int r = (gaw - 12) % 9;

        if (r == 0)
                agaw = gaw;
        else
                agaw = gaw + 9 - r;
        if (agaw > 64)
                agaw = 64;
        return agaw;
}

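/*
 * Example: a guest width of 48 is already a whole number of 9-bit
 * levels above 12 and stays 48, while a guest width of 40 gets rounded
 * up to the next step: 40 + 9 - 1 = 48.
 */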
1438 static int domain_init(struct dmar_domain *domain, int guest_width)
1439 {
1440         struct intel_iommu *iommu;
1441         int adjust_width, agaw;
1442         unsigned long sagaw;
1443
1444         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1445         spin_lock_init(&domain->iommu_lock);
1446
1447         domain_reserve_special_ranges(domain);
1448
1449         /* calculate AGAW */
1450         iommu = domain_get_iommu(domain);
1451         if (guest_width > cap_mgaw(iommu->cap))
1452                 guest_width = cap_mgaw(iommu->cap);
1453         domain->gaw = guest_width;
1454         adjust_width = guestwidth_to_adjustwidth(guest_width);
1455         agaw = width_to_agaw(adjust_width);
1456         sagaw = cap_sagaw(iommu->cap);
1457         if (!test_bit(agaw, &sagaw)) {
1458                 /* hardware doesn't support it, choose a bigger one */
1459                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1460                 agaw = find_next_bit(&sagaw, 5, agaw);
1461                 if (agaw >= 5)
1462                         return -ENODEV;
1463         }
1464         domain->agaw = agaw;
1465         INIT_LIST_HEAD(&domain->devices);
1466
1467         if (ecap_coherent(iommu->ecap))
1468                 domain->iommu_coherency = 1;
1469         else
1470                 domain->iommu_coherency = 0;
1471
1472         if (ecap_sc_support(iommu->ecap))
1473                 domain->iommu_snooping = 1;
1474         else
1475                 domain->iommu_snooping = 0;
1476
1477         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1478         domain->iommu_count = 1;
1479         domain->nid = iommu->node;
1480
1481         /* always allocate the top pgd */
1482         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1483         if (!domain->pgd)
1484                 return -ENOMEM;
1485         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1486         return 0;
1487 }
1488
1489 static void domain_exit(struct dmar_domain *domain)
1490 {
1491         struct dmar_drhd_unit *drhd;
1492         struct intel_iommu *iommu;
1493
1494         /* Domain 0 is reserved, so dont process it */
1495         if (!domain)
1496                 return;
1497
1498         /* Flush any lazy unmaps that may reference this domain */
1499         if (!intel_iommu_strict)
1500                 flush_unmaps_timeout(0);
1501
1502         domain_remove_dev_info(domain);
1503         /* destroy iovas */
1504         put_iova_domain(&domain->iovad);
1505
1506         /* clear ptes */
1507         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1508
1509         /* free page tables */
1510         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1511
1512         for_each_active_iommu(iommu, drhd)
1513                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1514                         iommu_detach_domain(domain, iommu);
1515
1516         free_domain_mem(domain);
1517 }
1518
1519 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1520                                  u8 bus, u8 devfn, int translation)
1521 {
1522         struct context_entry *context;
1523         unsigned long flags;
1524         struct intel_iommu *iommu;
1525         struct dma_pte *pgd;
1526         unsigned long num;
1527         unsigned long ndomains;
1528         int id;
1529         int agaw;
1530         struct device_domain_info *info = NULL;
1531
1532         pr_debug("Set context mapping for %02x:%02x.%d\n",
1533                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1534
1535         BUG_ON(!domain->pgd);
1536         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1537                translation != CONTEXT_TT_MULTI_LEVEL);
1538
1539         iommu = device_to_iommu(segment, bus, devfn);
1540         if (!iommu)
1541                 return -ENODEV;
1542
1543         context = device_to_context_entry(iommu, bus, devfn);
1544         if (!context)
1545                 return -ENOMEM;
1546         spin_lock_irqsave(&iommu->lock, flags);
1547         if (context_present(context)) {
1548                 spin_unlock_irqrestore(&iommu->lock, flags);
1549                 return 0;
1550         }
1551
1552         id = domain->id;
1553         pgd = domain->pgd;
1554
1555         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1556             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1557                 int found = 0;
1558
1559                 /* find an available domain id for this device in iommu */
1560                 ndomains = cap_ndoms(iommu->cap);
1561                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1562                         if (iommu->domains[num] == domain) {
1563                                 id = num;
1564                                 found = 1;
1565                                 break;
1566                         }
1567                 }
1568
1569                 if (found == 0) {
1570                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1571                         if (num >= ndomains) {
1572                                 spin_unlock_irqrestore(&iommu->lock, flags);
1573                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1574                                 return -EFAULT;
1575                         }
1576
1577                         set_bit(num, iommu->domain_ids);
1578                         iommu->domains[num] = domain;
1579                         id = num;
1580                 }
1581
1582                 /* Skip top levels of page tables for an
1583                  * iommu that has a smaller agaw than the default.
1584                  * Unnecessary in pass-through mode.
1585                  */
1586                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1587                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1588                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1589                                 if (!dma_pte_present(pgd)) {
1590                                         spin_unlock_irqrestore(&iommu->lock, flags);
1591                                         return -ENOMEM;
1592                                 }
1593                         }
1594                 }
1595         }
1596
1597         context_set_domain_id(context, id);
1598
1599         if (translation != CONTEXT_TT_PASS_THROUGH) {
1600                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1601                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1602                                      CONTEXT_TT_MULTI_LEVEL;
1603         }
1604         /*
1605          * In pass through mode, AW must be programmed to indicate the largest
1606          * AGAW value supported by hardware. And ASR is ignored by hardware.
1607          */
1608         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1609                 context_set_address_width(context, iommu->msagaw);
1610         else {
1611                 context_set_address_root(context, virt_to_phys(pgd));
1612                 context_set_address_width(context, iommu->agaw);
1613         }
1614
1615         context_set_translation_type(context, translation);
1616         context_set_fault_enable(context);
1617         context_set_present(context);
1618         domain_flush_cache(domain, context, sizeof(*context));
1619
1620         /*
1621          * It's a non-present to present mapping. If hardware doesn't cache
1622          * non-present entries we only need to flush the write-buffer. If it
1623          * _does_ cache non-present entries, then it does so in the special
1624          * domain #0, which we have to flush:
1625          */
1626         if (cap_caching_mode(iommu->cap)) {
1627                 iommu->flush.flush_context(iommu, 0,
1628                                            (((u16)bus) << 8) | devfn,
1629                                            DMA_CCMD_MASK_NOBIT,
1630                                            DMA_CCMD_DEVICE_INVL);
1631                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1632         } else {
1633                 iommu_flush_write_buffer(iommu);
1634         }
1635         iommu_enable_dev_iotlb(info);
1636         spin_unlock_irqrestore(&iommu->lock, flags);
1637
1638         spin_lock_irqsave(&domain->iommu_lock, flags);
1639         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1640                 domain->iommu_count++;
1641                 if (domain->iommu_count == 1)
1642                         domain->nid = iommu->node;
1643                 domain_update_iommu_cap(domain);
1644         }
1645         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1646         return 0;
1647 }
1648
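     /*
      * Map the device itself and then every bridge on the path up to its
      * upstream PCIe-to-PCI bridge, since devices behind the same bridge
      * present the bridge's source-id to the iommu.
      */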
1649 static int
1650 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1651                         int translation)
1652 {
1653         int ret;
1654         struct pci_dev *tmp, *parent;
1655
1656         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1657                                          pdev->bus->number, pdev->devfn,
1658                                          translation);
1659         if (ret)
1660                 return ret;
1661
1662         /* dependent device mapping */
1663         tmp = pci_find_upstream_pcie_bridge(pdev);
1664         if (!tmp)
1665                 return 0;
1666         /* Secondary interface's bus number and devfn 0 */
1667         parent = pdev->bus->self;
1668         while (parent != tmp) {
1669                 ret = domain_context_mapping_one(domain,
1670                                                  pci_domain_nr(parent->bus),
1671                                                  parent->bus->number,
1672                                                  parent->devfn, translation);
1673                 if (ret)
1674                         return ret;
1675                 parent = parent->bus->self;
1676         }
1677         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1678                 return domain_context_mapping_one(domain,
1679                                         pci_domain_nr(tmp->subordinate),
1680                                         tmp->subordinate->number, 0,
1681                                         translation);
1682         else /* this is a legacy PCI bridge */
1683                 return domain_context_mapping_one(domain,
1684                                                   pci_domain_nr(tmp->bus),
1685                                                   tmp->bus->number,
1686                                                   tmp->devfn,
1687                                                   translation);
1688 }
1689
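     /*
      * Check whether the device, and every bridge on the path to its
      * upstream PCIe bridge, already has a present context entry; returns 0
      * as soon as one of them does not.
      */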
1690 static int domain_context_mapped(struct pci_dev *pdev)
1691 {
1692         int ret;
1693         struct pci_dev *tmp, *parent;
1694         struct intel_iommu *iommu;
1695
1696         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1697                                 pdev->devfn);
1698         if (!iommu)
1699                 return -ENODEV;
1700
1701         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1702         if (!ret)
1703                 return ret;
1704         /* dependent device mapping */
1705         tmp = pci_find_upstream_pcie_bridge(pdev);
1706         if (!tmp)
1707                 return ret;
1708         /* Secondary interface's bus number and devfn 0 */
1709         parent = pdev->bus->self;
1710         while (parent != tmp) {
1711                 ret = device_context_mapped(iommu, parent->bus->number,
1712                                             parent->devfn);
1713                 if (!ret)
1714                         return ret;
1715                 parent = parent->bus->self;
1716         }
1717         if (pci_is_pcie(tmp))
1718                 return device_context_mapped(iommu, tmp->subordinate->number,
1719                                              0);
1720         else
1721                 return device_context_mapped(iommu, tmp->bus->number,
1722                                              tmp->devfn);
1723 }
1724
1725 /* Returns a number of VTD pages, but aligned to MM page size */
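     /*
      * e.g. with 4KiB MM and VT-d pages, an in-page offset of 0x234 and a
      * size of 0x2000 span three 4KiB chunks, so this returns 3.
      */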
1726 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1727                                             size_t size)
1728 {
1729         host_addr &= ~PAGE_MASK;
1730         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1731 }
1732
1733 /* Return largest possible superpage level for a given mapping */
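     /*
      * e.g. with one level of superpage support, if iov_pfn and phy_pfn are
      * both 2MiB aligned (multiples of 512 4KiB pages) and pages >= 512,
      * this returns 2 (2MiB pages); any misalignment or a shorter run keeps
      * it at level 1 (4KiB pages).
      */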
1734 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1735                                           unsigned long iov_pfn,
1736                                           unsigned long phy_pfn,
1737                                           unsigned long pages)
1738 {
1739         int support, level = 1;
1740         unsigned long pfnmerge;
1741
1742         support = domain->iommu_superpage;
1743
1744         /* To use a large page, the virtual *and* physical addresses
1745            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1746            of them will mean we have to use smaller pages. So just
1747            merge them and check both at once. */
1748         pfnmerge = iov_pfn | phy_pfn;
1749
1750         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1751                 pages >>= VTD_STRIDE_SHIFT;
1752                 if (!pages)
1753                         break;
1754                 pfnmerge >>= VTD_STRIDE_SHIFT;
1755                 level++;
1756                 support--;
1757         }
1758         return level;
1759 }
1760
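     /*
      * Core mapping loop: install PTEs covering nr_pages starting at
      * iov_pfn, taking the physical pages either from a contiguous phys_pfn
      * or from a scatterlist. Superpage PTEs are used when alignment and
      * length allow, and the CPU cache is flushed for each block of PTEs
      * written.
      */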
1761 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1762                             struct scatterlist *sg, unsigned long phys_pfn,
1763                             unsigned long nr_pages, int prot)
1764 {
1765         struct dma_pte *first_pte = NULL, *pte = NULL;
1766         phys_addr_t uninitialized_var(pteval);
1767         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1768         unsigned long sg_res;
1769         unsigned int largepage_lvl = 0;
1770         unsigned long lvl_pages = 0;
1771
1772         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1773
1774         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1775                 return -EINVAL;
1776
1777         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1778
1779         if (sg)
1780                 sg_res = 0;
1781         else {
1782                 sg_res = nr_pages + 1;
1783                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1784         }
1785
1786         while (nr_pages > 0) {
1787                 uint64_t tmp;
1788
1789                 if (!sg_res) {
1790                         sg_res = aligned_nrpages(sg->offset, sg->length);
1791                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1792                         sg->dma_length = sg->length;
1793                         pteval = page_to_phys(sg_page(sg)) | prot;
1794                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1795                 }
1796
1797                 if (!pte) {
1798                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1799
1800                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1801                         if (!pte)
1802                                 return -ENOMEM;
1803                         /* It is a large page */
1804                         if (largepage_lvl > 1) {
1805                                 pteval |= DMA_PTE_LARGE_PAGE;
1806                                 /* Ensure that old small page tables are removed to make room
1807                                    for superpage, if they exist. */
1808                                 dma_pte_clear_range(domain, iov_pfn,
1809                                                     iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1810                                 dma_pte_free_pagetable(domain, iov_pfn,
1811                                                        iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1812                         } else {
1813                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1814                         }
1815
1816                 }
1817                 /* We don't need a lock here; nobody else
1818                  * touches this iova range.
1819                  */
1820                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1821                 if (tmp) {
1822                         static int dumps = 5;
1823                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1824                                iov_pfn, tmp, (unsigned long long)pteval);
1825                         if (dumps) {
1826                                 dumps--;
1827                                 debug_dma_dump_mappings(NULL);
1828                         }
1829                         WARN_ON(1);
1830                 }
1831
1832                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1833
1834                 BUG_ON(nr_pages < lvl_pages);
1835                 BUG_ON(sg_res < lvl_pages);
1836
1837                 nr_pages -= lvl_pages;
1838                 iov_pfn += lvl_pages;
1839                 phys_pfn += lvl_pages;
1840                 pteval += lvl_pages * VTD_PAGE_SIZE;
1841                 sg_res -= lvl_pages;
1842
1843                 /* If the next PTE would be the first in a new page, then we
1844                    need to flush the cache on the entries we've just written.
1845                    And then we'll need to recalculate 'pte', so clear it and
1846                    let it get set again in the if (!pte) block above.
1847
1848                    If we're done (!nr_pages) we need to flush the cache too.
1849
1850                    Also if we've been setting superpages, we may need to
1851                    recalculate 'pte' and switch back to smaller pages for the
1852                    end of the mapping, if the trailing size is not enough to
1853                    use another superpage (i.e. sg_res < lvl_pages). */
1854                 pte++;
1855                 if (!nr_pages || first_pte_in_page(pte) ||
1856                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1857                         domain_flush_cache(domain, first_pte,
1858                                            (void *)pte - (void *)first_pte);
1859                         pte = NULL;
1860                 }
1861
1862                 if (!sg_res && nr_pages)
1863                         sg = sg_next(sg);
1864         }
1865         return 0;
1866 }
1867
1868 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1869                                     struct scatterlist *sg, unsigned long nr_pages,
1870                                     int prot)
1871 {
1872         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1873 }
1874
1875 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1876                                      unsigned long phys_pfn, unsigned long nr_pages,
1877                                      int prot)
1878 {
1879         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1880 }
1881
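     /*
      * Tear down the context entry for (bus, devfn) and invalidate the
      * context cache and IOTLB globally on that iommu.
      */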
1882 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1883 {
1884         if (!iommu)
1885                 return;
1886
1887         clear_context_table(iommu, bus, devfn);
1888         iommu->flush.flush_context(iommu, 0, 0, 0,
1889                                            DMA_CCMD_GLOBAL_INVL);
1890         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1891 }
1892
1893 static void domain_remove_dev_info(struct dmar_domain *domain)
1894 {
1895         struct device_domain_info *info;
1896         unsigned long flags;
1897         struct intel_iommu *iommu;
1898
1899         spin_lock_irqsave(&device_domain_lock, flags);
1900         while (!list_empty(&domain->devices)) {
1901                 info = list_entry(domain->devices.next,
1902                         struct device_domain_info, link);
1903                 list_del(&info->link);
1904                 list_del(&info->global);
1905                 if (info->dev)
1906                         info->dev->dev.archdata.iommu = NULL;
1907                 spin_unlock_irqrestore(&device_domain_lock, flags);
1908
1909                 iommu_disable_dev_iotlb(info);
1910                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1911                 iommu_detach_dev(iommu, info->bus, info->devfn);
1912                 free_devinfo_mem(info);
1913
1914                 spin_lock_irqsave(&device_domain_lock, flags);
1915         }
1916         spin_unlock_irqrestore(&device_domain_lock, flags);
1917 }
1918
1919 /*
1920  * find_domain
1921  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1922  */
1923 static struct dmar_domain *
1924 find_domain(struct pci_dev *pdev)
1925 {
1926         struct device_domain_info *info;
1927
1928         /* No lock here, assumes no domain exit in normal case */
1929         info = pdev->dev.archdata.iommu;
1930         if (info)
1931                 return info->domain;
1932         return NULL;
1933 }
1934
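     /*
      * Find the domain for a device, allocating and initializing a new one
      * if necessary. Devices behind the same PCIe-to-PCI bridge share a
      * single domain, since they present the same source-id to the iommu.
      */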
1935 /* domain is initialized */
1936 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1937 {
1938         struct dmar_domain *domain, *found = NULL;
1939         struct intel_iommu *iommu;
1940         struct dmar_drhd_unit *drhd;
1941         struct device_domain_info *info, *tmp;
1942         struct pci_dev *dev_tmp;
1943         unsigned long flags;
1944         int bus = 0, devfn = 0;
1945         int segment;
1946         int ret;
1947
1948         domain = find_domain(pdev);
1949         if (domain)
1950                 return domain;
1951
1952         segment = pci_domain_nr(pdev->bus);
1953
1954         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1955         if (dev_tmp) {
1956                 if (pci_is_pcie(dev_tmp)) {
1957                         bus = dev_tmp->subordinate->number;
1958                         devfn = 0;
1959                 } else {
1960                         bus = dev_tmp->bus->number;
1961                         devfn = dev_tmp->devfn;
1962                 }
1963                 spin_lock_irqsave(&device_domain_lock, flags);
1964                 list_for_each_entry(info, &device_domain_list, global) {
1965                         if (info->segment == segment &&
1966                             info->bus == bus && info->devfn == devfn) {
1967                                 found = info->domain;
1968                                 break;
1969                         }
1970                 }
1971                 spin_unlock_irqrestore(&device_domain_lock, flags);
1972                 /* pcie-pci bridge already has a domain, use it */
1973                 if (found) {
1974                         domain = found;
1975                         goto found_domain;
1976                 }
1977         }
1978
1979         domain = alloc_domain();
1980         if (!domain)
1981                 goto error;
1982
1983         /* Allocate new domain for the device */
1984         drhd = dmar_find_matched_drhd_unit(pdev);
1985         if (!drhd) {
1986                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1987                         pci_name(pdev));
1988                 return NULL;
1989         }
1990         iommu = drhd->iommu;
1991
1992         ret = iommu_attach_domain(domain, iommu);
1993         if (ret) {
1994                 free_domain_mem(domain);
1995                 goto error;
1996         }
1997
1998         if (domain_init(domain, gaw)) {
1999                 domain_exit(domain);
2000                 goto error;
2001         }
2002
2003         /* register pcie-to-pci device */
2004         if (dev_tmp) {
2005                 info = alloc_devinfo_mem();
2006                 if (!info) {
2007                         domain_exit(domain);
2008                         goto error;
2009                 }
2010                 info->segment = segment;
2011                 info->bus = bus;
2012                 info->devfn = devfn;
2013                 info->dev = NULL;
2014                 info->domain = domain;
2015                 /* This domain is shared by devices under p2p bridge */
2016                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2017
2018                 /* pcie-to-pci bridge already has a domain, use it */
2019                 found = NULL;
2020                 spin_lock_irqsave(&device_domain_lock, flags);
2021                 list_for_each_entry(tmp, &device_domain_list, global) {
2022                         if (tmp->segment == segment &&
2023                             tmp->bus == bus && tmp->devfn == devfn) {
2024                                 found = tmp->domain;
2025                                 break;
2026                         }
2027                 }
2028                 if (found) {
2029                         spin_unlock_irqrestore(&device_domain_lock, flags);
2030                         free_devinfo_mem(info);
2031                         domain_exit(domain);
2032                         domain = found;
2033                 } else {
2034                         list_add(&info->link, &domain->devices);
2035                         list_add(&info->global, &device_domain_list);
2036                         spin_unlock_irqrestore(&device_domain_lock, flags);
2037                 }
2038         }
2039
2040 found_domain:
2041         info = alloc_devinfo_mem();
2042         if (!info)
2043                 goto error;
2044         info->segment = segment;
2045         info->bus = pdev->bus->number;
2046         info->devfn = pdev->devfn;
2047         info->dev = pdev;
2048         info->domain = domain;
2049         spin_lock_irqsave(&device_domain_lock, flags);
2050         /* somebody else may have raced us and set it up already */
2051         found = find_domain(pdev);
2052         if (found != NULL) {
2053                 spin_unlock_irqrestore(&device_domain_lock, flags);
2054                 if (found != domain) {
2055                         domain_exit(domain);
2056                         domain = found;
2057                 }
2058                 free_devinfo_mem(info);
2059                 return domain;
2060         }
2061         list_add(&info->link, &domain->devices);
2062         list_add(&info->global, &device_domain_list);
2063         pdev->dev.archdata.iommu = info;
2064         spin_unlock_irqrestore(&device_domain_lock, flags);
2065         return domain;
2066 error:
2067         /* recheck it here; somebody else may have set it in the meantime */
2068         return find_domain(pdev);
2069 }
2070
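     /*
      * iommu_identity_mapping selects which classes of device get a 1:1
      * (identity) mapping in si_domain; the IDENTMAP_* bits below can be
      * combined.
      */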
2071 static int iommu_identity_mapping;
2072 #define IDENTMAP_ALL            1
2073 #define IDENTMAP_GFX            2
2074 #define IDENTMAP_AZALIA         4
2075
2076 static int iommu_domain_identity_map(struct dmar_domain *domain,
2077                                      unsigned long long start,
2078                                      unsigned long long end)
2079 {
2080         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2081         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2082
2083         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2084                           dma_to_mm_pfn(last_vpfn))) {
2085                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2086                 return -ENOMEM;
2087         }
2088
2089         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2090                  start, end, domain->id);
2091         /*
2092          * RMRR range might have overlap with physical memory range,
2093          * clear it first
2094          */
2095         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2096
2097         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2098                                   last_vpfn - first_vpfn + 1,
2099                                   DMA_PTE_READ|DMA_PTE_WRITE);
2100 }
2101
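     /*
      * Identity-map [start, end] for a device (used for RMRR and ISA
      * ranges): sanity-check the range against the domain's address width,
      * reserve and map the IOVA range 1:1, then install the context entry.
      */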
2102 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2103                                       unsigned long long start,
2104                                       unsigned long long end)
2105 {
2106         struct dmar_domain *domain;
2107         int ret;
2108
2109         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2110         if (!domain)
2111                 return -ENOMEM;
2112
2113         /* For _hardware_ passthrough, don't bother. But for software
2114            passthrough, we do it anyway -- it may indicate a memory
2115            range which is reserved in E820 and so didn't get set
2116            up to start with in si_domain */
2117         if (domain == si_domain && hw_pass_through) {
2118                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2119                        pci_name(pdev), start, end);
2120                 return 0;
2121         }
2122
2123         printk(KERN_INFO
2124                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2125                pci_name(pdev), start, end);
2126         
2127         if (end < start) {
2128                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2129                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2130                         dmi_get_system_info(DMI_BIOS_VENDOR),
2131                         dmi_get_system_info(DMI_BIOS_VERSION),
2132                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2133                 ret = -EIO;
2134                 goto error;
2135         }
2136
2137         if (end >> agaw_to_width(domain->agaw)) {
2138                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2139                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2140                      agaw_to_width(domain->agaw),
2141                      dmi_get_system_info(DMI_BIOS_VENDOR),
2142                      dmi_get_system_info(DMI_BIOS_VERSION),
2143                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2144                 ret = -EIO;
2145                 goto error;
2146         }
2147
2148         ret = iommu_domain_identity_map(domain, start, end);
2149         if (ret)
2150                 goto error;
2151
2152         /* context entry init */
2153         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2154         if (ret)
2155                 goto error;
2156
2157         return 0;
2158
2159  error:
2160         domain_exit(domain);
2161         return ret;
2162 }
2163
2164 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2165         struct pci_dev *pdev)
2166 {
2167         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2168                 return 0;
2169         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2170                 rmrr->end_address);
2171 }
2172
2173 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2174 static inline void iommu_prepare_isa(void)
2175 {
2176         struct pci_dev *pdev;
2177         int ret;
2178
2179         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2180         if (!pdev)
2181                 return;
2182
2183         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2184         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2185
2186         if (ret)
2187                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2188                        "floppy might not work\n");
2189
2190 }
2191 #else
2192 static inline void iommu_prepare_isa(void)
2193 {
2194         return;
2195 }
2196 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2197
2198 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2199
2200 static int __init si_domain_work_fn(unsigned long start_pfn,
2201                                     unsigned long end_pfn, void *datax)
2202 {
2203         int *ret = datax;
2204
2205         *ret = iommu_domain_identity_map(si_domain,
2206                                          (uint64_t)start_pfn << PAGE_SHIFT,
2207                                          (uint64_t)end_pfn << PAGE_SHIFT);
2208         return *ret;
2209
2210 }
2211
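     /*
      * Create the static identity (si) domain, attach it to every active
      * iommu and, unless hardware pass-through is used, identity-map each
      * region of usable physical memory into it.
      */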
2212 static int __init si_domain_init(int hw)
2213 {
2214         struct dmar_drhd_unit *drhd;
2215         struct intel_iommu *iommu;
2216         int nid, ret = 0;
2217
2218         si_domain = alloc_domain();
2219         if (!si_domain)
2220                 return -EFAULT;
2221
2222         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2223
2224         for_each_active_iommu(iommu, drhd) {
2225                 ret = iommu_attach_domain(si_domain, iommu);
2226                 if (ret) {
2227                         domain_exit(si_domain);
2228                         return -EFAULT;
2229                 }
2230         }
2231
2232         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2233                 domain_exit(si_domain);
2234                 return -EFAULT;
2235         }
2236
2237         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2238
2239         if (hw)
2240                 return 0;
2241
2242         for_each_online_node(nid) {
2243                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2244                 if (ret)
2245                         return ret;
2246         }
2247
2248         return 0;
2249 }
2250
2251 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2252                                           struct pci_dev *pdev);
2253 static int identity_mapping(struct pci_dev *pdev)
2254 {
2255         struct device_domain_info *info;
2256
2257         if (likely(!iommu_identity_mapping))
2258                 return 0;
2259
2260         info = pdev->dev.archdata.iommu;
2261         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2262                 return (info->domain == si_domain);
2263
2264         return 0;
2265 }
2266
2267 static int domain_add_dev_info(struct dmar_domain *domain,
2268                                struct pci_dev *pdev,
2269                                int translation)
2270 {
2271         struct device_domain_info *info;
2272         unsigned long flags;
2273         int ret;
2274
2275         info = alloc_devinfo_mem();
2276         if (!info)
2277                 return -ENOMEM;
2278
2279         info->segment = pci_domain_nr(pdev->bus);
2280         info->bus = pdev->bus->number;
2281         info->devfn = pdev->devfn;
2282         info->dev = pdev;
2283         info->domain = domain;
2284
2285         spin_lock_irqsave(&device_domain_lock, flags);
2286         list_add(&info->link, &domain->devices);
2287         list_add(&info->global, &device_domain_list);
2288         pdev->dev.archdata.iommu = info;
2289         spin_unlock_irqrestore(&device_domain_lock, flags);
2290
2291         ret = domain_context_mapping(domain, pdev, translation);
2292         if (ret) {
2293                 spin_lock_irqsave(&device_domain_lock, flags);
2294                 list_del(&info->link);
2295                 list_del(&info->global);
2296                 pdev->dev.archdata.iommu = NULL;
2297                 spin_unlock_irqrestore(&device_domain_lock, flags);
2298                 free_devinfo_mem(info);
2299                 return ret;
2300         }
2301
2302         return 0;
2303 }
2304
2305 static bool device_has_rmrr(struct pci_dev *dev)
2306 {
2307         struct dmar_rmrr_unit *rmrr;
2308         int i;
2309
2310         for_each_rmrr_units(rmrr) {
2311                 for (i = 0; i < rmrr->devices_cnt; i++) {
2312                         /*
2313                          * Return TRUE if this RMRR contains the device that
2314                          * is passed in.
2315                          */
2316                         if (rmrr->devices[i] == dev)
2317                                 return true;
2318                 }
2319         }
2320         return false;
2321 }
2322
2323 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2324 {
2325
2326         /*
2327          * We want to prevent any device associated with an RMRR from
2328          * getting placed into the SI Domain. This is done because
2329          * problems exist when devices are moved in and out of domains
2330          * and their respective RMRR info is lost. We exempt USB devices
2331          * from this process due to their usage of RMRRs that are known
2332          * to not be needed after BIOS hand-off to OS.
2333          */
2334         if (device_has_rmrr(pdev) &&
2335             (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2336                 return 0;
2337
2338         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2339                 return 1;
2340
2341         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2342                 return 1;
2343
2344         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2345                 return 0;
2346
2347         /*
2348          * We want to start off with all devices in the 1:1 domain, and
2349          * take them out later if we find they can't access all of memory.
2350          *
2351          * However, we can't do this for PCI devices behind bridges,
2352          * because all PCI devices behind the same bridge will end up
2353          * with the same source-id on their transactions.
2354          *
2355          * Practically speaking, we can't change things around for these
2356          * devices at run-time, because we can't be sure there'll be no
2357          * DMA transactions in flight for any of their siblings.
2358          * 
2359          * So PCI devices (unless they're on the root bus) as well as
2360          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2361          * the 1:1 domain, just in _case_ one of their siblings turns out
2362          * not to be able to map all of memory.
2363          */
2364         if (!pci_is_pcie(pdev)) {
2365                 if (!pci_is_root_bus(pdev->bus))
2366                         return 0;
2367                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2368                         return 0;
2369         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2370                 return 0;
2371
2372         /* 
2373          * At boot time, we don't yet know if devices will be 64-bit capable.
2374          * Assume that they will -- if they turn out not to be, then we can 
2375          * take them out of the 1:1 domain later.
2376          */
2377         if (!startup) {
2378                 /*
2379                  * If the device's dma_mask is less than the system's memory
2380                  * size then this is not a candidate for identity mapping.
2381                  */
2382                 u64 dma_mask = pdev->dma_mask;
2383
2384                 if (pdev->dev.coherent_dma_mask &&
2385                     pdev->dev.coherent_dma_mask < dma_mask)
2386                         dma_mask = pdev->dev.coherent_dma_mask;
2387
2388                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2389         }
2390
2391         return 1;
2392 }
2393
2394 static int __init iommu_prepare_static_identity_mapping(int hw)
2395 {
2396         struct pci_dev *pdev = NULL;
2397         int ret;
2398
2399         ret = si_domain_init(hw);
2400         if (ret)
2401                 return -EFAULT;
2402
2403         for_each_pci_dev(pdev) {
2404                 /* Skip Host/PCI Bridge devices */
2405                 if (IS_BRIDGE_HOST_DEVICE(pdev))
2406                         continue;
2407                 if (iommu_should_identity_map(pdev, 1)) {
2408                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2409                                hw ? "hardware" : "software", pci_name(pdev));
2410
2411                         ret = domain_add_dev_info(si_domain, pdev,
2412                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2413                                                      CONTEXT_TT_MULTI_LEVEL);
2414                         if (ret)
2415                                 return ret;
2416                 }
2417         }
2418
2419         return 0;
2420 }
2421
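     /*
      * init_dmars: one-time setup of every DMAR unit. Allocates the global
      * iommu and deferred-flush arrays, initializes domains and root
      * entries, picks queued vs. register-based invalidation, creates the
      * static identity / RMRR / ISA mappings, and finally enables
      * translation on each unit.
      */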
2422 static int __init init_dmars(void)
2423 {
2424         struct dmar_drhd_unit *drhd;
2425         struct dmar_rmrr_unit *rmrr;
2426         struct pci_dev *pdev;
2427         struct intel_iommu *iommu;
2428         int i, ret;
2429
2430         /*
2431          * for each drhd
2432          *    allocate root
2433          *    initialize and program root entry to not present
2434          * endfor
2435          */
2436         for_each_drhd_unit(drhd) {
2437                 g_num_of_iommus++;
2438                 /*
2439                  * lock not needed as this is only incremented in the single
2440                  * threaded kernel __init code path all other access are read
2441                  * only
2442                  */
2443         }
2444
2445         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2446                         GFP_KERNEL);
2447         if (!g_iommus) {
2448                 printk(KERN_ERR "Allocating global iommu array failed\n");
2449                 ret = -ENOMEM;
2450                 goto error;
2451         }
2452
2453         deferred_flush = kzalloc(g_num_of_iommus *
2454                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2455         if (!deferred_flush) {
2456                 ret = -ENOMEM;
2457                 goto error;
2458         }
2459
2460         for_each_drhd_unit(drhd) {
2461                 if (drhd->ignored)
2462                         continue;
2463
2464                 iommu = drhd->iommu;
2465                 g_iommus[iommu->seq_id] = iommu;
2466
2467                 ret = iommu_init_domains(iommu);
2468                 if (ret)
2469                         goto error;
2470
2471                 /*
2472                  * TBD:
2473                  * we could share the same root & context tables
2474                  * among all IOMMUs. Need to split it later.
2475                  */
2476                 ret = iommu_alloc_root_entry(iommu);
2477                 if (ret) {
2478                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2479                         goto error;
2480                 }
2481                 if (!ecap_pass_through(iommu->ecap))
2482                         hw_pass_through = 0;
2483         }
2484
2485         /*
2486          * Start from a sane iommu hardware state.
2487          */
2488         for_each_drhd_unit(drhd) {
2489                 if (drhd->ignored)
2490                         continue;
2491
2492                 iommu = drhd->iommu;
2493
2494                 /*
2495                  * If the queued invalidation is already initialized by us
2496                  * (for example, while enabling interrupt-remapping) then
2497                  * things are already rolling from a sane state.
2498                  */
2499                 if (iommu->qi)
2500                         continue;
2501
2502                 /*
2503                  * Clear any previous faults.
2504                  */
2505                 dmar_fault(-1, iommu);
2506                 /*
2507                  * Disable queued invalidation if supported and already enabled
2508                  * before OS handover.
2509                  */
2510                 dmar_disable_qi(iommu);
2511         }
2512
2513         for_each_drhd_unit(drhd) {
2514                 if (drhd->ignored)
2515                         continue;
2516
2517                 iommu = drhd->iommu;
2518
2519                 if (dmar_enable_qi(iommu)) {
2520                         /*
2521                          * Queued Invalidate not enabled, use Register Based
2522                          * Invalidate
2523                          */
2524                         iommu->flush.flush_context = __iommu_flush_context;
2525                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2526                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2527                                "invalidation\n",
2528                                 iommu->seq_id,
2529                                (unsigned long long)drhd->reg_base_addr);
2530                 } else {
2531                         iommu->flush.flush_context = qi_flush_context;
2532                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2533                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2534                                "invalidation\n",
2535                                 iommu->seq_id,
2536                                (unsigned long long)drhd->reg_base_addr);
2537                 }
2538         }
2539
2540         if (iommu_pass_through)
2541                 iommu_identity_mapping |= IDENTMAP_ALL;
2542
2543 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2544         iommu_identity_mapping |= IDENTMAP_GFX;
2545 #endif
2546
2547         check_tylersburg_isoch();
2548
2549         /*
2550          * If pass through is not set or not enabled, set up context entries
2551          * with identity mappings for rmrr, gfx, and isa; this may fall back to
2552          * static identity mapping if iommu_identity_mapping is set.
2553          */
2554         if (iommu_identity_mapping) {
2555                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2556                 if (ret) {
2557                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2558                         goto error;
2559                 }
2560         }
2561         /*
2562          * For each rmrr
2563          *   for each dev attached to rmrr
2564          *   do
2565          *     locate drhd for dev, alloc domain for dev
2566          *     allocate free domain
2567          *     allocate page table entries for rmrr
2568          *     if context not allocated for bus
2569          *           allocate and init context
2570          *           set present in root table for this bus
2571          *     init context with domain, translation etc
2572          *    endfor
2573          * endfor
2574          */
2575         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2576         for_each_rmrr_units(rmrr) {
2577                 for (i = 0; i < rmrr->devices_cnt; i++) {
2578                         pdev = rmrr->devices[i];
2579                         /*
2580                          * some BIOSes list non-existent devices in the
2581                          * DMAR table.
2582                          */
2583                         if (!pdev)
2584                                 continue;
2585                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2586                         if (ret)
2587                                 printk(KERN_ERR
2588                                        "IOMMU: mapping reserved region failed\n");
2589                 }
2590         }
2591
2592         iommu_prepare_isa();
2593
2594         /*
2595          * for each drhd
2596          *   enable fault log
2597          *   global invalidate context cache
2598          *   global invalidate iotlb
2599          *   enable translation
2600          */
2601         for_each_drhd_unit(drhd) {
2602                 if (drhd->ignored) {
2603                         /*
2604                          * we always have to disable PMRs or DMA may fail on
2605                          * this device
2606                          */
2607                         if (force_on)
2608                                 iommu_disable_protect_mem_regions(drhd->iommu);
2609                         continue;
2610                 }
2611                 iommu = drhd->iommu;
2612
2613                 iommu_flush_write_buffer(iommu);
2614
2615                 ret = dmar_set_interrupt(iommu);
2616                 if (ret)
2617                         goto error;
2618
2619                 iommu_set_root_entry(iommu);
2620
2621                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2622                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2623
2624                 ret = iommu_enable_translation(iommu);
2625                 if (ret)
2626                         goto error;
2627
2628                 iommu_disable_protect_mem_regions(iommu);
2629         }
2630
2631         return 0;
2632 error:
2633         for_each_drhd_unit(drhd) {
2634                 if (drhd->ignored)
2635                         continue;
2636                 iommu = drhd->iommu;
2637                 free_iommu(iommu);
2638         }
2639         kfree(g_iommus);
2640         return ret;
2641 }
2642
2643 /* This takes a number of _MM_ pages, not VTD pages */
2644 static struct iova *intel_alloc_iova(struct device *dev,
2645                                      struct dmar_domain *domain,
2646                                      unsigned long nrpages, uint64_t dma_mask)
2647 {
2648         struct pci_dev *pdev = to_pci_dev(dev);
2649         struct iova *iova = NULL;
2650
2651         /* Restrict dma_mask to the width that the iommu can handle */
2652         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2653
2654         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2655                 /*
2656                  * First try to allocate an io virtual address in
2657                  * DMA_BIT_MASK(32) and if that fails then try allocating
2658                  * from higher range
2659                  */
2660                 iova = alloc_iova(&domain->iovad, nrpages,
2661                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2662                 if (iova)
2663                         return iova;
2664         }
2665         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2666         if (unlikely(!iova)) {
2667                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2668                        nrpages, pci_name(pdev));
2669                 return NULL;
2670         }
2671
2672         return iova;
2673 }
2674
2675 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2676 {
2677         struct dmar_domain *domain;
2678         int ret;
2679
2680         domain = get_domain_for_dev(pdev,
2681                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2682         if (!domain) {
2683                 printk(KERN_ERR
2684                         "Allocating domain for %s failed\n", pci_name(pdev));
2685                 return NULL;
2686         }
2687
2688         /* make sure context mapping is ok */
2689         if (unlikely(!domain_context_mapped(pdev))) {
2690                 ret = domain_context_mapping(domain, pdev,
2691                                              CONTEXT_TT_MULTI_LEVEL);
2692                 if (ret) {
2693                         printk(KERN_ERR
2694                                 "Domain context map for %s failed\n",
2695                                 pci_name(pdev));
2696                         return NULL;
2697                 }
2698         }
2699
2700         return domain;
2701 }
2702
2703 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2704 {
2705         struct device_domain_info *info;
2706
2707         /* No lock here, assumes no domain exit in normal case */
2708         info = dev->dev.archdata.iommu;
2709         if (likely(info))
2710                 return info->domain;
2711
2712         return __get_valid_domain_for_dev(dev);
2713 }
2714
2715 static int iommu_dummy(struct pci_dev *pdev)
2716 {
2717         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2718 }
2719
2720 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2721 static int iommu_no_mapping(struct device *dev)
2722 {
2723         struct pci_dev *pdev;
2724         int found;
2725
2726         if (unlikely(dev->bus != &pci_bus_type))
2727                 return 1;
2728
2729         pdev = to_pci_dev(dev);
2730         if (iommu_dummy(pdev))
2731                 return 1;
2732
2733         if (!iommu_identity_mapping)
2734                 return 0;
2735
2736         found = identity_mapping(pdev);
2737         if (found) {
2738                 if (iommu_should_identity_map(pdev, 0))
2739                         return 1;
2740                 else {
2741                         /*
2742                          * 32 bit DMA device: remove it from si_domain and fall
2743                          * back to non-identity mapping.
2744                          */
2745                         domain_remove_one_dev_info(si_domain, pdev);
2746                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2747                                pci_name(pdev));
2748                         return 0;
2749                 }
2750         } else {
2751                 /*
2752                  * If a 64 bit DMA device has been detached from a VM, the
2753                  * device is put back into si_domain for identity mapping.
2754                  */
2755                 if (iommu_should_identity_map(pdev, 0)) {
2756                         int ret;
2757                         ret = domain_add_dev_info(si_domain, pdev,
2758                                                   hw_pass_through ?
2759                                                   CONTEXT_TT_PASS_THROUGH :
2760                                                   CONTEXT_TT_MULTI_LEVEL);
2761                         if (!ret) {
2762                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2763                                        pci_name(pdev));
2764                                 return 1;
2765                         }
2766                 }
2767         }
2768
2769         return 0;
2770 }
2771
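     /*
      * Map a physically contiguous buffer for DMA by one device: find (or
      * create) its domain, allocate an IOVA that fits under dma_mask, wire
      * up the page tables with the right read/write protection, and flush
      * the IOTLB (caching mode) or write buffer. Identity-mapped devices
      * just get the physical address back.
      */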
2772 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2773                                      size_t size, int dir, u64 dma_mask)
2774 {
2775         struct pci_dev *pdev = to_pci_dev(hwdev);
2776         struct dmar_domain *domain;
2777         phys_addr_t start_paddr;
2778         struct iova *iova;
2779         int prot = 0;
2780         int ret;
2781         struct intel_iommu *iommu;
2782         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2783
2784         BUG_ON(dir == DMA_NONE);
2785
2786         if (iommu_no_mapping(hwdev))
2787                 return paddr;
2788
2789         domain = get_valid_domain_for_dev(pdev);
2790         if (!domain)
2791                 return 0;
2792
2793         iommu = domain_get_iommu(domain);
2794         size = aligned_nrpages(paddr, size);
2795
2796         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2797         if (!iova)
2798                 goto error;
2799
2800         /*
2801          * Check if DMAR supports zero-length reads on write-only
2802          * mappings.
2803          */
2804         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2805                         !cap_zlr(iommu->cap))
2806                 prot |= DMA_PTE_READ;
2807         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2808                 prot |= DMA_PTE_WRITE;
2809         /*
2810          * paddr .. (paddr + size) might span a partial page, so we should map
2811          * the whole page.  Note: if two parts of one page are separately
2812          * mapped, we might have two guest addresses mapping to the same host
2813          * paddr, but this is not a big problem.
2814          */
2815         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2816                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2817         if (ret)
2818                 goto error;
2819
2820         /* it's a non-present to present mapping. Only flush if caching mode */
2821         if (cap_caching_mode(iommu->cap))
2822                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2823         else
2824                 iommu_flush_write_buffer(iommu);
2825
2826         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2827         start_paddr += paddr & ~PAGE_MASK;
2828         return start_paddr;
2829
2830 error:
2831         if (iova)
2832                 __free_iova(&domain->iovad, iova);
2833         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2834                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2835         return 0;
2836 }
2837
2838 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2839                                  unsigned long offset, size_t size,
2840                                  enum dma_data_direction dir,
2841                                  struct dma_attrs *attrs)
2842 {
2843         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2844                                   dir, to_pci_dev(dev)->dma_mask);
2845 }
2846
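     /*
      * Flush all deferred unmaps: for each iommu, invalidate the IOTLB
      * (per-domain PSI flushes in caching mode, a global flush otherwise)
      * and free the queued IOVAs. Called with async_umap_flush_lock held.
      */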
2847 static void flush_unmaps(void)
2848 {
2849         int i, j;
2850
2851         timer_on = 0;
2852
2853         /* just flush them all */
2854         for (i = 0; i < g_num_of_iommus; i++) {
2855                 struct intel_iommu *iommu = g_iommus[i];
2856                 if (!iommu)
2857                         continue;
2858
2859                 if (!deferred_flush[i].next)
2860                         continue;
2861
2862                 /* In caching mode, global flushes make emulation expensive */
2863                 if (!cap_caching_mode(iommu->cap))
2864                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2865                                          DMA_TLB_GLOBAL_FLUSH);
2866                 for (j = 0; j < deferred_flush[i].next; j++) {
2867                         unsigned long mask;
2868                         struct iova *iova = deferred_flush[i].iova[j];
2869                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2870
2871                         /* On real hardware multiple invalidations are expensive */
2872                         if (cap_caching_mode(iommu->cap))
2873                                 iommu_flush_iotlb_psi(iommu, domain->id,
2874                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2875                         else {
2876                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2877                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2878                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2879                         }
2880                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2881                 }
2882                 deferred_flush[i].next = 0;
2883         }
2884
2885         list_size = 0;
2886 }
2887
2888 static void flush_unmaps_timeout(unsigned long data)
2889 {
2890         unsigned long flags;
2891
2892         spin_lock_irqsave(&async_umap_flush_lock, flags);
2893         flush_unmaps();
2894         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2895 }
2896
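     /*
      * Queue an IOVA for deferred freeing instead of flushing the IOTLB
      * immediately; the batch is drained by a 10ms timer or once
      * HIGH_WATER_MARK entries have accumulated.
      */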
2897 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2898 {
2899         unsigned long flags;
2900         int next, iommu_id;
2901         struct intel_iommu *iommu;
2902
2903         spin_lock_irqsave(&async_umap_flush_lock, flags);
2904         if (list_size == HIGH_WATER_MARK)
2905                 flush_unmaps();
2906
2907         iommu = domain_get_iommu(dom);
2908         iommu_id = iommu->seq_id;
2909
2910         next = deferred_flush[iommu_id].next;
2911         deferred_flush[iommu_id].domain[next] = dom;
2912         deferred_flush[iommu_id].iova[next] = iova;
2913         deferred_flush[iommu_id].next++;
2914
2915         if (!timer_on) {
2916                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2917                 timer_on = 1;
2918         }
2919         list_size++;
2920         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2921 }
2922
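     /*
      * Unmap a DMA mapping: clear the PTEs and page tables for the IOVA
      * range, then either flush the IOTLB synchronously (intel_iommu_strict)
      * or queue the IOVA for the deferred-flush path above.
      */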
2923 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2924                              size_t size, enum dma_data_direction dir,
2925                              struct dma_attrs *attrs)
2926 {
2927         struct pci_dev *pdev = to_pci_dev(dev);
2928         struct dmar_domain *domain;
2929         unsigned long start_pfn, last_pfn;
2930         struct iova *iova;
2931         struct intel_iommu *iommu;
2932
2933         if (iommu_no_mapping(dev))
2934                 return;
2935
2936         domain = find_domain(pdev);
2937         BUG_ON(!domain);
2938
2939         iommu = domain_get_iommu(domain);
2940
2941         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2942         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2943                       (unsigned long long)dev_addr))
2944                 return;
2945
2946         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2947         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2948
2949         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2950                  pci_name(pdev), start_pfn, last_pfn);
2951
2952         /* clear the PTEs covering the whole range */
2953         dma_pte_clear_range(domain, start_pfn, last_pfn);
2954
2955         /* free page tables */
2956         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2957
2958         if (intel_iommu_strict) {
2959                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2960                                       last_pfn - start_pfn + 1, 0);
2961                 /* free iova */
2962                 __free_iova(&domain->iovad, iova);
2963         } else {
2964                 add_unmap(domain, iova);
2965                 /*
2966                  * queue up the release of the unmap to save the roughly 1/6th
2967                  * of the cpu time used up by the iotlb flush operation...
2968                  */
2969         }
2970 }
2971
2972 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2973                                   dma_addr_t *dma_handle, gfp_t flags)
2974 {
2975         void *vaddr;
2976         int order;
2977
2978         size = PAGE_ALIGN(size);
2979         order = get_order(size);
2980
2981         if (!iommu_no_mapping(hwdev))
2982                 flags &= ~(GFP_DMA | GFP_DMA32);
2983         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2984                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2985                         flags |= GFP_DMA;
2986                 else
2987                         flags |= GFP_DMA32;
2988         }
2989
2990         vaddr = (void *)__get_free_pages(flags, order);
2991         if (!vaddr)
2992                 return NULL;
2993         memset(vaddr, 0, size);
2994
2995         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2996                                          DMA_BIDIRECTIONAL,
2997                                          hwdev->coherent_dma_mask);
2998         if (*dma_handle)
2999                 return vaddr;
3000         free_pages((unsigned long)vaddr, order);
3001         return NULL;
3002 }
3003
3004 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3005                                 dma_addr_t dma_handle)
3006 {
3007         int order;
3008
3009         size = PAGE_ALIGN(size);
3010         order = get_order(size);
3011
3012         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3013         free_pages((unsigned long)vaddr, order);
3014 }
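
/*
 * Illustrative sketch, not part of the original driver: coherent DMA memory
 * requested through the generic API lands in intel_alloc_coherent()/
 * intel_free_coherent() above. The function name and size are placeholder
 * assumptions.
 */
#if 0	/* example only, not compiled */
static int example_coherent_dma(struct device *dev)
{
        dma_addr_t dma_handle;
        void *cpu_addr;

        /* routed to intel_alloc_coherent(); the memory comes back zeroed */
        cpu_addr = dma_alloc_coherent(dev, PAGE_SIZE, &dma_handle, GFP_KERNEL);
        if (!cpu_addr)
                return -ENOMEM;

        /* ... share cpu_addr/dma_handle with the device ... */

        /* routed to intel_free_coherent(), which unmaps and frees the pages */
        dma_free_coherent(dev, PAGE_SIZE, cpu_addr, dma_handle);
        return 0;
}
#endif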
3015
3016 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3017                            int nelems, enum dma_data_direction dir,
3018                            struct dma_attrs *attrs)
3019 {
3020         struct pci_dev *pdev = to_pci_dev(hwdev);
3021         struct dmar_domain *domain;
3022         unsigned long start_pfn, last_pfn;
3023         struct iova *iova;
3024         struct intel_iommu *iommu;
3025
3026         if (iommu_no_mapping(hwdev))
3027                 return;
3028
3029         domain = find_domain(pdev);
3030         BUG_ON(!domain);
3031
3032         iommu = domain_get_iommu(domain);
3033
3034         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3035         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3036                       (unsigned long long)sglist[0].dma_address))
3037                 return;
3038
3039         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3040         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3041
3042         /* clear the page table entries for the whole range */
3043         dma_pte_clear_range(domain, start_pfn, last_pfn);
3044
3045         /* free page tables */
3046         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3047
3048         if (intel_iommu_strict) {
3049                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3050                                       last_pfn - start_pfn + 1, 0);
3051                 /* free iova */
3052                 __free_iova(&domain->iovad, iova);
3053         } else {
3054                 add_unmap(domain, iova);
3055                 /*
3056                  * queue up the release of the unmap to save roughly the
3057                  * 1/6th of CPU time used by the iotlb flush operation...
3058                  */
3059         }
3060 }
3061
3062 static int intel_nontranslate_map_sg(struct device *hwdev,
3063         struct scatterlist *sglist, int nelems, int dir)
3064 {
3065         int i;
3066         struct scatterlist *sg;
3067
3068         for_each_sg(sglist, sg, nelems, i) {
3069                 BUG_ON(!sg_page(sg));
3070                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3071                 sg->dma_length = sg->length;
3072         }
3073         return nelems;
3074 }
3075
3076 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3077                         enum dma_data_direction dir, struct dma_attrs *attrs)
3078 {
3079         int i;
3080         struct pci_dev *pdev = to_pci_dev(hwdev);
3081         struct dmar_domain *domain;
3082         size_t size = 0;
3083         int prot = 0;
3084         struct iova *iova = NULL;
3085         int ret;
3086         struct scatterlist *sg;
3087         unsigned long start_vpfn;
3088         struct intel_iommu *iommu;
3089
3090         BUG_ON(dir == DMA_NONE);
3091         if (iommu_no_mapping(hwdev))
3092                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3093
3094         domain = get_valid_domain_for_dev(pdev);
3095         if (!domain)
3096                 return 0;
3097
3098         iommu = domain_get_iommu(domain);
3099
3100         for_each_sg(sglist, sg, nelems, i)
3101                 size += aligned_nrpages(sg->offset, sg->length);
3102
3103         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3104                                 pdev->dma_mask);
3105         if (!iova) {
3106                 sglist->dma_length = 0;
3107                 return 0;
3108         }
3109
3110         /*
3111          * Check if DMAR supports zero-length reads on write-only
3112          * mappings.
3113          */
3114         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3115                         !cap_zlr(iommu->cap))
3116                 prot |= DMA_PTE_READ;
3117         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3118                 prot |= DMA_PTE_WRITE;
3119
3120         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3121
3122         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3123         if (unlikely(ret)) {
3124                 /* clear the page table entries */
3125                 dma_pte_clear_range(domain, start_vpfn,
3126                                     start_vpfn + size - 1);
3127                 /* free page tables */
3128                 dma_pte_free_pagetable(domain, start_vpfn,
3129                                        start_vpfn + size - 1);
3130                 /* free iova */
3131                 __free_iova(&domain->iovad, iova);
3132                 return 0;
3133         }
3134
3135         /* it's a non-present to present mapping. Only flush if caching mode */
3136         if (cap_caching_mode(iommu->cap))
3137                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3138         else
3139                 iommu_flush_write_buffer(iommu);
3140
3141         return nelems;
3142 }
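
/*
 * Illustrative sketch, not part of the original driver: building a small
 * scatterlist and mapping it through the generic API exercises
 * intel_map_sg()/intel_unmap_sg() above (or intel_nontranslate_map_sg()
 * when the device is identity mapped). The helper name and page array are
 * placeholder assumptions.
 */
#if 0	/* example only, not compiled */
static int example_sg_dma(struct device *dev, struct page **pages, int n)
{
        struct scatterlist *sgl, *sg;
        int i, mapped;

        sgl = kcalloc(n, sizeof(*sgl), GFP_KERNEL);
        if (!sgl)
                return -ENOMEM;

        sg_init_table(sgl, n);
        for_each_sg(sgl, sg, n, i)
                sg_set_page(sg, pages[i], PAGE_SIZE, 0);

        /* routed to intel_map_sg(); a return of 0 means the mapping failed */
        mapped = dma_map_sg(dev, sgl, n, DMA_FROM_DEVICE);
        if (!mapped) {
                kfree(sgl);
                return -EIO;
        }

        /* ... device scatters data into the pages ... */

        dma_unmap_sg(dev, sgl, n, DMA_FROM_DEVICE);  /* -> intel_unmap_sg() */
        kfree(sgl);
        return 0;
}
#endif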
3143
3144 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3145 {
3146         return !dma_addr;
3147 }
3148
3149 struct dma_map_ops intel_dma_ops = {
3150         .alloc_coherent = intel_alloc_coherent,
3151         .free_coherent = intel_free_coherent,
3152         .map_sg = intel_map_sg,
3153         .unmap_sg = intel_unmap_sg,
3154         .map_page = intel_map_page,
3155         .unmap_page = intel_unmap_page,
3156         .mapping_error = intel_mapping_error,
3157 };
3158
3159 static inline int iommu_domain_cache_init(void)
3160 {
3161         int ret = 0;
3162
3163         iommu_domain_cache = kmem_cache_create("iommu_domain",
3164                                          sizeof(struct dmar_domain),
3165                                          0,
3166                                          SLAB_HWCACHE_ALIGN,
3168                                          NULL);
3169         if (!iommu_domain_cache) {
3170                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3171                 ret = -ENOMEM;
3172         }
3173
3174         return ret;
3175 }
3176
3177 static inline int iommu_devinfo_cache_init(void)
3178 {
3179         int ret = 0;
3180
3181         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3182                                          sizeof(struct device_domain_info),
3183                                          0,
3184                                          SLAB_HWCACHE_ALIGN,
3185                                          NULL);
3186         if (!iommu_devinfo_cache) {
3187                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3188                 ret = -ENOMEM;
3189         }
3190
3191         return ret;
3192 }
3193
3194 static inline int iommu_iova_cache_init(void)
3195 {
3196         int ret = 0;
3197
3198         iommu_iova_cache = kmem_cache_create("iommu_iova",
3199                                          sizeof(struct iova),
3200                                          0,
3201                                          SLAB_HWCACHE_ALIGN,
3202                                          NULL);
3203         if (!iommu_iova_cache) {
3204                 printk(KERN_ERR "Couldn't create iova cache\n");
3205                 ret = -ENOMEM;
3206         }
3207
3208         return ret;
3209 }
3210
3211 static int __init iommu_init_mempool(void)
3212 {
3213         int ret;
3214         ret = iommu_iova_cache_init();
3215         if (ret)
3216                 return ret;
3217
3218         ret = iommu_domain_cache_init();
3219         if (ret)
3220                 goto domain_error;
3221
3222         ret = iommu_devinfo_cache_init();
3223         if (!ret)
3224                 return ret;
3225
3226         kmem_cache_destroy(iommu_domain_cache);
3227 domain_error:
3228         kmem_cache_destroy(iommu_iova_cache);
3229
3230         return -ENOMEM;
3231 }
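
/*
 * Illustrative sketch, not part of the original driver: objects are carved
 * out of the slab caches created above with the standard kmem_cache API.
 * The alloc_domain_mem()/free_domain_mem() style helpers defined earlier in
 * this file presumably reduce to calls of this shape; the function names
 * below are placeholder assumptions.
 */
#if 0	/* example only, not compiled */
static struct dmar_domain *example_domain_alloc(void)
{
        /* cache-backed allocation; GFP_ATOMIC keeps it usable from atomic
         * context (an assumption about the callers, not a statement about
         * this driver) */
        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void example_domain_free(struct dmar_domain *domain)
{
        kmem_cache_free(iommu_domain_cache, domain);
}
#endif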
3232
3233 static void __init iommu_exit_mempool(void)
3234 {
3235         kmem_cache_destroy(iommu_devinfo_cache);
3236         kmem_cache_destroy(iommu_domain_cache);
3237         kmem_cache_destroy(iommu_iova_cache);
3238
3239 }
3240
3241 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3242 {
3243         struct dmar_drhd_unit *drhd;
3244         u32 vtbar;
3245         int rc;
3246
3247         /* We know that this device on this chipset has its own IOMMU.
3248          * If we find it under a different IOMMU, then the BIOS is lying
3249          * to us. Hope that the IOMMU for this device is actually
3250          * disabled, and it needs no translation...
3251          */
3252         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3253         if (rc) {
3254                 /* "can't" happen */
3255                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3256                 return;
3257         }
3258         vtbar &= 0xffff0000;
3259
3260         /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3261         drhd = dmar_find_matched_drhd_unit(pdev);
3262         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3263                             TAINT_FIRMWARE_WORKAROUND,
3264                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3265                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3266 }
3267 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3268
3269 static void __init init_no_remapping_devices(void)
3270 {
3271         struct dmar_drhd_unit *drhd;
3272
3273         for_each_drhd_unit(drhd) {
3274                 if (!drhd->include_all) {
3275                         int i;
3276                         for (i = 0; i < drhd->devices_cnt; i++)
3277                                 if (drhd->devices[i] != NULL)
3278                                         break;
3279                         /* ignore DMAR unit if no pci devices exist */
3280                         if (i == drhd->devices_cnt)
3281                                 drhd->ignored = 1;
3282                 }
3283         }
3284
3285         for_each_drhd_unit(drhd) {
3286                 int i;
3287                 if (drhd->ignored || drhd->include_all)
3288                         continue;
3289
3290                 for (i = 0; i < drhd->devices_cnt; i++)
3291                         if (drhd->devices[i] &&
3292                             !IS_GFX_DEVICE(drhd->devices[i]))
3293                                 break;
3294
3295                 if (i < drhd->devices_cnt)
3296                         continue;
3297
3298                 /* This IOMMU has *only* gfx devices. Either bypass it or
3299                    set the gfx_mapped flag, as appropriate */
3300                 if (dmar_map_gfx) {
3301                         intel_iommu_gfx_mapped = 1;
3302                 } else {
3303                         drhd->ignored = 1;
3304                         for (i = 0; i < drhd->devices_cnt; i++) {
3305                                 if (!drhd->devices[i])
3306                                         continue;
3307                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3308                         }
3309                 }
3310         }
3311 }
3312
3313 #ifdef CONFIG_SUSPEND
3314 static int init_iommu_hw(void)
3315 {
3316         struct dmar_drhd_unit *drhd;
3317         struct intel_iommu *iommu = NULL;
3318
3319         for_each_active_iommu(iommu, drhd)
3320                 if (iommu->qi)
3321                         dmar_reenable_qi(iommu);
3322
3323         for_each_iommu(iommu, drhd) {
3324                 if (drhd->ignored) {
3325                         /*
3326                          * we always have to disable PMRs or DMA may fail on
3327                          * this device
3328                          */
3329                         if (force_on)
3330                                 iommu_disable_protect_mem_regions(iommu);
3331                         continue;
3332                 }
3333
3334                 iommu_flush_write_buffer(iommu);
3335
3336                 iommu_set_root_entry(iommu);
3337
3338                 iommu->flush.flush_context(iommu, 0, 0, 0,
3339                                            DMA_CCMD_GLOBAL_INVL);
3340                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3341                                          DMA_TLB_GLOBAL_FLUSH);
3342                 if (iommu_enable_translation(iommu))
3343                         return 1;
3344                 iommu_disable_protect_mem_regions(iommu);
3345         }
3346
3347         return 0;
3348 }
3349
3350 static void iommu_flush_all(void)
3351 {
3352         struct dmar_drhd_unit *drhd;
3353         struct intel_iommu *iommu;
3354
3355         for_each_active_iommu(iommu, drhd) {
3356                 iommu->flush.flush_context(iommu, 0, 0, 0,
3357                                            DMA_CCMD_GLOBAL_INVL);
3358                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3359                                          DMA_TLB_GLOBAL_FLUSH);
3360         }
3361 }
3362
3363 static int iommu_suspend(void)
3364 {
3365         struct dmar_drhd_unit *drhd;
3366         struct intel_iommu *iommu = NULL;
3367         unsigned long flag;
3368
3369         for_each_active_iommu(iommu, drhd) {
3370                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3371                                                  GFP_ATOMIC);
3372                 if (!iommu->iommu_state)
3373                         goto nomem;
3374         }
3375
3376         iommu_flush_all();
3377
3378         for_each_active_iommu(iommu, drhd) {
3379                 iommu_disable_translation(iommu);
3380
3381                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3382
3383                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3384                         readl(iommu->reg + DMAR_FECTL_REG);
3385                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3386                         readl(iommu->reg + DMAR_FEDATA_REG);
3387                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3388                         readl(iommu->reg + DMAR_FEADDR_REG);
3389                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3390                         readl(iommu->reg + DMAR_FEUADDR_REG);
3391
3392                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3393         }
3394         return 0;
3395
3396 nomem:
3397         for_each_active_iommu(iommu, drhd)
3398                 kfree(iommu->iommu_state);
3399
3400         return -ENOMEM;
3401 }
3402
3403 static void iommu_resume(void)
3404 {
3405         struct dmar_drhd_unit *drhd;
3406         struct intel_iommu *iommu = NULL;
3407         unsigned long flag;
3408
3409         if (init_iommu_hw()) {
3410                 if (force_on)
3411                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3412                 else
3413                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3414                 return;
3415         }
3416
3417         for_each_active_iommu(iommu, drhd) {
3418
3419                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3420
3421                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3422                         iommu->reg + DMAR_FECTL_REG);
3423                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3424                         iommu->reg + DMAR_FEDATA_REG);
3425                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3426                         iommu->reg + DMAR_FEADDR_REG);
3427                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3428                         iommu->reg + DMAR_FEUADDR_REG);
3429
3430                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3431         }
3432
3433         for_each_active_iommu(iommu, drhd)
3434                 kfree(iommu->iommu_state);
3435 }
3436
3437 static struct syscore_ops iommu_syscore_ops = {
3438         .resume         = iommu_resume,
3439         .suspend        = iommu_suspend,
3440 };
3441
3442 static void __init init_iommu_pm_ops(void)
3443 {
3444         register_syscore_ops(&iommu_syscore_ops);
3445 }
3446
3447 #else
3448 static inline void init_iommu_pm_ops(void) {}
3449 #endif  /* CONFIG_SUSPEND */
3450
3451 LIST_HEAD(dmar_rmrr_units);
3452
3453 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3454 {
3455         list_add(&rmrr->list, &dmar_rmrr_units);
3456 }
3457
3458
3459 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3460 {
3461         struct acpi_dmar_reserved_memory *rmrr;
3462         struct dmar_rmrr_unit *rmrru;
3463
3464         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3465         if (!rmrru)
3466                 return -ENOMEM;
3467
3468         rmrru->hdr = header;
3469         rmrr = (struct acpi_dmar_reserved_memory *)header;
3470         rmrru->base_address = rmrr->base_address;
3471         rmrru->end_address = rmrr->end_address;
3472
3473         dmar_register_rmrr_unit(rmrru);
3474         return 0;
3475 }
3476
3477 static int __init
3478 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3479 {
3480         struct acpi_dmar_reserved_memory *rmrr;
3481         int ret;
3482
3483         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3484         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3485                 ((void *)rmrr) + rmrr->header.length,
3486                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3487
3488         if (ret || (rmrru->devices_cnt == 0)) {
3489                 list_del(&rmrru->list);
3490                 kfree(rmrru);
3491         }
3492         return ret;
3493 }
3494
3495 static LIST_HEAD(dmar_atsr_units);
3496
3497 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3498 {
3499         struct acpi_dmar_atsr *atsr;
3500         struct dmar_atsr_unit *atsru;
3501
3502         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3503         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3504         if (!atsru)
3505                 return -ENOMEM;
3506
3507         atsru->hdr = hdr;
3508         atsru->include_all = atsr->flags & 0x1;
3509
3510         list_add(&atsru->list, &dmar_atsr_units);
3511
3512         return 0;
3513 }
3514
3515 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3516 {
3517         int rc;
3518         struct acpi_dmar_atsr *atsr;
3519
3520         if (atsru->include_all)
3521                 return 0;
3522
3523         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3524         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3525                                 (void *)atsr + atsr->header.length,
3526                                 &atsru->devices_cnt, &atsru->devices,
3527                                 atsr->segment);
3528         if (rc || !atsru->devices_cnt) {
3529                 list_del(&atsru->list);
3530                 kfree(atsru);
3531         }
3532
3533         return rc;
3534 }
3535
3536 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3537 {
3538         int i;
3539         struct pci_bus *bus;
3540         struct acpi_dmar_atsr *atsr;
3541         struct dmar_atsr_unit *atsru;
3542
3543         dev = pci_physfn(dev);
3544
3545         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3546                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3547                 if (atsr->segment == pci_domain_nr(dev->bus))
3548                         goto found;
3549         }
3550
3551         return 0;
3552
3553 found:
3554         for (bus = dev->bus; bus; bus = bus->parent) {
3555                 struct pci_dev *bridge = bus->self;
3556
3557                 if (!bridge || !pci_is_pcie(bridge) ||
3558                     bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3559                         return 0;
3560
3561                 if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3562                         for (i = 0; i < atsru->devices_cnt; i++)
3563                                 if (atsru->devices[i] == bridge)
3564                                         return 1;
3565                         break;
3566                 }
3567         }
3568
3569         if (atsru->include_all)
3570                 return 1;
3571
3572         return 0;
3573 }
3574
3575 int __init dmar_parse_rmrr_atsr_dev(void)
3576 {
3577         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3578         struct dmar_atsr_unit *atsr, *atsr_n;
3579         int ret = 0;
3580
3581         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3582                 ret = rmrr_parse_dev(rmrr);
3583                 if (ret)
3584                         return ret;
3585         }
3586
3587         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3588                 ret = atsr_parse_dev(atsr);
3589                 if (ret)
3590                         return ret;
3591         }
3592
3593         return ret;
3594 }
3595
3596 /*
3597  * Here we only respond to a device being unbound from its driver.
3598  *
3599  * A newly added device is not attached to its DMAR domain here yet; that
3600  * happens when the device is first mapped to an iova.
3601  */
3602 static int device_notifier(struct notifier_block *nb,
3603                                   unsigned long action, void *data)
3604 {
3605         struct device *dev = data;
3606         struct pci_dev *pdev = to_pci_dev(dev);
3607         struct dmar_domain *domain;
3608
3609         if (iommu_no_mapping(dev))
3610                 return 0;
3611
3612         domain = find_domain(pdev);
3613         if (!domain)
3614                 return 0;
3615
3616         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3617                 domain_remove_one_dev_info(domain, pdev);
3618
3619                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3620                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3621                     list_empty(&domain->devices))
3622                         domain_exit(domain);
3623         }
3624
3625         return 0;
3626 }
3627
3628 static struct notifier_block device_nb = {
3629         .notifier_call = device_notifier,
3630 };
3631
3632 int __init intel_iommu_init(void)
3633 {
3634         int ret = 0;
3635
3636         /* VT-d is required for a TXT/tboot launch, so enforce that */
3637         force_on = tboot_force_iommu();
3638
3639         if (dmar_table_init()) {
3640                 if (force_on)
3641                         panic("tboot: Failed to initialize DMAR table\n");
3642                 return  -ENODEV;
3643         }
3644
3645         if (dmar_dev_scope_init() < 0) {
3646                 if (force_on)
3647                         panic("tboot: Failed to initialize DMAR device scope\n");
3648                 return  -ENODEV;
3649         }
3650
3651         if (no_iommu || dmar_disabled)
3652                 return -ENODEV;
3653
3654         if (iommu_init_mempool()) {
3655                 if (force_on)
3656                         panic("tboot: Failed to initialize iommu memory\n");
3657                 return  -ENODEV;
3658         }
3659
3660         if (list_empty(&dmar_rmrr_units))
3661                 printk(KERN_INFO "DMAR: No RMRR found\n");
3662
3663         if (list_empty(&dmar_atsr_units))
3664                 printk(KERN_INFO "DMAR: No ATSR found\n");
3665
3666         if (dmar_init_reserved_ranges()) {
3667                 if (force_on)
3668                         panic("tboot: Failed to reserve iommu ranges\n");
3669                 return  -ENODEV;
3670         }
3671
3672         init_no_remapping_devices();
3673
3674         ret = init_dmars();
3675         if (ret) {
3676                 if (force_on)
3677                         panic("tboot: Failed to initialize DMARs\n");
3678                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3679                 put_iova_domain(&reserved_iova_list);
3680                 iommu_exit_mempool();
3681                 return ret;
3682         }
3683         printk(KERN_INFO
3684         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3685
3686         init_timer(&unmap_timer);
3687 #ifdef CONFIG_SWIOTLB
3688         swiotlb = 0;
3689 #endif
3690         dma_ops = &intel_dma_ops;
3691
3692         init_iommu_pm_ops();
3693
3694         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3695
3696         bus_register_notifier(&pci_bus_type, &device_nb);
3697
3698         intel_iommu_enabled = 1;
3699
3700         return 0;
3701 }
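
/*
 * Illustrative configuration sketch, not part of the original driver: the
 * behaviour selected above can be tuned from the kernel command line via
 * options parsed earlier in this file (and in arch code for iommu=pt), e.g.:
 *
 *     intel_iommu=on              force-enable DMAR translation
 *     intel_iommu=off             disable it entirely
 *     intel_iommu=strict          flush the IOTLB synchronously on unmap
 *                                 instead of batching via add_unmap()
 *     intel_iommu=igfx_off        keep integrated graphics untranslated
 *     iommu=pt                    use pass-through (identity) mapping
 *
 * Exact option spellings depend on the kernel generation; treat this list
 * as a sketch rather than a reference.
 */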
3702
3703 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3704                                            struct pci_dev *pdev)
3705 {
3706         struct pci_dev *tmp, *parent;
3707
3708         if (!iommu || !pdev)
3709                 return;
3710
3711         /* dependent device detach */
3712         tmp = pci_find_upstream_pcie_bridge(pdev);
3713         /* Secondary interface's bus number and devfn 0 */
3714         if (tmp) {
3715                 parent = pdev->bus->self;
3716                 while (parent != tmp) {
3717                         iommu_detach_dev(iommu, parent->bus->number,
3718                                          parent->devfn);
3719                         parent = parent->bus->self;
3720                 }
3721                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3722                         iommu_detach_dev(iommu,
3723                                 tmp->subordinate->number, 0);
3724                 else /* this is a legacy PCI bridge */
3725                         iommu_detach_dev(iommu, tmp->bus->number,
3726                                          tmp->devfn);
3727         }
3728 }
3729
3730 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3731                                           struct pci_dev *pdev)
3732 {
3733         struct device_domain_info *info;
3734         struct intel_iommu *iommu;
3735         unsigned long flags;
3736         int found = 0;
3737         struct list_head *entry, *tmp;
3738
3739         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3740                                 pdev->devfn);
3741         if (!iommu)
3742                 return;
3743
3744         spin_lock_irqsave(&device_domain_lock, flags);
3745         list_for_each_safe(entry, tmp, &domain->devices) {
3746                 info = list_entry(entry, struct device_domain_info, link);
3747                 if (info->segment == pci_domain_nr(pdev->bus) &&
3748                     info->bus == pdev->bus->number &&
3749                     info->devfn == pdev->devfn) {
3750                         list_del(&info->link);
3751                         list_del(&info->global);
3752                         if (info->dev)
3753                                 info->dev->dev.archdata.iommu = NULL;
3754                         spin_unlock_irqrestore(&device_domain_lock, flags);
3755
3756                         iommu_disable_dev_iotlb(info);
3757                         iommu_detach_dev(iommu, info->bus, info->devfn);
3758                         iommu_detach_dependent_devices(iommu, pdev);
3759                         free_devinfo_mem(info);
3760
3761                         spin_lock_irqsave(&device_domain_lock, flags);
3762
3763                         if (found)
3764                                 break;
3765                         else
3766                                 continue;
3767                 }
3768
3769                 /* if there are no other devices under the same iommu
3770                  * owned by this domain, clear this iommu in iommu_bmp,
3771                  * update iommu count and coherency
3772                  */
3773                 if (iommu == device_to_iommu(info->segment, info->bus,
3774                                             info->devfn))
3775                         found = 1;
3776         }
3777
3778         spin_unlock_irqrestore(&device_domain_lock, flags);
3779
3780         if (found == 0) {
3781                 unsigned long tmp_flags;
3782                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3783                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3784                 domain->iommu_count--;
3785                 domain_update_iommu_cap(domain);
3786                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3787
3788                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3789                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3790                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3791                         clear_bit(domain->id, iommu->domain_ids);
3792                         iommu->domains[domain->id] = NULL;
3793                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3794                 }
3795         }
3796 }
3797
3798 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3799 {
3800         struct device_domain_info *info;
3801         struct intel_iommu *iommu;
3802         unsigned long flags1, flags2;
3803
3804         spin_lock_irqsave(&device_domain_lock, flags1);
3805         while (!list_empty(&domain->devices)) {
3806                 info = list_entry(domain->devices.next,
3807                         struct device_domain_info, link);
3808                 list_del(&info->link);
3809                 list_del(&info->global);
3810                 if (info->dev)
3811                         info->dev->dev.archdata.iommu = NULL;
3812
3813                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3814
3815                 iommu_disable_dev_iotlb(info);
3816                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3817                 iommu_detach_dev(iommu, info->bus, info->devfn);
3818                 iommu_detach_dependent_devices(iommu, info->dev);
3819
3820                 /* clear this iommu in iommu_bmp, update iommu count
3821                  * and capabilities
3822                  */
3823                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3824                 if (test_and_clear_bit(iommu->seq_id,
3825                                        &domain->iommu_bmp)) {
3826                         domain->iommu_count--;
3827                         domain_update_iommu_cap(domain);
3828                 }
3829                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3830
3831                 free_devinfo_mem(info);
3832                 spin_lock_irqsave(&device_domain_lock, flags1);
3833         }
3834         spin_unlock_irqrestore(&device_domain_lock, flags1);
3835 }
3836
3837 /* domain id for virtual machine; it won't be set in a context entry */
3838 static unsigned long vm_domid;
3839
3840 static struct dmar_domain *iommu_alloc_vm_domain(void)
3841 {
3842         struct dmar_domain *domain;
3843
3844         domain = alloc_domain_mem();
3845         if (!domain)
3846                 return NULL;
3847
3848         domain->id = vm_domid++;
3849         domain->nid = -1;
3850         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3851         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3852
3853         return domain;
3854 }
3855
3856 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3857 {
3858         int adjust_width;
3859
3860         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3861         spin_lock_init(&domain->iommu_lock);
3862
3863         domain_reserve_special_ranges(domain);
3864
3865         /* calculate AGAW */
3866         domain->gaw = guest_width;
3867         adjust_width = guestwidth_to_adjustwidth(guest_width);
3868         domain->agaw = width_to_agaw(adjust_width);
3869
3870         INIT_LIST_HEAD(&domain->devices);
3871
3872         domain->iommu_count = 0;
3873         domain->iommu_coherency = 0;
3874         domain->iommu_snooping = 0;
3875         domain->iommu_superpage = 0;
3876         domain->max_addr = 0;
3877         domain->nid = -1;
3878
3879         /* always allocate the top pgd */
3880         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3881         if (!domain->pgd)
3882                 return -ENOMEM;
3883         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3884         return 0;
3885 }
3886
3887 static void iommu_free_vm_domain(struct dmar_domain *domain)
3888 {
3889         unsigned long flags;
3890         struct dmar_drhd_unit *drhd;
3891         struct intel_iommu *iommu;
3892         unsigned long i;
3893         unsigned long ndomains;
3894
3895         for_each_drhd_unit(drhd) {
3896                 if (drhd->ignored)
3897                         continue;
3898                 iommu = drhd->iommu;
3899
3900                 ndomains = cap_ndoms(iommu->cap);
3901                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3902                         if (iommu->domains[i] == domain) {
3903                                 spin_lock_irqsave(&iommu->lock, flags);
3904                                 clear_bit(i, iommu->domain_ids);
3905                                 iommu->domains[i] = NULL;
3906                                 spin_unlock_irqrestore(&iommu->lock, flags);
3907                                 break;
3908                         }
3909                 }
3910         }
3911 }
3912
3913 static void vm_domain_exit(struct dmar_domain *domain)
3914 {
3915         /* Domain 0 is reserved, so don't process it */
3916         if (!domain)
3917                 return;
3918
3919         vm_domain_remove_all_dev_info(domain);
3920         /* destroy iovas */
3921         put_iova_domain(&domain->iovad);
3922
3923         /* clear ptes */
3924         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3925
3926         /* free page tables */
3927         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3928
3929         iommu_free_vm_domain(domain);
3930         free_domain_mem(domain);
3931 }
3932
3933 static int intel_iommu_domain_init(struct iommu_domain *domain)
3934 {
3935         struct dmar_domain *dmar_domain;
3936
3937         dmar_domain = iommu_alloc_vm_domain();
3938         if (!dmar_domain) {
3939                 printk(KERN_ERR
3940                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3941                 return -ENOMEM;
3942         }
3943         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3944                 printk(KERN_ERR
3945                         "intel_iommu_domain_init() failed\n");
3946                 vm_domain_exit(dmar_domain);
3947                 return -ENOMEM;
3948         }
3949         domain_update_iommu_cap(dmar_domain);
3950         domain->priv = dmar_domain;
3951
3952         return 0;
3953 }
3954
3955 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3956 {
3957         struct dmar_domain *dmar_domain = domain->priv;
3958
3959         domain->priv = NULL;
3960         vm_domain_exit(dmar_domain);
3961 }
3962
3963 static int intel_iommu_attach_device(struct iommu_domain *domain,
3964                                      struct device *dev)
3965 {
3966         struct dmar_domain *dmar_domain = domain->priv;
3967         struct pci_dev *pdev = to_pci_dev(dev);
3968         struct intel_iommu *iommu;
3969         int addr_width;
3970
3971         /* normally pdev is not mapped */
3972         if (unlikely(domain_context_mapped(pdev))) {
3973                 struct dmar_domain *old_domain;
3974
3975                 old_domain = find_domain(pdev);
3976                 if (old_domain) {
3977                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3978                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3979                                 domain_remove_one_dev_info(old_domain, pdev);
3980                         else
3981                                 domain_remove_dev_info(old_domain);
3982                 }
3983         }
3984
3985         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3986                                 pdev->devfn);
3987         if (!iommu)
3988                 return -ENODEV;
3989
3990         /* check if this iommu agaw is sufficient for max mapped address */
3991         addr_width = agaw_to_width(iommu->agaw);
3992         if (addr_width > cap_mgaw(iommu->cap))
3993                 addr_width = cap_mgaw(iommu->cap);
3994
3995         if (dmar_domain->max_addr > (1LL << addr_width)) {
3996                 printk(KERN_ERR "%s: iommu width (%d) is not "
3997                        "sufficient for the mapped address (%llx)\n",
3998                        __func__, addr_width, dmar_domain->max_addr);
3999                 return -EFAULT;
4000         }
4001         dmar_domain->gaw = addr_width;
4002
4003         /*
4004          * Knock out extra levels of page tables if necessary
4005          */
4006         while (iommu->agaw < dmar_domain->agaw) {
4007                 struct dma_pte *pte;
4008
4009                 pte = dmar_domain->pgd;
4010                 if (dma_pte_present(pte)) {
4011                         dmar_domain->pgd = (struct dma_pte *)
4012                                 phys_to_virt(dma_pte_addr(pte));
4013                         free_pgtable_page(pte);
4014                 }
4015                 dmar_domain->agaw--;
4016         }
4017
4018         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4019 }
4020
4021 static void intel_iommu_detach_device(struct iommu_domain *domain,
4022                                       struct device *dev)
4023 {
4024         struct dmar_domain *dmar_domain = domain->priv;
4025         struct pci_dev *pdev = to_pci_dev(dev);
4026
4027         domain_remove_one_dev_info(dmar_domain, pdev);
4028 }
4029
4030 static int intel_iommu_map(struct iommu_domain *domain,
4031                            unsigned long iova, phys_addr_t hpa,
4032                            int gfp_order, int iommu_prot)
4033 {
4034         struct dmar_domain *dmar_domain = domain->priv;
4035         u64 max_addr;
4036         int prot = 0;
4037         size_t size;
4038         int ret;
4039
4040         if (iommu_prot & IOMMU_READ)
4041                 prot |= DMA_PTE_READ;
4042         if (iommu_prot & IOMMU_WRITE)
4043                 prot |= DMA_PTE_WRITE;
4044         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4045                 prot |= DMA_PTE_SNP;
4046
4047         size     = PAGE_SIZE << gfp_order;
4048         max_addr = iova + size;
4049         if (dmar_domain->max_addr < max_addr) {
4050                 u64 end;
4051
4052                 /* check if minimum agaw is sufficient for mapped address */
4053                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4054                 if (end < max_addr) {
4055                         printk(KERN_ERR "%s: iommu width (%d) is not "
4056                                "sufficient for the mapped address (%llx)\n",
4057                                __func__, dmar_domain->gaw, max_addr);
4058                         return -EFAULT;
4059                 }
4060                 dmar_domain->max_addr = max_addr;
4061         }
4062         /* Round up size to next multiple of PAGE_SIZE, if it and
4063            the low bits of hpa would take us onto the next page */
4064         size = aligned_nrpages(hpa, size);
4065         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4066                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4067         return ret;
4068 }
4069
4070 static int intel_iommu_unmap(struct iommu_domain *domain,
4071                              unsigned long iova, int gfp_order)
4072 {
4073         struct dmar_domain *dmar_domain = domain->priv;
4074         size_t size = PAGE_SIZE << gfp_order;
4075         int order;
4076
4077         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4078                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4079
4080         if (dmar_domain->max_addr == iova + size)
4081                 dmar_domain->max_addr = iova;
4082
4083         return order;
4084 }
4085
4086 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4087                                             unsigned long iova)
4088 {
4089         struct dmar_domain *dmar_domain = domain->priv;
4090         struct dma_pte *pte;
4091         u64 phys = 0;
4092
4093         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4094         if (pte)
4095                 phys = dma_pte_addr(pte);
4096
4097         return phys;
4098 }
4099
4100 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4101                                       unsigned long cap)
4102 {
4103         struct dmar_domain *dmar_domain = domain->priv;
4104
4105         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4106                 return dmar_domain->iommu_snooping;
4107         if (cap == IOMMU_CAP_INTR_REMAP)
4108                 return intr_remapping_enabled;
4109
4110         return 0;
4111 }
4112
4113 static struct iommu_ops intel_iommu_ops = {
4114         .domain_init    = intel_iommu_domain_init,
4115         .domain_destroy = intel_iommu_domain_destroy,
4116         .attach_dev     = intel_iommu_attach_device,
4117         .detach_dev     = intel_iommu_detach_device,
4118         .map            = intel_iommu_map,
4119         .unmap          = intel_iommu_unmap,
4120         .iova_to_phys   = intel_iommu_iova_to_phys,
4121         .domain_has_cap = intel_iommu_domain_has_cap,
4122 };
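
/*
 * Illustrative sketch, not part of the original driver: how a consumer of
 * the generic IOMMU API (KVM device assignment, for instance) exercises the
 * intel_iommu_ops registered above. It assumes the order-based
 * iommu_map()/iommu_unmap() prototypes of this kernel generation; the
 * function name, device pointer and physical address are placeholders.
 */
#if 0	/* example only, not compiled */
static int example_assign_device(struct device *dev, phys_addr_t page_phys)
{
        struct iommu_domain *domain;
        int ret;

        domain = iommu_domain_alloc(&pci_bus_type); /* -> intel_iommu_domain_init() */
        if (!domain)
                return -ENOMEM;

        ret = iommu_attach_device(domain, dev);     /* -> intel_iommu_attach_device() */
        if (ret)
                goto out_free;

        /* map one page (gfp_order 0) at IOVA 0, readable and writable */
        ret = iommu_map(domain, 0, page_phys, 0, IOMMU_READ | IOMMU_WRITE);
        if (ret)
                goto out_detach;

        /* for a page-aligned page_phys this reads back the same address */
        WARN_ON(iommu_iova_to_phys(domain, 0) != page_phys);

        iommu_unmap(domain, 0, 0);                  /* -> intel_iommu_unmap() */
out_detach:
        iommu_detach_device(domain, dev);           /* -> intel_iommu_detach_device() */
out_free:
        iommu_domain_free(domain);                  /* -> intel_iommu_domain_destroy() */
        return ret;
}
#endif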
4123
4124 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4125 {
4126         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4127         printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4128         dmar_map_gfx = 0;
4129 }
4130
4131 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4132 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4133 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4134 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4135 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4136 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4137 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4138
4139 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4140 {
4141         /*
4142          * Mobile 4 Series Chipset neglects to set RWBF capability,
4143          * but needs it:
4144          */
4145         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4146         rwbf_quirk = 1;
4147 }
4148
4149 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4150
4151 #define GGC 0x52
4152 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4153 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4154 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4155 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4156 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4157 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4158 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4159 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4160
4161 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4162 {
4163         unsigned short ggc;
4164
4165         if (pci_read_config_word(dev, GGC, &ggc))
4166                 return;
4167
4168         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4169                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4170                 dmar_map_gfx = 0;
4171         } else if (dmar_map_gfx) {
4172                 /* we have to ensure the gfx device is idle before we flush */
4173                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4174                 intel_iommu_strict = 1;
4175         }
4176 }
4177 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4178 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4179 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4180 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4181
4182 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4183    ISOCH DMAR unit for the Azalia sound device, but not give it any
4184    TLB entries, which causes it to deadlock. Check for that.  We do
4185    this in a function called from init_dmars(), instead of in a PCI
4186    quirk, because we don't want to print the obnoxious "BIOS broken"
4187    message if VT-d is actually disabled.
4188 */
4189 static void __init check_tylersburg_isoch(void)
4190 {
4191         struct pci_dev *pdev;
4192         uint32_t vtisochctrl;
4193
4194         /* If there's no Azalia in the system anyway, forget it. */
4195         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4196         if (!pdev)
4197                 return;
4198         pci_dev_put(pdev);
4199
4200         /* System Management Registers. Might be hidden, in which case
4201            we can't do the sanity check. But that's OK, because the
4202            known-broken BIOSes _don't_ actually hide it, so far. */
4203         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4204         if (!pdev)
4205                 return;
4206
4207         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4208                 pci_dev_put(pdev);
4209                 return;
4210         }
4211
4212         pci_dev_put(pdev);
4213
4214         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4215         if (vtisochctrl & 1)
4216                 return;
4217
4218         /* Drop all bits other than the number of TLB entries */
4219         vtisochctrl &= 0x1c;
4220
4221         /* If we have the recommended number of TLB entries (16), fine. */
4222         if (vtisochctrl == 0x10)
4223                 return;
4224
4225         /* Zero TLB entries? You get to ride the short bus to school. */
4226         if (!vtisochctrl) {
4227                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4228                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4229                      dmi_get_system_info(DMI_BIOS_VENDOR),
4230                      dmi_get_system_info(DMI_BIOS_VERSION),
4231                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4232                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4233                 return;
4234         }
4235
4236         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4237                vtisochctrl);
4238 }