[pandora-kernel.git] / drivers / iommu / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
46
47 #define ROOT_SIZE               VTD_PAGE_SIZE
48 #define CONTEXT_SIZE            VTD_PAGE_SIZE
49
50 #define IS_BRIDGE_HOST_DEVICE(pdev) \
51                             ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63
64 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
65 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66
67 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
68    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
69 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
70                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
71 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
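/*
 * For example, with the default 48-bit guest address width and
 * VTD_PAGE_SHIFT == 12:
 *
 *      __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1
 *      DOMAIN_MAX_ADDR(48)  == ((1ULL << 36) - 1) << 12
 *
 * i.e. the last mappable 4KiB page just below the 256TiB boundary.
 */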
72
73 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
74 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
75 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
76
77 /* page table handling */
78 #define LEVEL_STRIDE            (9)
79 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
80
81 static inline int agaw_to_level(int agaw)
82 {
83         return agaw + 2;
84 }
85
86 static inline int agaw_to_width(int agaw)
87 {
88         return 30 + agaw * LEVEL_STRIDE;
89 }
90
91 static inline int width_to_agaw(int width)
92 {
93         return (width - 30) / LEVEL_STRIDE;
94 }
95
96 static inline unsigned int level_to_offset_bits(int level)
97 {
98         return (level - 1) * LEVEL_STRIDE;
99 }
100
101 static inline int pfn_level_offset(unsigned long pfn, int level)
102 {
103         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
104 }
105
106 static inline unsigned long level_mask(int level)
107 {
108         return -1UL << level_to_offset_bits(level);
109 }
110
111 static inline unsigned long level_size(int level)
112 {
113         return 1UL << level_to_offset_bits(level);
114 }
115
116 static inline unsigned long align_to_level(unsigned long pfn, int level)
117 {
118         return (pfn + level_size(level) - 1) & level_mask(level);
119 }
120
121 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
122 {
123         return  1 << ((lvl - 1) * LEVEL_STRIDE);
124 }
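/*
 * Illustrative note on the geometry encoded by the helpers above: with
 * LEVEL_STRIDE == 9, agaw 2 corresponds to agaw_to_width(2) == 48 bits
 * and agaw_to_level(2) == 4 page-table levels.  At level 2,
 * level_to_offset_bits(2) == 9, so pfn_level_offset() selects pfn bits
 * 9-17 and level_size(2) == 512 pages (2MiB with 4KiB VT-d pages).
 */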
125
126 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
127    are never going to work. */
128 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
129 {
130         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
131 }
132
133 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
134 {
135         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
136 }
137 static inline unsigned long page_to_dma_pfn(struct page *pg)
138 {
139         return mm_to_dma_pfn(page_to_pfn(pg));
140 }
141 static inline unsigned long virt_to_dma_pfn(void *p)
142 {
143         return page_to_dma_pfn(virt_to_page(p));
144 }
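/*
 * Illustrative note: on x86 with 4KiB kernel pages, PAGE_SHIFT ==
 * VTD_PAGE_SHIFT == 12 and the conversions above are identities.  On a
 * hypothetical configuration with 64KiB kernel pages the shift would be
 * 4, i.e. one MM pfn covers 16 DMA pfns.
 */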
145
146 /* global iommu list, set NULL for ignored DMAR units */
147 static struct intel_iommu **g_iommus;
148
149 static void __init check_tylersburg_isoch(void);
150 static int rwbf_quirk;
151
152 /*
153  * set to 1 to panic the kernel if VT-d cannot be enabled successfully
154  * (used when the kernel is launched with TXT)
155  */
156 static int force_on = 0;
157
158 /*
159  * 0: Present
160  * 1-11: Reserved
161  * 12-63: Context Ptr (12 - (haw-1))
162  * 64-127: Reserved
163  */
164 struct root_entry {
165         u64     val;
166         u64     rsvd1;
167 };
168 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
169 static inline bool root_present(struct root_entry *root)
170 {
171         return (root->val & 1);
172 }
173 static inline void set_root_present(struct root_entry *root)
174 {
175         root->val |= 1;
176 }
177 static inline void set_root_value(struct root_entry *root, unsigned long value)
178 {
179         root->val |= value & VTD_PAGE_MASK;
180 }
181
182 static inline struct context_entry *
183 get_context_addr_from_root(struct root_entry *root)
184 {
185         return (struct context_entry *)
186                 (root_present(root)?phys_to_virt(
187                 root->val & VTD_PAGE_MASK) :
188                 NULL);
189 }
190
191 /*
192  * low 64 bits:
193  * 0: present
194  * 1: fault processing disable
195  * 2-3: translation type
196  * 12-63: address space root
197  * high 64 bits:
198  * 0-2: address width
199  * 3-6: aval
200  * 8-23: domain id
201  */
202 struct context_entry {
203         u64 lo;
204         u64 hi;
205 };
206
207 static inline bool context_present(struct context_entry *context)
208 {
209         return (context->lo & 1);
210 }
211 static inline void context_set_present(struct context_entry *context)
212 {
213         context->lo |= 1;
214 }
215
216 static inline void context_set_fault_enable(struct context_entry *context)
217 {
218         context->lo &= (((u64)-1) << 2) | 1;
219 }
220
221 static inline void context_set_translation_type(struct context_entry *context,
222                                                 unsigned long value)
223 {
224         context->lo &= (((u64)-1) << 4) | 3;
225         context->lo |= (value & 3) << 2;
226 }
227
228 static inline void context_set_address_root(struct context_entry *context,
229                                             unsigned long value)
230 {
231         context->lo |= value & VTD_PAGE_MASK;
232 }
233
234 static inline void context_set_address_width(struct context_entry *context,
235                                              unsigned long value)
236 {
237         context->hi |= value & 7;
238 }
239
240 static inline void context_set_domain_id(struct context_entry *context,
241                                          unsigned long value)
242 {
243         context->hi |= (value & ((1 << 16) - 1)) << 8;
244 }
245
246 static inline void context_clear_entry(struct context_entry *context)
247 {
248         context->lo = 0;
249         context->hi = 0;
250 }
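/*
 * Illustrative sketch (not the actual call sequence here; see
 * domain_context_mapping_one() below): a present context entry is
 * normally assembled with the setters above, roughly:
 *
 *      context_set_domain_id(ce, did);
 *      context_set_address_root(ce, virt_to_phys(pgd));
 *      context_set_address_width(ce, agaw);
 *      context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
 *      context_set_fault_enable(ce);
 *      context_set_present(ce);
 */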
251
252 /*
253  * 0: readable
254  * 1: writable
255  * 2-6: reserved
256  * 7: super page
257  * 8-10: available
258  * 11: snoop behavior
259  * 12-63: Host physcial address
260  */
261 struct dma_pte {
262         u64 val;
263 };
264
265 static inline void dma_clear_pte(struct dma_pte *pte)
266 {
267         pte->val = 0;
268 }
269
270 static inline void dma_set_pte_readable(struct dma_pte *pte)
271 {
272         pte->val |= DMA_PTE_READ;
273 }
274
275 static inline void dma_set_pte_writable(struct dma_pte *pte)
276 {
277         pte->val |= DMA_PTE_WRITE;
278 }
279
280 static inline void dma_set_pte_snp(struct dma_pte *pte)
281 {
282         pte->val |= DMA_PTE_SNP;
283 }
284
285 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
286 {
287         pte->val = (pte->val & ~3) | (prot & 3);
288 }
289
290 static inline u64 dma_pte_addr(struct dma_pte *pte)
291 {
292 #ifdef CONFIG_64BIT
293         return pte->val & VTD_PAGE_MASK;
294 #else
295         /* Must have a full atomic 64-bit read */
296         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
297 #endif
298 }
299
300 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
301 {
302         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
303 }
304
305 static inline bool dma_pte_present(struct dma_pte *pte)
306 {
307         return (pte->val & 3) != 0;
308 }
309
310 static inline bool dma_pte_superpage(struct dma_pte *pte)
311 {
312         return (pte->val & (1 << 7));
313 }
314
315 static inline int first_pte_in_page(struct dma_pte *pte)
316 {
317         return !((unsigned long)pte & ~VTD_PAGE_MASK);
318 }
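/*
 * Illustrative note: a last-level PTE mapping host pfn P read/write,
 * built with the helpers above, ends up with
 *
 *      pte->val == ((u64)P << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE
 *
 * so dma_pte_present() sees the low two bits set and dma_pte_addr()
 * recovers P << VTD_PAGE_SHIFT.
 */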
319
320 /*
321  * This domain is a static identity mapping domain.
322  *      1. This domain creates a static 1:1 mapping of all usable memory.
323  *      2. It maps to each iommu if successful.
324  *      3. Each iommu maps to this domain if successful.
325  */
326 static struct dmar_domain *si_domain;
327 static int hw_pass_through = 1;
328
329 /* devices under the same p2p bridge are owned in one domain */
330 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
331
332 /* domain represents a virtual machine; more than one device
333  * across iommus may be owned by one domain, e.g. a kvm guest.
334  */
335 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
336
337 /* si_domain contains multiple devices */
338 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
339
340 struct dmar_domain {
341         int     id;                     /* domain id */
342         int     nid;                    /* node id */
343         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses */
344
345         struct list_head devices;       /* all devices' list */
346         struct iova_domain iovad;       /* iova's that belong to this domain */
347
348         struct dma_pte  *pgd;           /* virtual address */
349         int             gaw;            /* max guest address width */
350
351         /* adjusted guest address width, 0 is level 2 30-bit */
352         int             agaw;
353
354         int             flags;          /* flags to find out type of domain */
355
356         int             iommu_coherency;/* indicate coherency of iommu access */
357         int             iommu_snooping; /* indicate snooping control feature*/
358         int             iommu_count;    /* reference count of iommu */
359         int             iommu_superpage;/* Level of superpages supported:
360                                            0 == 4KiB (no superpages), 1 == 2MiB,
361                                            2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
362         spinlock_t      iommu_lock;     /* protect iommu set in domain */
363         u64             max_addr;       /* maximum mapped address */
364 };
365
366 /* PCI domain-device relationship */
367 struct device_domain_info {
368         struct list_head link;  /* link to domain siblings */
369         struct list_head global; /* link to global list */
370         int segment;            /* PCI domain */
371         u8 bus;                 /* PCI bus number */
372         u8 devfn;               /* PCI devfn number */
373         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
374         struct intel_iommu *iommu; /* IOMMU used by this device */
375         struct dmar_domain *domain; /* pointer to domain */
376 };
377
378 static void flush_unmaps_timeout(unsigned long data);
379
380 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
381
382 #define HIGH_WATER_MARK 250
383 struct deferred_flush_tables {
384         int next;
385         struct iova *iova[HIGH_WATER_MARK];
386         struct dmar_domain *domain[HIGH_WATER_MARK];
387 };
388
389 static struct deferred_flush_tables *deferred_flush;
390
391 /* number of iommus; sizes g_iommus and bounds the per-domain iommu bitmaps */
392 static int g_num_of_iommus;
393
394 static DEFINE_SPINLOCK(async_umap_flush_lock);
395 static LIST_HEAD(unmaps_to_do);
396
397 static int timer_on;
398 static long list_size;
399
400 static void domain_remove_dev_info(struct dmar_domain *domain);
401
402 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
403 int dmar_disabled = 0;
404 #else
405 int dmar_disabled = 1;
406 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
407
408 int intel_iommu_enabled = 0;
409 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
410
411 static int dmar_map_gfx = 1;
412 static int dmar_forcedac;
413 static int intel_iommu_strict;
414 static int intel_iommu_superpage = 1;
415
416 int intel_iommu_gfx_mapped;
417 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
418
419 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
420 static DEFINE_SPINLOCK(device_domain_lock);
421 static LIST_HEAD(device_domain_list);
422
423 static struct iommu_ops intel_iommu_ops;
424
425 static int __init intel_iommu_setup(char *str)
426 {
427         if (!str)
428                 return -EINVAL;
429         while (*str) {
430                 if (!strncmp(str, "on", 2)) {
431                         dmar_disabled = 0;
432                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
433                 } else if (!strncmp(str, "off", 3)) {
434                         dmar_disabled = 1;
435                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
436                 } else if (!strncmp(str, "igfx_off", 8)) {
437                         dmar_map_gfx = 0;
438                         printk(KERN_INFO
439                                 "Intel-IOMMU: disable GFX device mapping\n");
440                 } else if (!strncmp(str, "forcedac", 8)) {
441                         printk(KERN_INFO
442                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
443                         dmar_forcedac = 1;
444                 } else if (!strncmp(str, "strict", 6)) {
445                         printk(KERN_INFO
446                                 "Intel-IOMMU: disable batched IOTLB flush\n");
447                         intel_iommu_strict = 1;
448                 } else if (!strncmp(str, "sp_off", 6)) {
449                         printk(KERN_INFO
450                                 "Intel-IOMMU: disable supported super page\n");
451                         intel_iommu_superpage = 0;
452                 }
453
454                 str += strcspn(str, ",");
455                 while (*str == ',')
456                         str++;
457         }
458         return 0;
459 }
460 __setup("intel_iommu=", intel_iommu_setup);
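/*
 * Illustrative usage: the parser above accepts a comma-separated option
 * list on the kernel command line, e.g.
 *
 *      intel_iommu=on,strict,sp_off
 *
 * which enables the IOMMU, disables batched IOTLB flushing and turns
 * off superpage support.
 */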
461
462 static struct kmem_cache *iommu_domain_cache;
463 static struct kmem_cache *iommu_devinfo_cache;
464 static struct kmem_cache *iommu_iova_cache;
465
466 static inline void *alloc_pgtable_page(int node)
467 {
468         struct page *page;
469         void *vaddr = NULL;
470
471         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
472         if (page)
473                 vaddr = page_address(page);
474         return vaddr;
475 }
476
477 static inline void free_pgtable_page(void *vaddr)
478 {
479         free_page((unsigned long)vaddr);
480 }
481
482 static inline void *alloc_domain_mem(void)
483 {
484         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
485 }
486
487 static void free_domain_mem(void *vaddr)
488 {
489         kmem_cache_free(iommu_domain_cache, vaddr);
490 }
491
492 static inline void * alloc_devinfo_mem(void)
493 {
494         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
495 }
496
497 static inline void free_devinfo_mem(void *vaddr)
498 {
499         kmem_cache_free(iommu_devinfo_cache, vaddr);
500 }
501
502 struct iova *alloc_iova_mem(void)
503 {
504         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
505 }
506
507 void free_iova_mem(struct iova *iova)
508 {
509         kmem_cache_free(iommu_iova_cache, iova);
510 }
511
512
513 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
514 {
515         unsigned long sagaw;
516         int agaw = -1;
517
518         sagaw = cap_sagaw(iommu->cap);
519         for (agaw = width_to_agaw(max_gaw);
520              agaw >= 0; agaw--) {
521                 if (test_bit(agaw, &sagaw))
522                         break;
523         }
524
525         return agaw;
526 }
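/*
 * Illustrative example: for max_gaw == 48, width_to_agaw(48) == 2, so the
 * loop starts at agaw 2 and walks down.  If cap_sagaw() reports support
 * for agaw 1 and 2 it returns 2 (4-level, 48-bit); if only agaw 1 is
 * supported it falls back to 1 (3-level, 39-bit); if nothing matches it
 * returns -1.
 */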
527
528 /*
529  * Calculate max SAGAW for each iommu.
530  */
531 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
532 {
533         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
534 }
535
536 /*
537  * Calculate agaw for each iommu.
538  * "SAGAW" may differ across iommus; use a default agaw, and
539  * fall back to a smaller supported agaw for iommus that don't support the default.
540  */
541 int iommu_calculate_agaw(struct intel_iommu *iommu)
542 {
543         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
544 }
545
546 /* This function only returns a single iommu in a domain */
547 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
548 {
549         int iommu_id;
550
551         /* si_domain and vm domain should not get here. */
552         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
553         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
554
555         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
556         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
557                 return NULL;
558
559         return g_iommus[iommu_id];
560 }
561
562 static void domain_update_iommu_coherency(struct dmar_domain *domain)
563 {
564         int i;
565
566         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
567
568         domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
569
570         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
571                 if (!ecap_coherent(g_iommus[i]->ecap)) {
572                         domain->iommu_coherency = 0;
573                         break;
574                 }
575         }
576 }
577
578 static void domain_update_iommu_snooping(struct dmar_domain *domain)
579 {
580         int i;
581
582         domain->iommu_snooping = 1;
583
584         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
585                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
586                         domain->iommu_snooping = 0;
587                         break;
588                 }
589         }
590 }
591
592 static void domain_update_iommu_superpage(struct dmar_domain *domain)
593 {
594         struct dmar_drhd_unit *drhd;
595         struct intel_iommu *iommu = NULL;
596         int mask = 0xf;
597
598         if (!intel_iommu_superpage) {
599                 domain->iommu_superpage = 0;
600                 return;
601         }
602
603         /* set iommu_superpage to the smallest common denominator */
604         for_each_active_iommu(iommu, drhd) {
605                 mask &= cap_super_page_val(iommu->cap);
606                 if (!mask) {
607                         break;
608                 }
609         }
610         domain->iommu_superpage = fls(mask);
611 }
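/*
 * Illustrative example: if one active iommu reports 2MiB and 1GiB
 * superpage support (cap_super_page_val == 0x3) and another reports only
 * 2MiB (0x1), the AND leaves 0x1 and fls(0x1) == 1, so the domain is
 * limited to 2MiB superpages.  If any iommu reports 0, superpages are
 * disabled for the domain entirely.
 */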
612
613 /* Some capabilities may be different across iommus */
614 static void domain_update_iommu_cap(struct dmar_domain *domain)
615 {
616         domain_update_iommu_coherency(domain);
617         domain_update_iommu_snooping(domain);
618         domain_update_iommu_superpage(domain);
619 }
620
621 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
622 {
623         struct dmar_drhd_unit *drhd = NULL;
624         int i;
625
626         for_each_drhd_unit(drhd) {
627                 if (drhd->ignored)
628                         continue;
629                 if (segment != drhd->segment)
630                         continue;
631
632                 for (i = 0; i < drhd->devices_cnt; i++) {
633                         if (drhd->devices[i] &&
634                             drhd->devices[i]->bus->number == bus &&
635                             drhd->devices[i]->devfn == devfn)
636                                 return drhd->iommu;
637                         if (drhd->devices[i] &&
638                             drhd->devices[i]->subordinate &&
639                             drhd->devices[i]->subordinate->number <= bus &&
640                             drhd->devices[i]->subordinate->subordinate >= bus)
641                                 return drhd->iommu;
642                 }
643
644                 if (drhd->include_all)
645                         return drhd->iommu;
646         }
647
648         return NULL;
649 }
650
651 static void domain_flush_cache(struct dmar_domain *domain,
652                                void *addr, int size)
653 {
654         if (!domain->iommu_coherency)
655                 clflush_cache_range(addr, size);
656 }
657
658 /* Gets context entry for a given bus and devfn */
659 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
660                 u8 bus, u8 devfn)
661 {
662         struct root_entry *root;
663         struct context_entry *context;
664         unsigned long phy_addr;
665         unsigned long flags;
666
667         spin_lock_irqsave(&iommu->lock, flags);
668         root = &iommu->root_entry[bus];
669         context = get_context_addr_from_root(root);
670         if (!context) {
671                 context = (struct context_entry *)
672                                 alloc_pgtable_page(iommu->node);
673                 if (!context) {
674                         spin_unlock_irqrestore(&iommu->lock, flags);
675                         return NULL;
676                 }
677                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
678                 phy_addr = virt_to_phys((void *)context);
679                 set_root_value(root, phy_addr);
680                 set_root_present(root);
681                 __iommu_flush_cache(iommu, root, sizeof(*root));
682         }
683         spin_unlock_irqrestore(&iommu->lock, flags);
684         return &context[devfn];
685 }
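/*
 * Illustrative note: the root table has ROOT_ENTRY_NR == 256 entries, one
 * per bus, and each bus gets a 4KiB context table of 256 16-byte entries
 * indexed by devfn.  E.g. for a hypothetical device 0000:1a:02.0
 * (devfn 0x10) the function consults root_entry[0x1a] and returns
 * &context[0x10], allocating the context table on first use.
 */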
686
687 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
688 {
689         struct root_entry *root;
690         struct context_entry *context;
691         int ret;
692         unsigned long flags;
693
694         spin_lock_irqsave(&iommu->lock, flags);
695         root = &iommu->root_entry[bus];
696         context = get_context_addr_from_root(root);
697         if (!context) {
698                 ret = 0;
699                 goto out;
700         }
701         ret = context_present(&context[devfn]);
702 out:
703         spin_unlock_irqrestore(&iommu->lock, flags);
704         return ret;
705 }
706
707 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
708 {
709         struct root_entry *root;
710         struct context_entry *context;
711         unsigned long flags;
712
713         spin_lock_irqsave(&iommu->lock, flags);
714         root = &iommu->root_entry[bus];
715         context = get_context_addr_from_root(root);
716         if (context) {
717                 context_clear_entry(&context[devfn]);
718                 __iommu_flush_cache(iommu, &context[devfn], \
719                         sizeof(*context));
720         }
721         spin_unlock_irqrestore(&iommu->lock, flags);
722 }
723
724 static void free_context_table(struct intel_iommu *iommu)
725 {
726         struct root_entry *root;
727         int i;
728         unsigned long flags;
729         struct context_entry *context;
730
731         spin_lock_irqsave(&iommu->lock, flags);
732         if (!iommu->root_entry) {
733                 goto out;
734         }
735         for (i = 0; i < ROOT_ENTRY_NR; i++) {
736                 root = &iommu->root_entry[i];
737                 context = get_context_addr_from_root(root);
738                 if (context)
739                         free_pgtable_page(context);
740         }
741         free_pgtable_page(iommu->root_entry);
742         iommu->root_entry = NULL;
743 out:
744         spin_unlock_irqrestore(&iommu->lock, flags);
745 }
746
747 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
748                                       unsigned long pfn, int target_level)
749 {
750         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
751         struct dma_pte *parent, *pte = NULL;
752         int level = agaw_to_level(domain->agaw);
753         int offset;
754
755         BUG_ON(!domain->pgd);
756         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
757         parent = domain->pgd;
758
759         while (level > 0) {
760                 void *tmp_page;
761
762                 offset = pfn_level_offset(pfn, level);
763                 pte = &parent[offset];
764                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
765                         break;
766                 if (level == target_level)
767                         break;
768
769                 if (!dma_pte_present(pte)) {
770                         uint64_t pteval;
771
772                         tmp_page = alloc_pgtable_page(domain->nid);
773
774                         if (!tmp_page)
775                                 return NULL;
776
777                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
778                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
779                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
780                                 /* Someone else set it while we were thinking; use theirs. */
781                                 free_pgtable_page(tmp_page);
782                         } else {
783                                 dma_pte_addr(pte);
784                                 domain_flush_cache(domain, pte, sizeof(*pte));
785                         }
786                 }
787                 parent = phys_to_virt(dma_pte_addr(pte));
788                 level--;
789         }
790
791         return pte;
792 }
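/*
 * Illustrative note: for a 4-level table (agaw 2) and target_level 1,
 * the walk above descends from level 4 to level 1, indexing each level
 * with pfn_level_offset() and allocating missing intermediate tables on
 * the way down.  The cmpxchg64() handles a concurrent walker racing to
 * populate the same slot: the loser frees its freshly allocated page and
 * continues with the winner's entry.
 */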
793
794
795 /* return address's pte at specific level */
796 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
797                                          unsigned long pfn,
798                                          int level, int *large_page)
799 {
800         struct dma_pte *parent, *pte = NULL;
801         int total = agaw_to_level(domain->agaw);
802         int offset;
803
804         parent = domain->pgd;
805         while (level <= total) {
806                 offset = pfn_level_offset(pfn, total);
807                 pte = &parent[offset];
808                 if (level == total)
809                         return pte;
810
811                 if (!dma_pte_present(pte)) {
812                         *large_page = total;
813                         break;
814                 }
815
816                 if (pte->val & DMA_PTE_LARGE_PAGE) {
817                         *large_page = total;
818                         return pte;
819                 }
820
821                 parent = phys_to_virt(dma_pte_addr(pte));
822                 total--;
823         }
824         return NULL;
825 }
826
827 /* clear last-level ptes; a tlb flush should follow */
828 static int dma_pte_clear_range(struct dmar_domain *domain,
829                                 unsigned long start_pfn,
830                                 unsigned long last_pfn)
831 {
832         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
833         unsigned int large_page = 1;
834         struct dma_pte *first_pte, *pte;
835         int order;
836
837         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
838         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
839         BUG_ON(start_pfn > last_pfn);
840
841         /* we don't need lock here; nobody else touches the iova range */
842         do {
843                 large_page = 1;
844                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
845                 if (!pte) {
846                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
847                         continue;
848                 }
849                 do {
850                         dma_clear_pte(pte);
851                         start_pfn += lvl_to_nr_pages(large_page);
852                         pte++;
853                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
854
855                 domain_flush_cache(domain, first_pte,
856                                    (void *)pte - (void *)first_pte);
857
858         } while (start_pfn && start_pfn <= last_pfn);
859
860         order = (large_page - 1) * 9;
861         return order;
862 }
863
864 static void dma_pte_free_level(struct dmar_domain *domain, int level,
865                                struct dma_pte *pte, unsigned long pfn,
866                                unsigned long start_pfn, unsigned long last_pfn)
867 {
868         pfn = max(start_pfn, pfn);
869         pte = &pte[pfn_level_offset(pfn, level)];
870
871         do {
872                 unsigned long level_pfn;
873                 struct dma_pte *level_pte;
874
875                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
876                         goto next;
877
878                 level_pfn = pfn & level_mask(level - 1);
879                 level_pte = phys_to_virt(dma_pte_addr(pte));
880
881                 if (level > 2)
882                         dma_pte_free_level(domain, level - 1, level_pte,
883                                            level_pfn, start_pfn, last_pfn);
884
885                 /* If range covers entire pagetable, free it */
886                 if (!(start_pfn > level_pfn ||
887                       last_pfn < level_pfn + level_size(level) - 1)) {
888                         dma_clear_pte(pte);
889                         domain_flush_cache(domain, pte, sizeof(*pte));
890                         free_pgtable_page(level_pte);
891                 }
892 next:
893                 pfn += level_size(level);
894         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
895 }
896
897 /* free page table pages. last level pte should already be cleared */
898 static void dma_pte_free_pagetable(struct dmar_domain *domain,
899                                    unsigned long start_pfn,
900                                    unsigned long last_pfn)
901 {
902         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
903
904         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
905         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
906         BUG_ON(start_pfn > last_pfn);
907
908         /* We don't need lock here; nobody else touches the iova range */
909         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
910                            domain->pgd, 0, start_pfn, last_pfn);
911
912         /* free pgd */
913         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
914                 free_pgtable_page(domain->pgd);
915                 domain->pgd = NULL;
916         }
917 }
918
919 /* iommu handling */
920 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
921 {
922         struct root_entry *root;
923         unsigned long flags;
924
925         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
926         if (!root)
927                 return -ENOMEM;
928
929         __iommu_flush_cache(iommu, root, ROOT_SIZE);
930
931         spin_lock_irqsave(&iommu->lock, flags);
932         iommu->root_entry = root;
933         spin_unlock_irqrestore(&iommu->lock, flags);
934
935         return 0;
936 }
937
938 static void iommu_set_root_entry(struct intel_iommu *iommu)
939 {
940         void *addr;
941         u32 sts;
942         unsigned long flag;
943
944         addr = iommu->root_entry;
945
946         raw_spin_lock_irqsave(&iommu->register_lock, flag);
947         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
948
949         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
950
951         /* Make sure hardware completes it */
952         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
953                       readl, (sts & DMA_GSTS_RTPS), sts);
954
955         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
956 }
957
958 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
959 {
960         u32 val;
961         unsigned long flag;
962
963         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
964                 return;
965
966         raw_spin_lock_irqsave(&iommu->register_lock, flag);
967         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
968
969         /* Make sure hardware completes it */
970         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
971                       readl, (!(val & DMA_GSTS_WBFS)), val);
972
973         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
974 }
975
976 /* return value determines if we need a write buffer flush */
977 static void __iommu_flush_context(struct intel_iommu *iommu,
978                                   u16 did, u16 source_id, u8 function_mask,
979                                   u64 type)
980 {
981         u64 val = 0;
982         unsigned long flag;
983
984         switch (type) {
985         case DMA_CCMD_GLOBAL_INVL:
986                 val = DMA_CCMD_GLOBAL_INVL;
987                 break;
988         case DMA_CCMD_DOMAIN_INVL:
989                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
990                 break;
991         case DMA_CCMD_DEVICE_INVL:
992                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
993                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
994                 break;
995         default:
996                 BUG();
997         }
998         val |= DMA_CCMD_ICC;
999
1000         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1001         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1002
1003         /* Make sure hardware completes it */
1004         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1005                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1006
1007         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1008 }
1009
1010 /* return value determines if we need a write buffer flush */
1011 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1012                                 u64 addr, unsigned int size_order, u64 type)
1013 {
1014         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1015         u64 val = 0, val_iva = 0;
1016         unsigned long flag;
1017
1018         switch (type) {
1019         case DMA_TLB_GLOBAL_FLUSH:
1020                 /* global flush doesn't need to set IVA_REG */
1021                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1022                 break;
1023         case DMA_TLB_DSI_FLUSH:
1024                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1025                 break;
1026         case DMA_TLB_PSI_FLUSH:
1027                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1028                 /* Note: always flush non-leaf currently */
1029                 val_iva = size_order | addr;
1030                 break;
1031         default:
1032                 BUG();
1033         }
1034         /* Note: set drain read/write */
1035 #if 0
1036         /*
1037          * This is probably only needed to be extra safe.  It looks like we
1038          * can ignore it without any impact.
1039          */
1040         if (cap_read_drain(iommu->cap))
1041                 val |= DMA_TLB_READ_DRAIN;
1042 #endif
1043         if (cap_write_drain(iommu->cap))
1044                 val |= DMA_TLB_WRITE_DRAIN;
1045
1046         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1047         /* Note: Only uses first TLB reg currently */
1048         if (val_iva)
1049                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1050         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1051
1052         /* Make sure hardware completes it */
1053         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1054                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1055
1056         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1057
1058         /* check IOTLB invalidation granularity */
1059         if (DMA_TLB_IAIG(val) == 0)
1060                 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
1061         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1062                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1063                         (unsigned long long)DMA_TLB_IIRG(type),
1064                         (unsigned long long)DMA_TLB_IAIG(val));
1065 }
1066
1067 static struct device_domain_info *iommu_support_dev_iotlb(
1068         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1069 {
1070         int found = 0;
1071         unsigned long flags;
1072         struct device_domain_info *info;
1073         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1074
1075         if (!iommu || !ecap_dev_iotlb_support(iommu->ecap))
1076                 return NULL;
1077
1078         if (!iommu->qi)
1079                 return NULL;
1080
1081         spin_lock_irqsave(&device_domain_lock, flags);
1082         list_for_each_entry(info, &domain->devices, link)
1083                 if (info->bus == bus && info->devfn == devfn) {
1084                         found = 1;
1085                         break;
1086                 }
1087         spin_unlock_irqrestore(&device_domain_lock, flags);
1088
1089         if (!found || !info->dev)
1090                 return NULL;
1091
1092         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1093                 return NULL;
1094
1095         if (!dmar_find_matched_atsr_unit(info->dev))
1096                 return NULL;
1097
1098         info->iommu = iommu;
1099
1100         return info;
1101 }
1102
1103 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1104 {
1105         if (!info)
1106                 return;
1107
1108         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1109 }
1110
1111 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1112 {
1113         if (!info->dev || !pci_ats_enabled(info->dev))
1114                 return;
1115
1116         pci_disable_ats(info->dev);
1117 }
1118
1119 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1120                                   u64 addr, unsigned mask)
1121 {
1122         u16 sid, qdep;
1123         unsigned long flags;
1124         struct device_domain_info *info;
1125
1126         spin_lock_irqsave(&device_domain_lock, flags);
1127         list_for_each_entry(info, &domain->devices, link) {
1128                 if (!info->dev || !pci_ats_enabled(info->dev))
1129                         continue;
1130
1131                 sid = info->bus << 8 | info->devfn;
1132                 qdep = pci_ats_queue_depth(info->dev);
1133                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1134         }
1135         spin_unlock_irqrestore(&device_domain_lock, flags);
1136 }
1137
1138 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1139                                   unsigned long pfn, unsigned int pages, int map)
1140 {
1141         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1142         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1143
1144         BUG_ON(pages == 0);
1145
1146         /*
1147          * Fall back to domain-selective flush if there is no PSI support or the
1148          * size is too big.
1149          * PSI requires the page size to be 2 ^ x, and the base address to be
1150          * naturally aligned to that size.
1151          */
1152         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1153                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1154                                                 DMA_TLB_DSI_FLUSH);
1155         else
1156                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1157                                                 DMA_TLB_PSI_FLUSH);
1158
1159         /*
1160          * In caching mode, changes of pages from non-present to present require
1161          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1162          */
1163         if (!cap_caching_mode(iommu->cap) || !map)
1164                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1165 }
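/*
 * Illustrative example: flushing 9 pages gives
 * mask == ilog2(__roundup_pow_of_two(9)) == ilog2(16) == 4, i.e. a
 * 16-page (64KiB with 4KiB pages) naturally aligned region is
 * invalidated.  If that mask exceeded cap_max_amask_val(), the code
 * above would fall back to a domain-selective flush instead.
 */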
1166
1167 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1168 {
1169         u32 pmen;
1170         unsigned long flags;
1171
1172         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1173         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1174         pmen &= ~DMA_PMEN_EPM;
1175         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1176
1177         /* wait for the protected region status bit to clear */
1178         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1179                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1180
1181         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1182 }
1183
1184 static int iommu_enable_translation(struct intel_iommu *iommu)
1185 {
1186         u32 sts;
1187         unsigned long flags;
1188
1189         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1190         iommu->gcmd |= DMA_GCMD_TE;
1191         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1192
1193         /* Make sure hardware completes it */
1194         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1195                       readl, (sts & DMA_GSTS_TES), sts);
1196
1197         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1198         return 0;
1199 }
1200
1201 static int iommu_disable_translation(struct intel_iommu *iommu)
1202 {
1203         u32 sts;
1204         unsigned long flag;
1205
1206         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1207         iommu->gcmd &= ~DMA_GCMD_TE;
1208         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1209
1210         /* Make sure hardware completes it */
1211         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1212                       readl, (!(sts & DMA_GSTS_TES)), sts);
1213
1214         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1215         return 0;
1216 }
1217
1218
1219 static int iommu_init_domains(struct intel_iommu *iommu)
1220 {
1221         unsigned long ndomains;
1222         unsigned long nlongs;
1223
1224         ndomains = cap_ndoms(iommu->cap);
1225         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1226                         ndomains);
1227         nlongs = BITS_TO_LONGS(ndomains);
1228
1229         spin_lock_init(&iommu->lock);
1230
1231         /* TBD: there might be 64K domains,
1232          * consider other allocation for future chips
1233          */
1234         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1235         if (!iommu->domain_ids) {
1236                 printk(KERN_ERR "Allocating domain id array failed\n");
1237                 return -ENOMEM;
1238         }
1239         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1240                         GFP_KERNEL);
1241         if (!iommu->domains) {
1242                 printk(KERN_ERR "Allocating domain array failed\n");
1243                 return -ENOMEM;
1244         }
1245
1246         /*
1247          * If caching mode is set, then invalid translations are tagged
1248          * with domain id 0. Hence we need to pre-allocate it.
1249          */
1250         if (cap_caching_mode(iommu->cap))
1251                 set_bit(0, iommu->domain_ids);
1252         return 0;
1253 }
1254
1255
1256 static void domain_exit(struct dmar_domain *domain);
1257 static void vm_domain_exit(struct dmar_domain *domain);
1258
1259 void free_dmar_iommu(struct intel_iommu *iommu)
1260 {
1261         struct dmar_domain *domain;
1262         int i;
1263         unsigned long flags;
1264
1265         if ((iommu->domains) && (iommu->domain_ids)) {
1266                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1267                         domain = iommu->domains[i];
1268                         clear_bit(i, iommu->domain_ids);
1269
1270                         spin_lock_irqsave(&domain->iommu_lock, flags);
1271                         if (--domain->iommu_count == 0) {
1272                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1273                                         vm_domain_exit(domain);
1274                                 else
1275                                         domain_exit(domain);
1276                         }
1277                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1278                 }
1279         }
1280
1281         if (iommu->gcmd & DMA_GCMD_TE)
1282                 iommu_disable_translation(iommu);
1283
1284         if (iommu->irq) {
1285                 irq_set_handler_data(iommu->irq, NULL);
1286                 /* This will mask the irq */
1287                 free_irq(iommu->irq, iommu);
1288                 destroy_irq(iommu->irq);
1289         }
1290
1291         kfree(iommu->domains);
1292         kfree(iommu->domain_ids);
1293
1294         g_iommus[iommu->seq_id] = NULL;
1295
1296         /* if all iommus are freed, free g_iommus */
1297         for (i = 0; i < g_num_of_iommus; i++) {
1298                 if (g_iommus[i])
1299                         break;
1300         }
1301
1302         if (i == g_num_of_iommus)
1303                 kfree(g_iommus);
1304
1305         /* free context mapping */
1306         free_context_table(iommu);
1307 }
1308
1309 static struct dmar_domain *alloc_domain(void)
1310 {
1311         struct dmar_domain *domain;
1312
1313         domain = alloc_domain_mem();
1314         if (!domain)
1315                 return NULL;
1316
1317         domain->nid = -1;
1318         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1319         domain->flags = 0;
1320
1321         return domain;
1322 }
1323
1324 static int iommu_attach_domain(struct dmar_domain *domain,
1325                                struct intel_iommu *iommu)
1326 {
1327         int num;
1328         unsigned long ndomains;
1329         unsigned long flags;
1330
1331         ndomains = cap_ndoms(iommu->cap);
1332
1333         spin_lock_irqsave(&iommu->lock, flags);
1334
1335         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1336         if (num >= ndomains) {
1337                 spin_unlock_irqrestore(&iommu->lock, flags);
1338                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1339                 return -ENOMEM;
1340         }
1341
1342         domain->id = num;
1343         set_bit(num, iommu->domain_ids);
1344         set_bit(iommu->seq_id, &domain->iommu_bmp);
1345         iommu->domains[num] = domain;
1346         spin_unlock_irqrestore(&iommu->lock, flags);
1347
1348         return 0;
1349 }
1350
1351 static void iommu_detach_domain(struct dmar_domain *domain,
1352                                 struct intel_iommu *iommu)
1353 {
1354         unsigned long flags;
1355         int num, ndomains;
1356         int found = 0;
1357
1358         spin_lock_irqsave(&iommu->lock, flags);
1359         ndomains = cap_ndoms(iommu->cap);
1360         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1361                 if (iommu->domains[num] == domain) {
1362                         found = 1;
1363                         break;
1364                 }
1365         }
1366
1367         if (found) {
1368                 clear_bit(num, iommu->domain_ids);
1369                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1370                 iommu->domains[num] = NULL;
1371         }
1372         spin_unlock_irqrestore(&iommu->lock, flags);
1373 }
1374
1375 static struct iova_domain reserved_iova_list;
1376 static struct lock_class_key reserved_rbtree_key;
1377
1378 static int dmar_init_reserved_ranges(void)
1379 {
1380         struct pci_dev *pdev = NULL;
1381         struct iova *iova;
1382         int i;
1383
1384         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1385
1386         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1387                 &reserved_rbtree_key);
1388
1389         /* IOAPIC ranges shouldn't be accessed by DMA */
1390         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1391                 IOVA_PFN(IOAPIC_RANGE_END));
1392         if (!iova) {
1393                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1394                 return -ENODEV;
1395         }
1396
1397         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1398         for_each_pci_dev(pdev) {
1399                 struct resource *r;
1400
1401                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1402                         r = &pdev->resource[i];
1403                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1404                                 continue;
1405                         iova = reserve_iova(&reserved_iova_list,
1406                                             IOVA_PFN(r->start),
1407                                             IOVA_PFN(r->end));
1408                         if (!iova) {
1409                                 printk(KERN_ERR "Reserve iova failed\n");
1410                                 return -ENODEV;
1411                         }
1412                 }
1413         }
1414         return 0;
1415 }
1416
1417 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1418 {
1419         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1420 }
1421
1422 static inline int guestwidth_to_adjustwidth(int gaw)
1423 {
1424         int agaw;
1425         int r = (gaw - 12) % 9;
1426
1427         if (r == 0)
1428                 agaw = gaw;
1429         else
1430                 agaw = gaw + 9 - r;
1431         if (agaw > 64)
1432                 agaw = 64;
1433         return agaw;
1434 }
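/*
 * Illustrative examples: gaw 48 gives r == (48 - 12) % 9 == 0, so agaw
 * stays 48; gaw 39 likewise stays 39; gaw 40 gives r == 1 and is rounded
 * up to 40 + 9 - 1 == 48, the next width the page-table levels can
 * actually express.
 */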
1435
1436 static int domain_init(struct dmar_domain *domain, int guest_width)
1437 {
1438         struct intel_iommu *iommu;
1439         int adjust_width, agaw;
1440         unsigned long sagaw;
1441
1442         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1443         spin_lock_init(&domain->iommu_lock);
1444
1445         domain_reserve_special_ranges(domain);
1446
1447         /* calculate AGAW */
1448         iommu = domain_get_iommu(domain);
1449         if (guest_width > cap_mgaw(iommu->cap))
1450                 guest_width = cap_mgaw(iommu->cap);
1451         domain->gaw = guest_width;
1452         adjust_width = guestwidth_to_adjustwidth(guest_width);
1453         agaw = width_to_agaw(adjust_width);
1454         sagaw = cap_sagaw(iommu->cap);
1455         if (!test_bit(agaw, &sagaw)) {
1456                 /* hardware doesn't support it, choose a bigger one */
1457                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1458                 agaw = find_next_bit(&sagaw, 5, agaw);
1459                 if (agaw >= 5)
1460                         return -ENODEV;
1461         }
1462         domain->agaw = agaw;
1463         INIT_LIST_HEAD(&domain->devices);
1464
1465         if (ecap_coherent(iommu->ecap))
1466                 domain->iommu_coherency = 1;
1467         else
1468                 domain->iommu_coherency = 0;
1469
1470         if (ecap_sc_support(iommu->ecap))
1471                 domain->iommu_snooping = 1;
1472         else
1473                 domain->iommu_snooping = 0;
1474
1475         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1476         domain->iommu_count = 1;
1477         domain->nid = iommu->node;
1478
1479         /* always allocate the top pgd */
1480         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1481         if (!domain->pgd)
1482                 return -ENOMEM;
1483         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1484         return 0;
1485 }
1486
1487 static void domain_exit(struct dmar_domain *domain)
1488 {
1489         struct dmar_drhd_unit *drhd;
1490         struct intel_iommu *iommu;
1491
1492         /* Domain 0 is reserved, so don't process it */
1493         if (!domain)
1494                 return;
1495
1496         /* Flush any lazy unmaps that may reference this domain */
1497         if (!intel_iommu_strict)
1498                 flush_unmaps_timeout(0);
1499
1500         domain_remove_dev_info(domain);
1501         /* destroy iovas */
1502         put_iova_domain(&domain->iovad);
1503
1504         /* clear ptes */
1505         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1506
1507         /* free page tables */
1508         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1509
1510         for_each_active_iommu(iommu, drhd)
1511                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1512                         iommu_detach_domain(domain, iommu);
1513
1514         free_domain_mem(domain);
1515 }
1516
1517 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1518                                  u8 bus, u8 devfn, int translation)
1519 {
1520         struct context_entry *context;
1521         unsigned long flags;
1522         struct intel_iommu *iommu;
1523         struct dma_pte *pgd;
1524         unsigned long num;
1525         unsigned long ndomains;
1526         int id;
1527         int agaw;
1528         struct device_domain_info *info = NULL;
1529
1530         pr_debug("Set context mapping for %02x:%02x.%d\n",
1531                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1532
1533         BUG_ON(!domain->pgd);
1534         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1535                translation != CONTEXT_TT_MULTI_LEVEL);
1536
1537         iommu = device_to_iommu(segment, bus, devfn);
1538         if (!iommu)
1539                 return -ENODEV;
1540
1541         context = device_to_context_entry(iommu, bus, devfn);
1542         if (!context)
1543                 return -ENOMEM;
1544         spin_lock_irqsave(&iommu->lock, flags);
1545         if (context_present(context)) {
1546                 spin_unlock_irqrestore(&iommu->lock, flags);
1547                 return 0;
1548         }
1549
1550         id = domain->id;
1551         pgd = domain->pgd;
1552
1553         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1554             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1555                 int found = 0;
1556
1557                 /* find an available domain id for this device in iommu */
1558                 ndomains = cap_ndoms(iommu->cap);
1559                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1560                         if (iommu->domains[num] == domain) {
1561                                 id = num;
1562                                 found = 1;
1563                                 break;
1564                         }
1565                 }
1566
1567                 if (found == 0) {
1568                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1569                         if (num >= ndomains) {
1570                                 spin_unlock_irqrestore(&iommu->lock, flags);
1571                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1572                                 return -EFAULT;
1573                         }
1574
1575                         set_bit(num, iommu->domain_ids);
1576                         iommu->domains[num] = domain;
1577                         id = num;
1578                 }
1579
1580                 /* Skip top levels of page tables for
1581                  * an iommu whose agaw is smaller than the default.
1582                  * This is unnecessary in pass-through mode.
1583                  */
1584                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1585                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1586                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1587                                 if (!dma_pte_present(pgd)) {
1588                                         spin_unlock_irqrestore(&iommu->lock, flags);
1589                                         return -ENOMEM;
1590                                 }
1591                         }
1592                 }
1593         }
1594
1595         context_set_domain_id(context, id);
1596
1597         if (translation != CONTEXT_TT_PASS_THROUGH) {
1598                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1599                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1600                                      CONTEXT_TT_MULTI_LEVEL;
1601         }
1602         /*
1603          * In pass through mode, AW must be programmed to indicate the largest
1604          * AGAW value supported by hardware. And ASR is ignored by hardware.
1605          */
1606         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1607                 context_set_address_width(context, iommu->msagaw);
1608         else {
1609                 context_set_address_root(context, virt_to_phys(pgd));
1610                 context_set_address_width(context, iommu->agaw);
1611         }
1612
1613         context_set_translation_type(context, translation);
1614         context_set_fault_enable(context);
1615         context_set_present(context);
1616         domain_flush_cache(domain, context, sizeof(*context));
1617
1618         /*
1619          * It's a non-present to present mapping. If hardware doesn't cache
1620          * non-present entries we only need to flush the write-buffer. If it
1621          * _does_ cache non-present entries, then it does so in the special
1622          * domain #0, which we have to flush:
1623          */
1624         if (cap_caching_mode(iommu->cap)) {
1625                 iommu->flush.flush_context(iommu, 0,
1626                                            (((u16)bus) << 8) | devfn,
1627                                            DMA_CCMD_MASK_NOBIT,
1628                                            DMA_CCMD_DEVICE_INVL);
1629                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1630         } else {
1631                 iommu_flush_write_buffer(iommu);
1632         }
1633         iommu_enable_dev_iotlb(info);
1634         spin_unlock_irqrestore(&iommu->lock, flags);
1635
1636         spin_lock_irqsave(&domain->iommu_lock, flags);
1637         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1638                 domain->iommu_count++;
1639                 if (domain->iommu_count == 1)
1640                         domain->nid = iommu->node;
1641                 domain_update_iommu_cap(domain);
1642         }
1643         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1644         return 0;
1645 }
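
/*
 * Worked example (illustrative only, the values are made up): for a device
 * at 00:1f.2 the context-cache invalidation issued above is addressed by
 * the source-id ((u16)bus << 8) | devfn = (0x00 << 8) | PCI_DEVFN(0x1f, 2)
 * = 0x00fa, i.e. exactly the bus/devfn the endpoint puts in its requests.
 */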
1646
1647 static int
1648 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1649                         int translation)
1650 {
1651         int ret;
1652         struct pci_dev *tmp, *parent;
1653
1654         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1655                                          pdev->bus->number, pdev->devfn,
1656                                          translation);
1657         if (ret)
1658                 return ret;
1659
1660         /* dependent device mapping */
1661         tmp = pci_find_upstream_pcie_bridge(pdev);
1662         if (!tmp)
1663                 return 0;
1664         /* Secondary interface's bus number and devfn 0 */
1665         parent = pdev->bus->self;
1666         while (parent != tmp) {
1667                 ret = domain_context_mapping_one(domain,
1668                                                  pci_domain_nr(parent->bus),
1669                                                  parent->bus->number,
1670                                                  parent->devfn, translation);
1671                 if (ret)
1672                         return ret;
1673                 parent = parent->bus->self;
1674         }
1675         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1676                 return domain_context_mapping_one(domain,
1677                                         pci_domain_nr(tmp->subordinate),
1678                                         tmp->subordinate->number, 0,
1679                                         translation);
1680         else /* this is a legacy PCI bridge */
1681                 return domain_context_mapping_one(domain,
1682                                                   pci_domain_nr(tmp->bus),
1683                                                   tmp->bus->number,
1684                                                   tmp->devfn,
1685                                                   translation);
1686 }
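
/*
 * Example walk (hypothetical topology): a conventional PCI device at
 * 05:00.0 behind a PCIe-to-PCI bridge whose secondary bus is 5 gets a
 * context entry for 05:00.0 itself, one for every PCI-PCI bridge between
 * it and the PCIe bridge, and finally one for (bus 5, devfn 0) -- the
 * requester-id such a bridge uses when forwarding its secondary-bus
 * traffic onto PCIe.  A typical caller, as in the RMRR path below, is:
 *
 *	ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
 *	if (ret)
 *		goto error;
 */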
1687
1688 static int domain_context_mapped(struct pci_dev *pdev)
1689 {
1690         int ret;
1691         struct pci_dev *tmp, *parent;
1692         struct intel_iommu *iommu;
1693
1694         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1695                                 pdev->devfn);
1696         if (!iommu)
1697                 return -ENODEV;
1698
1699         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1700         if (!ret)
1701                 return ret;
1702         /* dependent device mapping */
1703         tmp = pci_find_upstream_pcie_bridge(pdev);
1704         if (!tmp)
1705                 return ret;
1706         /* Secondary interface's bus number and devfn 0 */
1707         parent = pdev->bus->self;
1708         while (parent != tmp) {
1709                 ret = device_context_mapped(iommu, parent->bus->number,
1710                                             parent->devfn);
1711                 if (!ret)
1712                         return ret;
1713                 parent = parent->bus->self;
1714         }
1715         if (pci_is_pcie(tmp))
1716                 return device_context_mapped(iommu, tmp->subordinate->number,
1717                                              0);
1718         else
1719                 return device_context_mapped(iommu, tmp->bus->number,
1720                                              tmp->devfn);
1721 }
1722
1723 /* Returns a number of VTD pages, but aligned to MM page size */
1724 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1725                                             size_t size)
1726 {
1727         host_addr &= ~PAGE_MASK;
1728         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1729 }
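
/*
 * Worked example (assumes 4KiB MM and VT-d pages): for host_addr = 0x1234
 * and size = 0x2000, the in-page offset is 0x234, PAGE_ALIGN(0x234 + 0x2000)
 * is 0x3000, so aligned_nrpages() returns 0x3000 >> VTD_PAGE_SHIFT = 3.
 */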
1730
1731 /* Return largest possible superpage level for a given mapping */
1732 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1733                                           unsigned long iov_pfn,
1734                                           unsigned long phy_pfn,
1735                                           unsigned long pages)
1736 {
1737         int support, level = 1;
1738         unsigned long pfnmerge;
1739
1740         support = domain->iommu_superpage;
1741
1742         /* To use a large page, the virtual *and* physical addresses
1743            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1744            of them will mean we have to use smaller pages. So just
1745            merge them and check both at once. */
1746         pfnmerge = iov_pfn | phy_pfn;
1747
1748         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1749                 pages >>= VTD_STRIDE_SHIFT;
1750                 if (!pages)
1751                         break;
1752                 pfnmerge >>= VTD_STRIDE_SHIFT;
1753                 level++;
1754                 support--;
1755         }
1756         return level;
1757 }
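
/*
 * Worked example (illustrative): with domain->iommu_superpage == 1 (2MiB
 * superpages supported), iov_pfn == phy_pfn == 0x200 and pages == 0x200,
 * pfnmerge has no bits below VTD_STRIDE_SHIFT set, so one loop iteration
 * runs and level 2 is returned; an unaligned pfn such as 0x201, or fewer
 * than 512 pages, keeps the mapping at level 1 (4KiB pages).
 */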
1758
1759 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1760                             struct scatterlist *sg, unsigned long phys_pfn,
1761                             unsigned long nr_pages, int prot)
1762 {
1763         struct dma_pte *first_pte = NULL, *pte = NULL;
1764         phys_addr_t uninitialized_var(pteval);
1765         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1766         unsigned long sg_res;
1767         unsigned int largepage_lvl = 0;
1768         unsigned long lvl_pages = 0;
1769
1770         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1771
1772         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1773                 return -EINVAL;
1774
1775         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1776
1777         if (sg)
1778                 sg_res = 0;
1779         else {
1780                 sg_res = nr_pages + 1;
1781                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1782         }
1783
1784         while (nr_pages > 0) {
1785                 uint64_t tmp;
1786
1787                 if (!sg_res) {
1788                         sg_res = aligned_nrpages(sg->offset, sg->length);
1789                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1790                         sg->dma_length = sg->length;
1791                         pteval = page_to_phys(sg_page(sg)) | prot;
1792                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1793                 }
1794
1795                 if (!pte) {
1796                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1797
1798                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1799                         if (!pte)
1800                                 return -ENOMEM;
1801                         /* It is a large page */
1802                         if (largepage_lvl > 1) {
1803                                 pteval |= DMA_PTE_LARGE_PAGE;
1804                                 /* Ensure that old small page tables are removed to make room
1805                                    for superpage, if they exist. */
1806                                 dma_pte_clear_range(domain, iov_pfn,
1807                                                     iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1808                                 dma_pte_free_pagetable(domain, iov_pfn,
1809                                                        iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1810                         } else {
1811                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1812                         }
1813
1814                 }
1815                 /* We don't need a lock here; nobody else
1816                  * touches the iova range
1817                  */
1818                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1819                 if (tmp) {
1820                         static int dumps = 5;
1821                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1822                                iov_pfn, tmp, (unsigned long long)pteval);
1823                         if (dumps) {
1824                                 dumps--;
1825                                 debug_dma_dump_mappings(NULL);
1826                         }
1827                         WARN_ON(1);
1828                 }
1829
1830                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1831
1832                 BUG_ON(nr_pages < lvl_pages);
1833                 BUG_ON(sg_res < lvl_pages);
1834
1835                 nr_pages -= lvl_pages;
1836                 iov_pfn += lvl_pages;
1837                 phys_pfn += lvl_pages;
1838                 pteval += lvl_pages * VTD_PAGE_SIZE;
1839                 sg_res -= lvl_pages;
1840
1841                 /* If the next PTE would be the first in a new page, then we
1842                    need to flush the cache on the entries we've just written.
1843                    And then we'll need to recalculate 'pte', so clear it and
1844                    let it get set again in the if (!pte) block above.
1845
1846                    If we're done (!nr_pages) we need to flush the cache too.
1847
1848                    Also if we've been setting superpages, we may need to
1849                    recalculate 'pte' and switch back to smaller pages for the
1850                    end of the mapping, if the trailing size is not enough to
1851                    use another superpage (i.e. sg_res < lvl_pages). */
1852                 pte++;
1853                 if (!nr_pages || first_pte_in_page(pte) ||
1854                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1855                         domain_flush_cache(domain, first_pte,
1856                                            (void *)pte - (void *)first_pte);
1857                         pte = NULL;
1858                 }
1859
1860                 if (!sg_res && nr_pages)
1861                         sg = sg_next(sg);
1862         }
1863         return 0;
1864 }
1865
1866 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1867                                     struct scatterlist *sg, unsigned long nr_pages,
1868                                     int prot)
1869 {
1870         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1871 }
1872
1873 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1874                                      unsigned long phys_pfn, unsigned long nr_pages,
1875                                      int prot)
1876 {
1877         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1878 }
1879
1880 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1881 {
1882         if (!iommu)
1883                 return;
1884
1885         clear_context_table(iommu, bus, devfn);
1886         iommu->flush.flush_context(iommu, 0, 0, 0,
1887                                            DMA_CCMD_GLOBAL_INVL);
1888         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1889 }
1890
1891 static void domain_remove_dev_info(struct dmar_domain *domain)
1892 {
1893         struct device_domain_info *info;
1894         unsigned long flags;
1895         struct intel_iommu *iommu;
1896
1897         spin_lock_irqsave(&device_domain_lock, flags);
1898         while (!list_empty(&domain->devices)) {
1899                 info = list_entry(domain->devices.next,
1900                         struct device_domain_info, link);
1901                 list_del(&info->link);
1902                 list_del(&info->global);
1903                 if (info->dev)
1904                         info->dev->dev.archdata.iommu = NULL;
1905                 spin_unlock_irqrestore(&device_domain_lock, flags);
1906
1907                 iommu_disable_dev_iotlb(info);
1908                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1909                 iommu_detach_dev(iommu, info->bus, info->devfn);
1910                 free_devinfo_mem(info);
1911
1912                 spin_lock_irqsave(&device_domain_lock, flags);
1913         }
1914         spin_unlock_irqrestore(&device_domain_lock, flags);
1915 }
1916
1917 /*
1918  * find_domain
1919  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1920  */
1921 static struct dmar_domain *
1922 find_domain(struct pci_dev *pdev)
1923 {
1924         struct device_domain_info *info;
1925
1926         /* No lock here, assumes no domain exit in normal case */
1927         info = pdev->dev.archdata.iommu;
1928         if (info)
1929                 return info->domain;
1930         return NULL;
1931 }
1932
1933 /* domain is initialized */
1934 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1935 {
1936         struct dmar_domain *domain, *found = NULL;
1937         struct intel_iommu *iommu;
1938         struct dmar_drhd_unit *drhd;
1939         struct device_domain_info *info, *tmp;
1940         struct pci_dev *dev_tmp;
1941         unsigned long flags;
1942         int bus = 0, devfn = 0;
1943         int segment;
1944         int ret;
1945
1946         domain = find_domain(pdev);
1947         if (domain)
1948                 return domain;
1949
1950         segment = pci_domain_nr(pdev->bus);
1951
1952         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1953         if (dev_tmp) {
1954                 if (pci_is_pcie(dev_tmp)) {
1955                         bus = dev_tmp->subordinate->number;
1956                         devfn = 0;
1957                 } else {
1958                         bus = dev_tmp->bus->number;
1959                         devfn = dev_tmp->devfn;
1960                 }
1961                 spin_lock_irqsave(&device_domain_lock, flags);
1962                 list_for_each_entry(info, &device_domain_list, global) {
1963                         if (info->segment == segment &&
1964                             info->bus == bus && info->devfn == devfn) {
1965                                 found = info->domain;
1966                                 break;
1967                         }
1968                 }
1969                 spin_unlock_irqrestore(&device_domain_lock, flags);
1970                 /* pcie-pci bridge already has a domain, use it */
1971                 if (found) {
1972                         domain = found;
1973                         goto found_domain;
1974                 }
1975         }
1976
1977         domain = alloc_domain();
1978         if (!domain)
1979                 goto error;
1980
1981         /* Allocate new domain for the device */
1982         drhd = dmar_find_matched_drhd_unit(pdev);
1983         if (!drhd) {
1984                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1985                         pci_name(pdev));
1986                 return NULL;
1987         }
1988         iommu = drhd->iommu;
1989
1990         ret = iommu_attach_domain(domain, iommu);
1991         if (ret) {
1992                 free_domain_mem(domain);
1993                 goto error;
1994         }
1995
1996         if (domain_init(domain, gaw)) {
1997                 domain_exit(domain);
1998                 goto error;
1999         }
2000
2001         /* register pcie-to-pci device */
2002         if (dev_tmp) {
2003                 info = alloc_devinfo_mem();
2004                 if (!info) {
2005                         domain_exit(domain);
2006                         goto error;
2007                 }
2008                 info->segment = segment;
2009                 info->bus = bus;
2010                 info->devfn = devfn;
2011                 info->dev = NULL;
2012                 info->domain = domain;
2013                 /* This domain is shared by devices under p2p bridge */
2014                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2015
2016                 /* pcie-to-pci bridge already has a domain, use it */
2017                 found = NULL;
2018                 spin_lock_irqsave(&device_domain_lock, flags);
2019                 list_for_each_entry(tmp, &device_domain_list, global) {
2020                         if (tmp->segment == segment &&
2021                             tmp->bus == bus && tmp->devfn == devfn) {
2022                                 found = tmp->domain;
2023                                 break;
2024                         }
2025                 }
2026                 if (found) {
2027                         spin_unlock_irqrestore(&device_domain_lock, flags);
2028                         free_devinfo_mem(info);
2029                         domain_exit(domain);
2030                         domain = found;
2031                 } else {
2032                         list_add(&info->link, &domain->devices);
2033                         list_add(&info->global, &device_domain_list);
2034                         spin_unlock_irqrestore(&device_domain_lock, flags);
2035                 }
2036         }
2037
2038 found_domain:
2039         info = alloc_devinfo_mem();
2040         if (!info)
2041                 goto error;
2042         info->segment = segment;
2043         info->bus = pdev->bus->number;
2044         info->devfn = pdev->devfn;
2045         info->dev = pdev;
2046         info->domain = domain;
2047         spin_lock_irqsave(&device_domain_lock, flags);
2048         /* somebody else was faster and already set it up */
2049         found = find_domain(pdev);
2050         if (found != NULL) {
2051                 spin_unlock_irqrestore(&device_domain_lock, flags);
2052                 if (found != domain) {
2053                         domain_exit(domain);
2054                         domain = found;
2055                 }
2056                 free_devinfo_mem(info);
2057                 return domain;
2058         }
2059         list_add(&info->link, &domain->devices);
2060         list_add(&info->global, &device_domain_list);
2061         pdev->dev.archdata.iommu = info;
2062         spin_unlock_irqrestore(&device_domain_lock, flags);
2063         return domain;
2064 error:
2065         /* recheck it here, maybe others set it */
2066         return find_domain(pdev);
2067 }
2068
2069 static int iommu_identity_mapping;
2070 #define IDENTMAP_ALL            1
2071 #define IDENTMAP_GFX            2
2072 #define IDENTMAP_AZALIA         4
2073
2074 static int iommu_domain_identity_map(struct dmar_domain *domain,
2075                                      unsigned long long start,
2076                                      unsigned long long end)
2077 {
2078         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2079         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2080
2081         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2082                           dma_to_mm_pfn(last_vpfn))) {
2083                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2084                 return -ENOMEM;
2085         }
2086
2087         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2088                  start, end, domain->id);
2089         /*
2090          * RMRR range might have overlap with physical memory range,
2091          * clear it first
2092          */
2093         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2094
2095         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2096                                   last_vpfn - first_vpfn + 1,
2097                                   DMA_PTE_READ|DMA_PTE_WRITE);
2098 }
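
/*
 * Worked example (4KiB VT-d pages assumed): the ISA/floppy workaround
 * below maps start = 0 and end = 16MiB - 1, giving first_vpfn = 0 and
 * last_vpfn = 0xfff, so 0x1000 pages are reserved in the iova allocator
 * and then identity-mapped with DMA_PTE_READ|DMA_PTE_WRITE.
 */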
2099
2100 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2101                                       unsigned long long start,
2102                                       unsigned long long end)
2103 {
2104         struct dmar_domain *domain;
2105         int ret;
2106
2107         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2108         if (!domain)
2109                 return -ENOMEM;
2110
2111         /* For _hardware_ passthrough, don't bother. But for software
2112            passthrough, we do it anyway -- it may indicate a memory
2113            range which is reserved in E820 and so didn't get set
2114            up to start with in si_domain */
2115         if (domain == si_domain && hw_pass_through) {
2116                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2117                        pci_name(pdev), start, end);
2118                 return 0;
2119         }
2120
2121         printk(KERN_INFO
2122                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2123                pci_name(pdev), start, end);
2124
2125         if (end < start) {
2126                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2127                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2128                         dmi_get_system_info(DMI_BIOS_VENDOR),
2129                         dmi_get_system_info(DMI_BIOS_VERSION),
2130                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2131                 ret = -EIO;
2132                 goto error;
2133         }
2134
2135         if (end >> agaw_to_width(domain->agaw)) {
2136                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2137                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2138                      agaw_to_width(domain->agaw),
2139                      dmi_get_system_info(DMI_BIOS_VENDOR),
2140                      dmi_get_system_info(DMI_BIOS_VERSION),
2141                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2142                 ret = -EIO;
2143                 goto error;
2144         }
2145
2146         ret = iommu_domain_identity_map(domain, start, end);
2147         if (ret)
2148                 goto error;
2149
2150         /* context entry init */
2151         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2152         if (ret)
2153                 goto error;
2154
2155         return 0;
2156
2157  error:
2158         domain_exit(domain);
2159         return ret;
2160 }
2161
2162 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2163         struct pci_dev *pdev)
2164 {
2165         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2166                 return 0;
2167         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2168                 rmrr->end_address);
2169 }
2170
2171 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2172 static inline void iommu_prepare_isa(void)
2173 {
2174         struct pci_dev *pdev;
2175         int ret;
2176
2177         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2178         if (!pdev)
2179                 return;
2180
2181         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2182         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2183
2184         if (ret)
2185                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2186                        "floppy might not work\n");
2187
2188 }
2189 #else
2190 static inline void iommu_prepare_isa(void)
2191 {
2192         return;
2193 }
2194 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2195
2196 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2197
2198 static int __init si_domain_work_fn(unsigned long start_pfn,
2199                                     unsigned long end_pfn, void *datax)
2200 {
2201         int *ret = datax;
2202
2203         *ret = iommu_domain_identity_map(si_domain,
2204                                          (uint64_t)start_pfn << PAGE_SHIFT,
2205                                          (uint64_t)end_pfn << PAGE_SHIFT);
2206         return *ret;
2207
2208 }
2209
2210 static int __init si_domain_init(int hw)
2211 {
2212         struct dmar_drhd_unit *drhd;
2213         struct intel_iommu *iommu;
2214         int nid, ret = 0;
2215
2216         si_domain = alloc_domain();
2217         if (!si_domain)
2218                 return -EFAULT;
2219
2220         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2221
2222         for_each_active_iommu(iommu, drhd) {
2223                 ret = iommu_attach_domain(si_domain, iommu);
2224                 if (ret) {
2225                         domain_exit(si_domain);
2226                         return -EFAULT;
2227                 }
2228         }
2229
2230         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2231                 domain_exit(si_domain);
2232                 return -EFAULT;
2233         }
2234
2235         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2236
2237         if (hw)
2238                 return 0;
2239
2240         for_each_online_node(nid) {
2241                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2242                 if (ret)
2243                         return ret;
2244         }
2245
2246         return 0;
2247 }
2248
2249 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2250                                           struct pci_dev *pdev);
2251 static int identity_mapping(struct pci_dev *pdev)
2252 {
2253         struct device_domain_info *info;
2254
2255         if (likely(!iommu_identity_mapping))
2256                 return 0;
2257
2258         info = pdev->dev.archdata.iommu;
2259         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2260                 return (info->domain == si_domain);
2261
2262         return 0;
2263 }
2264
2265 static int domain_add_dev_info(struct dmar_domain *domain,
2266                                struct pci_dev *pdev,
2267                                int translation)
2268 {
2269         struct device_domain_info *info;
2270         unsigned long flags;
2271         int ret;
2272
2273         info = alloc_devinfo_mem();
2274         if (!info)
2275                 return -ENOMEM;
2276
2277         info->segment = pci_domain_nr(pdev->bus);
2278         info->bus = pdev->bus->number;
2279         info->devfn = pdev->devfn;
2280         info->dev = pdev;
2281         info->domain = domain;
2282
2283         spin_lock_irqsave(&device_domain_lock, flags);
2284         list_add(&info->link, &domain->devices);
2285         list_add(&info->global, &device_domain_list);
2286         pdev->dev.archdata.iommu = info;
2287         spin_unlock_irqrestore(&device_domain_lock, flags);
2288
2289         ret = domain_context_mapping(domain, pdev, translation);
2290         if (ret) {
2291                 spin_lock_irqsave(&device_domain_lock, flags);
2292                 list_del(&info->link);
2293                 list_del(&info->global);
2294                 pdev->dev.archdata.iommu = NULL;
2295                 spin_unlock_irqrestore(&device_domain_lock, flags);
2296                 free_devinfo_mem(info);
2297                 return ret;
2298         }
2299
2300         return 0;
2301 }
2302
2303 static bool device_has_rmrr(struct pci_dev *dev)
2304 {
2305         struct dmar_rmrr_unit *rmrr;
2306         int i;
2307
2308         for_each_rmrr_units(rmrr) {
2309                 for (i = 0; i < rmrr->devices_cnt; i++) {
2310                         /*
2311                          * Return TRUE if this RMRR contains the device that
2312                          * is passed in.
2313                          */
2314                         if (rmrr->devices[i] == dev)
2315                                 return true;
2316                 }
2317         }
2318         return false;
2319 }
2320
2321 /*
2322  * There are a couple cases where we need to restrict the functionality of
2323  * devices associated with RMRRs.  The first is when evaluating a device for
2324  * identity mapping because problems exist when devices are moved in and out
2325  * of domains and their respective RMRR information is lost.  This means that
2326  * a device with associated RMRRs will never be in a "passthrough" domain.
2327  * The second is use of the device through the IOMMU API.  This interface
2328  * expects to have full control of the IOVA space for the device.  We cannot
2329  * satisfy both the requirement that RMRR access is maintained and have an
2330  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2331  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2332  * We therefore prevent devices associated with an RMRR from participating in
2333  * the IOMMU API, which eliminates them from device assignment.
2334  *
2335  * In both cases we assume that PCI USB devices with RMRRs have them largely
2336  * for historical reasons and that the RMRR space is not actively used post
2337  * boot.  This exclusion may change if vendors begin to abuse it.
2338  */
2339 static bool device_is_rmrr_locked(struct pci_dev *pdev)
2340 {
2341         return device_has_rmrr(pdev) &&
2342                 (pdev->class >> 8) != PCI_CLASS_SERIAL_USB;
2343 }
2344
2345 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2346 {
2347
2348         if (device_is_rmrr_locked(pdev))
2349                 return 0;
2350
2351         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2352                 return 1;
2353
2354         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2355                 return 1;
2356
2357         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2358                 return 0;
2359
2360         /*
2361          * We want to start off with all devices in the 1:1 domain, and
2362          * take them out later if we find they can't access all of memory.
2363          *
2364          * However, we can't do this for PCI devices behind bridges,
2365          * because all PCI devices behind the same bridge will end up
2366          * with the same source-id on their transactions.
2367          *
2368          * Practically speaking, we can't change things around for these
2369          * devices at run-time, because we can't be sure there'll be no
2370          * DMA transactions in flight for any of their siblings.
2371          * 
2372          * So PCI devices (unless they're on the root bus) as well as
2373          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2374          * the 1:1 domain, just in _case_ one of their siblings turns out
2375          * not to be able to map all of memory.
2376          */
2377         if (!pci_is_pcie(pdev)) {
2378                 if (!pci_is_root_bus(pdev->bus))
2379                         return 0;
2380                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2381                         return 0;
2382         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2383                 return 0;
2384
2385         /* 
2386          * At boot time, we don't yet know if devices will be 64-bit capable.
2387          * Assume that they will -- if they turn out not to be, then we can 
2388          * take them out of the 1:1 domain later.
2389          */
2390         if (!startup) {
2391                 /*
2392                  * If the device's dma_mask is less than the system's memory
2393                  * size then this is not a candidate for identity mapping.
2394                  */
2395                 u64 dma_mask = pdev->dma_mask;
2396
2397                 if (pdev->dev.coherent_dma_mask &&
2398                     pdev->dev.coherent_dma_mask < dma_mask)
2399                         dma_mask = pdev->dev.coherent_dma_mask;
2400
2401                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2402         }
2403
2404         return 1;
2405 }
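
/*
 * Example (hypothetical device): a PCIe card limited to a 32 bit dma_mask
 * on a machine with more than 4GiB of RAM passes the checks above at
 * startup, but once !startup dma_get_required_mask() exceeds its mask,
 * so the function returns 0 and iommu_no_mapping() later moves the device
 * out of si_domain into a normal DMA-remapping domain.
 */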
2406
2407 static int __init iommu_prepare_static_identity_mapping(int hw)
2408 {
2409         struct pci_dev *pdev = NULL;
2410         int ret;
2411
2412         ret = si_domain_init(hw);
2413         if (ret)
2414                 return -EFAULT;
2415
2416         for_each_pci_dev(pdev) {
2417                 /* Skip Host/PCI Bridge devices */
2418                 if (IS_BRIDGE_HOST_DEVICE(pdev))
2419                         continue;
2420                 if (iommu_should_identity_map(pdev, 1)) {
2421                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2422                                hw ? "hardware" : "software", pci_name(pdev));
2423
2424                         ret = domain_add_dev_info(si_domain, pdev,
2425                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2426                                                      CONTEXT_TT_MULTI_LEVEL);
2427                         if (ret)
2428                                 return ret;
2429                 }
2430         }
2431
2432         return 0;
2433 }
2434
2435 static int __init init_dmars(void)
2436 {
2437         struct dmar_drhd_unit *drhd;
2438         struct dmar_rmrr_unit *rmrr;
2439         struct pci_dev *pdev;
2440         struct intel_iommu *iommu;
2441         int i, ret;
2442
2443         /*
2444          * for each drhd
2445          *    allocate root
2446          *    initialize and program root entry to not present
2447          * endfor
2448          */
2449         for_each_drhd_unit(drhd) {
2450                 g_num_of_iommus++;
2451                 /*
2452                  * No lock needed: this is only incremented in the
2453                  * single-threaded kernel __init code path; all other
2454                  * accesses are read-only.
2455                  */
2456         }
2457
2458         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2459                         GFP_KERNEL);
2460         if (!g_iommus) {
2461                 printk(KERN_ERR "Allocating global iommu array failed\n");
2462                 ret = -ENOMEM;
2463                 goto error;
2464         }
2465
2466         deferred_flush = kzalloc(g_num_of_iommus *
2467                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2468         if (!deferred_flush) {
2469                 ret = -ENOMEM;
2470                 goto error;
2471         }
2472
2473         for_each_drhd_unit(drhd) {
2474                 if (drhd->ignored)
2475                         continue;
2476
2477                 iommu = drhd->iommu;
2478                 g_iommus[iommu->seq_id] = iommu;
2479
2480                 ret = iommu_init_domains(iommu);
2481                 if (ret)
2482                         goto error;
2483
2484                 /*
2485                  * TBD:
2486                  * we could share the same root & context tables
2487                  * among all IOMMUs. Need to split it later.
2488                  */
2489                 ret = iommu_alloc_root_entry(iommu);
2490                 if (ret) {
2491                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2492                         goto error;
2493                 }
2494                 if (!ecap_pass_through(iommu->ecap))
2495                         hw_pass_through = 0;
2496         }
2497
2498         /*
2499          * Start from the sane iommu hardware state.
2500          * Start from a sane iommu hardware state.
2501         for_each_drhd_unit(drhd) {
2502                 if (drhd->ignored)
2503                         continue;
2504
2505                 iommu = drhd->iommu;
2506
2507                 /*
2508                  * If the queued invalidation is already initialized by us
2509                  * (for example, while enabling interrupt-remapping) then
2510                  * we got the things already rolling from a sane state.
2511                  */
2512                 if (iommu->qi)
2513                         continue;
2514
2515                 /*
2516                  * Clear any previous faults.
2517                  */
2518                 dmar_fault(-1, iommu);
2519                 /*
2520                  * Disable queued invalidation if supported and already enabled
2521                  * before OS handover.
2522                  */
2523                 dmar_disable_qi(iommu);
2524         }
2525
2526         for_each_drhd_unit(drhd) {
2527                 if (drhd->ignored)
2528                         continue;
2529
2530                 iommu = drhd->iommu;
2531
2532                 if (dmar_enable_qi(iommu)) {
2533                         /*
2534                          * Queued Invalidate not enabled, use Register Based
2535                          * Invalidate
2536                          */
2537                         iommu->flush.flush_context = __iommu_flush_context;
2538                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2539                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2540                                "invalidation\n",
2541                                 iommu->seq_id,
2542                                (unsigned long long)drhd->reg_base_addr);
2543                 } else {
2544                         iommu->flush.flush_context = qi_flush_context;
2545                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2546                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2547                                "invalidation\n",
2548                                 iommu->seq_id,
2549                                (unsigned long long)drhd->reg_base_addr);
2550                 }
2551         }
2552
2553         if (iommu_pass_through)
2554                 iommu_identity_mapping |= IDENTMAP_ALL;
2555
2556 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2557         iommu_identity_mapping |= IDENTMAP_GFX;
2558 #endif
2559
2560         check_tylersburg_isoch();
2561
2562         /*
2563          * If identity mapping (including hardware pass-through) has been
2564          * requested, set up the static 1:1 domain and its context entries
2565          * now; rmrr, gfx and isa regions get their own mappings below.
2566          */
2567         if (iommu_identity_mapping) {
2568                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2569                 if (ret) {
2570                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2571                         goto error;
2572                 }
2573         }
2574         /*
2575          * For each rmrr
2576          *   for each dev attached to rmrr
2577          *   do
2578          *     locate drhd for dev, alloc domain for dev
2579          *     allocate free domain
2580          *     allocate page table entries for rmrr
2581          *     if context not allocated for bus
2582          *           allocate and init context
2583          *           set present in root table for this bus
2584          *     init context with domain, translation etc
2585          *    endfor
2586          * endfor
2587          */
2588         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2589         for_each_rmrr_units(rmrr) {
2590                 for (i = 0; i < rmrr->devices_cnt; i++) {
2591                         pdev = rmrr->devices[i];
2592                         /*
2593                          * some BIOSes list non-existent devices in the
2594                          * DMAR table.
2595                          */
2596                         if (!pdev)
2597                                 continue;
2598                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2599                         if (ret)
2600                                 printk(KERN_ERR
2601                                        "IOMMU: mapping reserved region failed\n");
2602                 }
2603         }
2604
2605         iommu_prepare_isa();
2606
2607         /*
2608          * for each drhd
2609          *   enable fault log
2610          *   global invalidate context cache
2611          *   global invalidate iotlb
2612          *   enable translation
2613          */
2614         for_each_drhd_unit(drhd) {
2615                 if (drhd->ignored) {
2616                         /*
2617                          * we always have to disable PMRs or DMA may fail on
2618                          * this device
2619                          */
2620                         if (force_on)
2621                                 iommu_disable_protect_mem_regions(drhd->iommu);
2622                         continue;
2623                 }
2624                 iommu = drhd->iommu;
2625
2626                 iommu_flush_write_buffer(iommu);
2627
2628                 ret = dmar_set_interrupt(iommu);
2629                 if (ret)
2630                         goto error;
2631
2632                 iommu_set_root_entry(iommu);
2633
2634                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2635                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2636
2637                 ret = iommu_enable_translation(iommu);
2638                 if (ret)
2639                         goto error;
2640
2641                 iommu_disable_protect_mem_regions(iommu);
2642         }
2643
2644         return 0;
2645 error:
2646         for_each_drhd_unit(drhd) {
2647                 if (drhd->ignored)
2648                         continue;
2649                 iommu = drhd->iommu;
2650                 free_iommu(iommu);
2651         }
2652         kfree(g_iommus);
2653         return ret;
2654 }
2655
2656 /* This takes a number of _MM_ pages, not VTD pages */
2657 static struct iova *intel_alloc_iova(struct device *dev,
2658                                      struct dmar_domain *domain,
2659                                      unsigned long nrpages, uint64_t dma_mask)
2660 {
2661         struct pci_dev *pdev = to_pci_dev(dev);
2662         struct iova *iova = NULL;
2663
2664         /* Restrict dma_mask to the width that the iommu can handle */
2665         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2666
2667         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2668                 /*
2669                  * First try to allocate an io virtual address in
2670                  * DMA_BIT_MASK(32) and if that fails then try allocating
2671                  * from higher range
2672                  */
2673                 iova = alloc_iova(&domain->iovad, nrpages,
2674                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2675                 if (iova)
2676                         return iova;
2677         }
2678         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2679         if (unlikely(!iova)) {
2680                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2681                        nrpages, pci_name(pdev));
2682                 return NULL;
2683         }
2684
2685         return iova;
2686 }
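
/*
 * Usage sketch (mirrors __intel_map_single() below, 4KiB pages assumed):
 *
 *	size = aligned_nrpages(paddr, size);
 *	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
 *	if (!iova)
 *		goto error;
 *
 * A device advertising DMA_BIT_MASK(64) is first offered an iova below
 * 4GiB and only falls back to the full mask if that range is exhausted,
 * unless dmar_forcedac is set.
 */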
2687
2688 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2689 {
2690         struct dmar_domain *domain;
2691         int ret;
2692
2693         domain = get_domain_for_dev(pdev,
2694                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2695         if (!domain) {
2696                 printk(KERN_ERR
2697                         "Allocating domain for %s failed\n", pci_name(pdev));
2698                 return NULL;
2699         }
2700
2701         /* make sure context mapping is ok */
2702         if (unlikely(!domain_context_mapped(pdev))) {
2703                 ret = domain_context_mapping(domain, pdev,
2704                                              CONTEXT_TT_MULTI_LEVEL);
2705                 if (ret) {
2706                         printk(KERN_ERR
2707                                 "Domain context map for %s failed\n",
2708                                 pci_name(pdev));
2709                         return NULL;
2710                 }
2711         }
2712
2713         return domain;
2714 }
2715
2716 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2717 {
2718         struct device_domain_info *info;
2719
2720         /* No lock here, assumes no domain exit in normal case */
2721         info = dev->dev.archdata.iommu;
2722         if (likely(info))
2723                 return info->domain;
2724
2725         return __get_valid_domain_for_dev(dev);
2726 }
2727
2728 static int iommu_dummy(struct pci_dev *pdev)
2729 {
2730         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2731 }
2732
2733 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2734 static int iommu_no_mapping(struct device *dev)
2735 {
2736         struct pci_dev *pdev;
2737         int found;
2738
2739         if (unlikely(dev->bus != &pci_bus_type))
2740                 return 1;
2741
2742         pdev = to_pci_dev(dev);
2743         if (iommu_dummy(pdev))
2744                 return 1;
2745
2746         if (!iommu_identity_mapping)
2747                 return 0;
2748
2749         found = identity_mapping(pdev);
2750         if (found) {
2751                 if (iommu_should_identity_map(pdev, 0))
2752                         return 1;
2753                 else {
2754                         /*
2755                          * A 32 bit DMA device is removed from si_domain and falls
2756                          * back to non-identity mapping.
2757                          */
2758                         domain_remove_one_dev_info(si_domain, pdev);
2759                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2760                                pci_name(pdev));
2761                         return 0;
2762                 }
2763         } else {
2764                 /*
2765                          * When a 64 bit DMA capable device is detached from a vm,
2766                          * it is put back into si_domain for identity mapping.
2767                  */
2768                 if (iommu_should_identity_map(pdev, 0)) {
2769                         int ret;
2770                         ret = domain_add_dev_info(si_domain, pdev,
2771                                                   hw_pass_through ?
2772                                                   CONTEXT_TT_PASS_THROUGH :
2773                                                   CONTEXT_TT_MULTI_LEVEL);
2774                         if (!ret) {
2775                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2776                                        pci_name(pdev));
2777                                 return 1;
2778                         }
2779                 }
2780         }
2781
2782         return 0;
2783 }
2784
2785 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2786                                      size_t size, int dir, u64 dma_mask)
2787 {
2788         struct pci_dev *pdev = to_pci_dev(hwdev);
2789         struct dmar_domain *domain;
2790         phys_addr_t start_paddr;
2791         struct iova *iova;
2792         int prot = 0;
2793         int ret;
2794         struct intel_iommu *iommu;
2795         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2796
2797         BUG_ON(dir == DMA_NONE);
2798
2799         if (iommu_no_mapping(hwdev))
2800                 return paddr;
2801
2802         domain = get_valid_domain_for_dev(pdev);
2803         if (!domain)
2804                 return 0;
2805
2806         iommu = domain_get_iommu(domain);
2807         size = aligned_nrpages(paddr, size);
2808
2809         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2810         if (!iova)
2811                 goto error;
2812
2813         /*
2814          * Check if DMAR supports zero-length reads on write only
2815          * mappings.
2816          */
2817         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2818                         !cap_zlr(iommu->cap))
2819                 prot |= DMA_PTE_READ;
2820         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2821                 prot |= DMA_PTE_WRITE;
2822         /*
2823          * paddr through (paddr + size) might span a partial page, so we map the
2824          * whole page.  Note: if two parts of one page are mapped separately, we
2825          * might have two guest addresses mapping to the same host paddr, but this
2826          * is not a big problem
2827          */
2828         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2829                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2830         if (ret)
2831                 goto error;
2832
2833         /* it's a non-present to present mapping. Only flush if caching mode */
2834         if (cap_caching_mode(iommu->cap))
2835                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2836         else
2837                 iommu_flush_write_buffer(iommu);
2838
2839         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2840         start_paddr += paddr & ~PAGE_MASK;
2841         return start_paddr;
2842
2843 error:
2844         if (iova)
2845                 __free_iova(&domain->iovad, iova);
2846         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2847                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2848         return 0;
2849 }
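
/*
 * Worked example (4KiB pages assumed): mapping paddr = 0x12345678 with
 * size = 0x100 gives aligned_nrpages() = 1, so a single-page iova is
 * allocated and the returned handle is (iova->pfn_lo << PAGE_SHIFT) plus
 * the preserved in-page offset 0x678.
 */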
2850
2851 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2852                                  unsigned long offset, size_t size,
2853                                  enum dma_data_direction dir,
2854                                  struct dma_attrs *attrs)
2855 {
2856         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2857                                   dir, to_pci_dev(dev)->dma_mask);
2858 }
2859
2860 static void flush_unmaps(void)
2861 {
2862         int i, j;
2863
2864         timer_on = 0;
2865
2866         /* just flush them all */
2867         for (i = 0; i < g_num_of_iommus; i++) {
2868                 struct intel_iommu *iommu = g_iommus[i];
2869                 if (!iommu)
2870                         continue;
2871
2872                 if (!deferred_flush[i].next)
2873                         continue;
2874
2875                 /* In caching mode, global flushes make emulation expensive */
2876                 if (!cap_caching_mode(iommu->cap))
2877                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2878                                          DMA_TLB_GLOBAL_FLUSH);
2879                 for (j = 0; j < deferred_flush[i].next; j++) {
2880                         unsigned long mask;
2881                         struct iova *iova = deferred_flush[i].iova[j];
2882                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2883
2884                         /* On real hardware multiple invalidations are expensive */
2885                         if (cap_caching_mode(iommu->cap))
2886                                 iommu_flush_iotlb_psi(iommu, domain->id,
2887                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2888                         else {
2889                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2890                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2891                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2892                         }
2893                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2894                 }
2895                 deferred_flush[i].next = 0;
2896         }
2897
2898         list_size = 0;
2899 }
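
/*
 * Example mask computation (4KiB pages assumed): an iova spanning four MM
 * pages gives mm_to_dma_pfn(4) = 4, so mask = ilog2(4) = 2 and the
 * device-IOTLB invalidation above covers a four-page aligned block
 * starting at iova->pfn_lo << PAGE_SHIFT.
 */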
2900
2901 static void flush_unmaps_timeout(unsigned long data)
2902 {
2903         unsigned long flags;
2904
2905         spin_lock_irqsave(&async_umap_flush_lock, flags);
2906         flush_unmaps();
2907         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2908 }
2909
2910 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2911 {
2912         unsigned long flags;
2913         int next, iommu_id;
2914         struct intel_iommu *iommu;
2915
2916         spin_lock_irqsave(&async_umap_flush_lock, flags);
2917         if (list_size == HIGH_WATER_MARK)
2918                 flush_unmaps();
2919
2920         iommu = domain_get_iommu(dom);
2921         iommu_id = iommu->seq_id;
2922
2923         next = deferred_flush[iommu_id].next;
2924         deferred_flush[iommu_id].domain[next] = dom;
2925         deferred_flush[iommu_id].iova[next] = iova;
2926         deferred_flush[iommu_id].next++;
2927
2928         if (!timer_on) {
2929                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2930                 timer_on = 1;
2931         }
2932         list_size++;
2933         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2934 }
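
/*
 * Example of the batching effect (illustrative): a burst of unmaps from a
 * network driver is coalesced into at most one IOTLB flush per iommu every
 * 10ms (or sooner, once HIGH_WATER_MARK deferred entries accumulate),
 * rather than one flush per unmapped buffer.
 */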
2935
2936 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2937                              size_t size, enum dma_data_direction dir,
2938                              struct dma_attrs *attrs)
2939 {
2940         struct pci_dev *pdev = to_pci_dev(dev);
2941         struct dmar_domain *domain;
2942         unsigned long start_pfn, last_pfn;
2943         struct iova *iova;
2944         struct intel_iommu *iommu;
2945
2946         if (iommu_no_mapping(dev))
2947                 return;
2948
2949         domain = find_domain(pdev);
2950         BUG_ON(!domain);
2951
2952         iommu = domain_get_iommu(domain);
2953
2954         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2955         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2956                       (unsigned long long)dev_addr))
2957                 return;
2958
2959         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2960         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2961
2962         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2963                  pci_name(pdev), start_pfn, last_pfn);
2964
2965         /*  clear the whole page */
2966         dma_pte_clear_range(domain, start_pfn, last_pfn);
2967
2968         /* free page tables */
2969         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2970
2971         if (intel_iommu_strict) {
2972                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2973                                       last_pfn - start_pfn + 1, 0);
2974                 /* free iova */
2975                 __free_iova(&domain->iovad, iova);
2976         } else {
2977                 add_unmap(domain, iova);
2978                 /*
2979                  * Queue up the release of the unmap; batching saves roughly
2980                  * 1/6th of the CPU used by per-unmap IOTLB flushes.
2981                  */
2982         }
2983 }
2984
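/*
 * intel_alloc_coherent - dma_map_ops .alloc_coherent callback.
 *
 * Allocates zeroed pages and maps them bidirectionally through
 * __intel_map_single().  GFP_DMA/GFP_DMA32 are honoured only for
 * devices that bypass the IOMMU; translated devices can reach any
 * physical address, so the zone restriction is dropped for them.
 */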
2985 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2986                                   dma_addr_t *dma_handle, gfp_t flags)
2987 {
2988         void *vaddr;
2989         int order;
2990
2991         size = PAGE_ALIGN(size);
2992         order = get_order(size);
2993
2994         if (!iommu_no_mapping(hwdev))
2995                 flags &= ~(GFP_DMA | GFP_DMA32);
2996         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2997                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2998                         flags |= GFP_DMA;
2999                 else
3000                         flags |= GFP_DMA32;
3001         }
3002
3003         vaddr = (void *)__get_free_pages(flags, order);
3004         if (!vaddr)
3005                 return NULL;
3006         memset(vaddr, 0, size);
3007
3008         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3009                                          DMA_BIDIRECTIONAL,
3010                                          hwdev->coherent_dma_mask);
3011         if (*dma_handle)
3012                 return vaddr;
3013         free_pages((unsigned long)vaddr, order);
3014         return NULL;
3015 }
3016
3017 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3018                                 dma_addr_t dma_handle)
3019 {
3020         int order;
3021
3022         size = PAGE_ALIGN(size);
3023         order = get_order(size);
3024
3025         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3026         free_pages((unsigned long)vaddr, order);
3027 }
3028
3029 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3030                            int nelems, enum dma_data_direction dir,
3031                            struct dma_attrs *attrs)
3032 {
3033         struct pci_dev *pdev = to_pci_dev(hwdev);
3034         struct dmar_domain *domain;
3035         unsigned long start_pfn, last_pfn;
3036         struct iova *iova;
3037         struct intel_iommu *iommu;
3038
3039         if (iommu_no_mapping(hwdev))
3040                 return;
3041
3042         domain = find_domain(pdev);
3043         BUG_ON(!domain);
3044
3045         iommu = domain_get_iommu(domain);
3046
3047         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3048         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3049                       (unsigned long long)sglist[0].dma_address))
3050                 return;
3051
3052         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3053         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3054
3055         /* clear the whole IOVA range */
3056         dma_pte_clear_range(domain, start_pfn, last_pfn);
3057
3058         /* free page tables */
3059         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3060
3061         if (intel_iommu_strict) {
3062                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3063                                       last_pfn - start_pfn + 1, 0);
3064                 /* free iova */
3065                 __free_iova(&domain->iovad, iova);
3066         } else {
3067                 add_unmap(domain, iova);
3068                 /*
3069                  * Queue up the release of the unmap; batching saves roughly
3070                  * 1/6th of the CPU used by per-unmap IOTLB flushes.
3071                  */
3072         }
3073 }
3074
3075 static int intel_nontranslate_map_sg(struct device *hddev,
3076         struct scatterlist *sglist, int nelems, int dir)
3077 {
3078         int i;
3079         struct scatterlist *sg;
3080
3081         for_each_sg(sglist, sg, nelems, i) {
3082                 BUG_ON(!sg_page(sg));
3083                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3084                 sg->dma_length = sg->length;
3085         }
3086         return nelems;
3087 }
3088
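/*
 * intel_map_sg - dma_map_ops .map_sg callback.
 *
 * Allocates a single IOVA range large enough for the whole scatterlist,
 * maps it with domain_sg_mapping(), and flushes the IOTLB only when the
 * IOMMU is in caching mode (otherwise a write-buffer flush suffices for
 * a non-present to present mapping).  Returns nelems on success, 0 on
 * failure.
 */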
3089 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3090                         enum dma_data_direction dir, struct dma_attrs *attrs)
3091 {
3092         int i;
3093         struct pci_dev *pdev = to_pci_dev(hwdev);
3094         struct dmar_domain *domain;
3095         size_t size = 0;
3096         int prot = 0;
3097         struct iova *iova = NULL;
3098         int ret;
3099         struct scatterlist *sg;
3100         unsigned long start_vpfn;
3101         struct intel_iommu *iommu;
3102
3103         BUG_ON(dir == DMA_NONE);
3104         if (iommu_no_mapping(hwdev))
3105                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3106
3107         domain = get_valid_domain_for_dev(pdev);
3108         if (!domain)
3109                 return 0;
3110
3111         iommu = domain_get_iommu(domain);
3112
3113         for_each_sg(sglist, sg, nelems, i)
3114                 size += aligned_nrpages(sg->offset, sg->length);
3115
3116         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3117                                 pdev->dma_mask);
3118         if (!iova) {
3119                 sglist->dma_length = 0;
3120                 return 0;
3121         }
3122
3123         /*
3124          * Check if DMAR supports zero-length reads on write-only
3125          * mappings.
3126          */
3127         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3128                         !cap_zlr(iommu->cap))
3129                 prot |= DMA_PTE_READ;
3130         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3131                 prot |= DMA_PTE_WRITE;
3132
3133         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3134
3135         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3136         if (unlikely(ret)) {
3137                 /* clear whatever was mapped */
3138                 dma_pte_clear_range(domain, start_vpfn,
3139                                     start_vpfn + size - 1);
3140                 /* free page tables */
3141                 dma_pte_free_pagetable(domain, start_vpfn,
3142                                        start_vpfn + size - 1);
3143                 /* free iova */
3144                 __free_iova(&domain->iovad, iova);
3145                 return 0;
3146         }
3147
3148         /* it's a non-present to present mapping. Only flush if caching mode */
3149         if (cap_caching_mode(iommu->cap))
3150                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3151         else
3152                 iommu_flush_write_buffer(iommu);
3153
3154         return nelems;
3155 }
3156
3157 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3158 {
3159         return !dma_addr;
3160 }
3161
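/*
 * The dma_map_ops used for devices behind VT-d.  intel_iommu_init()
 * installs this table as the global dma_ops, so generic DMA API calls
 * land here.  Illustrative driver-side call path (not code from this
 * file):
 *
 *      dma_addr_t dma = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
 *              -> intel_map_page()
 *      dma_unmap_single(&pdev->dev, dma, len, DMA_TO_DEVICE);
 *              -> intel_unmap_page()
 */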
3162 struct dma_map_ops intel_dma_ops = {
3163         .alloc_coherent = intel_alloc_coherent,
3164         .free_coherent = intel_free_coherent,
3165         .map_sg = intel_map_sg,
3166         .unmap_sg = intel_unmap_sg,
3167         .map_page = intel_map_page,
3168         .unmap_page = intel_unmap_page,
3169         .mapping_error = intel_mapping_error,
3170 };
3171
3172 static inline int iommu_domain_cache_init(void)
3173 {
3174         int ret = 0;
3175
3176         iommu_domain_cache = kmem_cache_create("iommu_domain",
3177                                          sizeof(struct dmar_domain),
3178                                          0,
3179                                          SLAB_HWCACHE_ALIGN,
3181                                          NULL);
3182         if (!iommu_domain_cache) {
3183                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3184                 ret = -ENOMEM;
3185         }
3186
3187         return ret;
3188 }
3189
3190 static inline int iommu_devinfo_cache_init(void)
3191 {
3192         int ret = 0;
3193
3194         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3195                                          sizeof(struct device_domain_info),
3196                                          0,
3197                                          SLAB_HWCACHE_ALIGN,
3198                                          NULL);
3199         if (!iommu_devinfo_cache) {
3200                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3201                 ret = -ENOMEM;
3202         }
3203
3204         return ret;
3205 }
3206
3207 static inline int iommu_iova_cache_init(void)
3208 {
3209         int ret = 0;
3210
3211         iommu_iova_cache = kmem_cache_create("iommu_iova",
3212                                          sizeof(struct iova),
3213                                          0,
3214                                          SLAB_HWCACHE_ALIGN,
3215                                          NULL);
3216         if (!iommu_iova_cache) {
3217                 printk(KERN_ERR "Couldn't create iova cache\n");
3218                 ret = -ENOMEM;
3219         }
3220
3221         return ret;
3222 }
3223
3224 static int __init iommu_init_mempool(void)
3225 {
3226         int ret;
3227         ret = iommu_iova_cache_init();
3228         if (ret)
3229                 return ret;
3230
3231         ret = iommu_domain_cache_init();
3232         if (ret)
3233                 goto domain_error;
3234
3235         ret = iommu_devinfo_cache_init();
3236         if (!ret)
3237                 return ret;
3238
3239         kmem_cache_destroy(iommu_domain_cache);
3240 domain_error:
3241         kmem_cache_destroy(iommu_iova_cache);
3242
3243         return -ENOMEM;
3244 }
3245
3246 static void __init iommu_exit_mempool(void)
3247 {
3248         kmem_cache_destroy(iommu_devinfo_cache);
3249         kmem_cache_destroy(iommu_domain_cache);
3250         kmem_cache_destroy(iommu_iova_cache);
3251
3252 }
3253
3254 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3255 {
3256         struct dmar_drhd_unit *drhd;
3257         u32 vtbar;
3258         int rc;
3259
3260         /* We know that this device on this chipset has its own IOMMU.
3261          * If we find it under a different IOMMU, then the BIOS is lying
3262          * to us. Hope that the IOMMU for this device is actually
3263          * disabled, and it needs no translation...
3264          */
3265         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3266         if (rc) {
3267                 /* "can't" happen */
3268                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3269                 return;
3270         }
3271         vtbar &= 0xffff0000;
3272
3273         /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3274         drhd = dmar_find_matched_drhd_unit(pdev);
3275         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3276                             TAINT_FIRMWARE_WORKAROUND,
3277                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3278                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3279 }
3280 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3281
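/*
 * init_no_remapping_devices - mark DRHD units that need no remapping.
 *
 * A DMAR unit with no PCI devices behind it is ignored outright.  A
 * unit covering only graphics devices is either kept (setting
 * intel_iommu_gfx_mapped) or, when dmar_map_gfx is clear, ignored and
 * its devices flagged with DUMMY_DEVICE_DOMAIN_INFO so they bypass
 * translation.
 */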
3282 static void __init init_no_remapping_devices(void)
3283 {
3284         struct dmar_drhd_unit *drhd;
3285
3286         for_each_drhd_unit(drhd) {
3287                 if (!drhd->include_all) {
3288                         int i;
3289                         for (i = 0; i < drhd->devices_cnt; i++)
3290                                 if (drhd->devices[i] != NULL)
3291                                         break;
3292                         /* ignore DMAR unit if no pci devices exist */
3293                         if (i == drhd->devices_cnt)
3294                                 drhd->ignored = 1;
3295                 }
3296         }
3297
3298         for_each_drhd_unit(drhd) {
3299                 int i;
3300                 if (drhd->ignored || drhd->include_all)
3301                         continue;
3302
3303                 for (i = 0; i < drhd->devices_cnt; i++)
3304                         if (drhd->devices[i] &&
3305                             !IS_GFX_DEVICE(drhd->devices[i]))
3306                                 break;
3307
3308                 if (i < drhd->devices_cnt)
3309                         continue;
3310
3311                 /* This IOMMU has *only* gfx devices. Either bypass it or
3312                    set the gfx_mapped flag, as appropriate */
3313                 if (dmar_map_gfx) {
3314                         intel_iommu_gfx_mapped = 1;
3315                 } else {
3316                         drhd->ignored = 1;
3317                         for (i = 0; i < drhd->devices_cnt; i++) {
3318                                 if (!drhd->devices[i])
3319                                         continue;
3320                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3321                         }
3322                 }
3323         }
3324 }
3325
3326 #ifdef CONFIG_SUSPEND
3327 static int init_iommu_hw(void)
3328 {
3329         struct dmar_drhd_unit *drhd;
3330         struct intel_iommu *iommu = NULL;
3331
3332         for_each_active_iommu(iommu, drhd)
3333                 if (iommu->qi)
3334                         dmar_reenable_qi(iommu);
3335
3336         for_each_iommu(iommu, drhd) {
3337                 if (drhd->ignored) {
3338                         /*
3339                          * we always have to disable PMRs or DMA may fail on
3340                          * this device
3341                          */
3342                         if (force_on)
3343                                 iommu_disable_protect_mem_regions(iommu);
3344                         continue;
3345                 }
3346
3347                 iommu_flush_write_buffer(iommu);
3348
3349                 iommu_set_root_entry(iommu);
3350
3351                 iommu->flush.flush_context(iommu, 0, 0, 0,
3352                                            DMA_CCMD_GLOBAL_INVL);
3353                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3354                                          DMA_TLB_GLOBAL_FLUSH);
3355                 if (iommu_enable_translation(iommu))
3356                         return 1;
3357                 iommu_disable_protect_mem_regions(iommu);
3358         }
3359
3360         return 0;
3361 }
3362
3363 static void iommu_flush_all(void)
3364 {
3365         struct dmar_drhd_unit *drhd;
3366         struct intel_iommu *iommu;
3367
3368         for_each_active_iommu(iommu, drhd) {
3369                 iommu->flush.flush_context(iommu, 0, 0, 0,
3370                                            DMA_CCMD_GLOBAL_INVL);
3371                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3372                                          DMA_TLB_GLOBAL_FLUSH);
3373         }
3374 }
3375
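/*
 * iommu_suspend - syscore suspend hook.
 *
 * Flushes all context/IOTLB caches, disables translation and saves the
 * fault-event control, data and address registers so that iommu_resume()
 * can restore them after init_iommu_hw() re-enables each unit.
 */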
3376 static int iommu_suspend(void)
3377 {
3378         struct dmar_drhd_unit *drhd;
3379         struct intel_iommu *iommu = NULL;
3380         unsigned long flag;
3381
3382         for_each_active_iommu(iommu, drhd) {
3383                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3384                                                  GFP_ATOMIC);
3385                 if (!iommu->iommu_state)
3386                         goto nomem;
3387         }
3388
3389         iommu_flush_all();
3390
3391         for_each_active_iommu(iommu, drhd) {
3392                 iommu_disable_translation(iommu);
3393
3394                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3395
3396                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3397                         readl(iommu->reg + DMAR_FECTL_REG);
3398                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3399                         readl(iommu->reg + DMAR_FEDATA_REG);
3400                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3401                         readl(iommu->reg + DMAR_FEADDR_REG);
3402                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3403                         readl(iommu->reg + DMAR_FEUADDR_REG);
3404
3405                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3406         }
3407         return 0;
3408
3409 nomem:
3410         for_each_active_iommu(iommu, drhd)
3411                 kfree(iommu->iommu_state);
3412
3413         return -ENOMEM;
3414 }
3415
3416 static void iommu_resume(void)
3417 {
3418         struct dmar_drhd_unit *drhd;
3419         struct intel_iommu *iommu = NULL;
3420         unsigned long flag;
3421
3422         if (init_iommu_hw()) {
3423                 if (force_on)
3424                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3425                 else
3426                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3427                 return;
3428         }
3429
3430         for_each_active_iommu(iommu, drhd) {
3431
3432                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3433
3434                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3435                         iommu->reg + DMAR_FECTL_REG);
3436                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3437                         iommu->reg + DMAR_FEDATA_REG);
3438                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3439                         iommu->reg + DMAR_FEADDR_REG);
3440                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3441                         iommu->reg + DMAR_FEUADDR_REG);
3442
3443                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3444         }
3445
3446         for_each_active_iommu(iommu, drhd)
3447                 kfree(iommu->iommu_state);
3448 }
3449
3450 static struct syscore_ops iommu_syscore_ops = {
3451         .resume         = iommu_resume,
3452         .suspend        = iommu_suspend,
3453 };
3454
3455 static void __init init_iommu_pm_ops(void)
3456 {
3457         register_syscore_ops(&iommu_syscore_ops);
3458 }
3459
3460 #else
3461 static inline void init_iommu_pm_ops(void) {}
3462 #endif  /* CONFIG_SUSPEND */
3463
3464 LIST_HEAD(dmar_rmrr_units);
3465
3466 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3467 {
3468         list_add(&rmrr->list, &dmar_rmrr_units);
3469 }
3470
3471
3472 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3473 {
3474         struct acpi_dmar_reserved_memory *rmrr;
3475         struct dmar_rmrr_unit *rmrru;
3476
3477         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3478         if (!rmrru)
3479                 return -ENOMEM;
3480
3481         rmrru->hdr = header;
3482         rmrr = (struct acpi_dmar_reserved_memory *)header;
3483         rmrru->base_address = rmrr->base_address;
3484         rmrru->end_address = rmrr->end_address;
3485
3486         dmar_register_rmrr_unit(rmrru);
3487         return 0;
3488 }
3489
3490 static int __init
3491 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3492 {
3493         struct acpi_dmar_reserved_memory *rmrr;
3494         int ret;
3495
3496         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3497         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3498                 ((void *)rmrr) + rmrr->header.length,
3499                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3500
3501         if (ret || (rmrru->devices_cnt == 0)) {
3502                 list_del(&rmrru->list);
3503                 kfree(rmrru);
3504         }
3505         return ret;
3506 }
3507
3508 static LIST_HEAD(dmar_atsr_units);
3509
3510 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3511 {
3512         struct acpi_dmar_atsr *atsr;
3513         struct dmar_atsr_unit *atsru;
3514
3515         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3516         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3517         if (!atsru)
3518                 return -ENOMEM;
3519
3520         atsru->hdr = hdr;
3521         atsru->include_all = atsr->flags & 0x1;
3522
3523         list_add(&atsru->list, &dmar_atsr_units);
3524
3525         return 0;
3526 }
3527
3528 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3529 {
3530         int rc;
3531         struct acpi_dmar_atsr *atsr;
3532
3533         if (atsru->include_all)
3534                 return 0;
3535
3536         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3537         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3538                                 (void *)atsr + atsr->header.length,
3539                                 &atsru->devices_cnt, &atsru->devices,
3540                                 atsr->segment);
3541         if (rc || !atsru->devices_cnt) {
3542                 list_del(&atsru->list);
3543                 kfree(atsru);
3544         }
3545
3546         return rc;
3547 }
3548
3549 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3550 {
3551         int i;
3552         struct pci_bus *bus;
3553         struct acpi_dmar_atsr *atsr;
3554         struct dmar_atsr_unit *atsru;
3555
3556         dev = pci_physfn(dev);
3557
3558         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3559                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3560                 if (atsr->segment == pci_domain_nr(dev->bus))
3561                         goto found;
3562         }
3563
3564         return 0;
3565
3566 found:
3567         for (bus = dev->bus; bus; bus = bus->parent) {
3568                 struct pci_dev *bridge = bus->self;
3569
3570                 if (!bridge || !pci_is_pcie(bridge) ||
3571                     bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3572                         return 0;
3573
3574                 if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3575                         for (i = 0; i < atsru->devices_cnt; i++)
3576                                 if (atsru->devices[i] == bridge)
3577                                         return 1;
3578                         break;
3579                 }
3580         }
3581
3582         if (atsru->include_all)
3583                 return 1;
3584
3585         return 0;
3586 }
3587
3588 int __init dmar_parse_rmrr_atsr_dev(void)
3589 {
3590         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3591         struct dmar_atsr_unit *atsr, *atsr_n;
3592         int ret = 0;
3593
3594         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3595                 ret = rmrr_parse_dev(rmrr);
3596                 if (ret)
3597                         return ret;
3598         }
3599
3600         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3601                 ret = atsr_parse_dev(atsr);
3602                 if (ret)
3603                         return ret;
3604         }
3605
3606         return ret;
3607 }
3608
3609 /*
3610  * Here we only respond to a device being unbound from its driver.
3611  *
3612  * A newly added device is not attached to its DMAR domain here yet; that
3613  * happens when the device is first mapped to an iova.
3614  */
3615 static int device_notifier(struct notifier_block *nb,
3616                                   unsigned long action, void *data)
3617 {
3618         struct device *dev = data;
3619         struct pci_dev *pdev = to_pci_dev(dev);
3620         struct dmar_domain *domain;
3621
3622         if (iommu_no_mapping(dev))
3623                 return 0;
3624
3625         domain = find_domain(pdev);
3626         if (!domain)
3627                 return 0;
3628
3629         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3630                 domain_remove_one_dev_info(domain, pdev);
3631
3632                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3633                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3634                     list_empty(&domain->devices))
3635                         domain_exit(domain);
3636         }
3637
3638         return 0;
3639 }
3640
3641 static struct notifier_block device_nb = {
3642         .notifier_call = device_notifier,
3643 };
3644
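/*
 * intel_iommu_init - main VT-d setup.
 *
 * Parses the DMAR table, disables any translation left enabled by the
 * firmware, sets up the slab caches and reserved IOVA ranges, runs
 * init_dmars(), and finally installs intel_dma_ops, the PM hooks, the
 * IOMMU API ops and the bus notifier.  Under a tboot launch (force_on)
 * any failure is fatal.
 */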
3645 int __init intel_iommu_init(void)
3646 {
3647         int ret = 0;
3648         struct dmar_drhd_unit *drhd;
3649
3650         /* VT-d is required for a TXT/tboot launch, so enforce that */
3651         force_on = tboot_force_iommu();
3652
3653         if (dmar_table_init()) {
3654                 if (force_on)
3655                         panic("tboot: Failed to initialize DMAR table\n");
3656                 return  -ENODEV;
3657         }
3658
3659         /*
3660          * Disable translation if already enabled prior to OS handover.
3661          */
3662         for_each_drhd_unit(drhd) {
3663                 struct intel_iommu *iommu;
3664
3665                 if (drhd->ignored)
3666                         continue;
3667
3668                 iommu = drhd->iommu;
3669                 if (iommu->gcmd & DMA_GCMD_TE)
3670                         iommu_disable_translation(iommu);
3671         }
3672
3673         if (dmar_dev_scope_init() < 0) {
3674                 if (force_on)
3675                         panic("tboot: Failed to initialize DMAR device scope\n");
3676                 return  -ENODEV;
3677         }
3678
3679         if (no_iommu || dmar_disabled)
3680                 return -ENODEV;
3681
3682         if (iommu_init_mempool()) {
3683                 if (force_on)
3684                         panic("tboot: Failed to initialize iommu memory\n");
3685                 return  -ENODEV;
3686         }
3687
3688         if (list_empty(&dmar_rmrr_units))
3689                 printk(KERN_INFO "DMAR: No RMRR found\n");
3690
3691         if (list_empty(&dmar_atsr_units))
3692                 printk(KERN_INFO "DMAR: No ATSR found\n");
3693
3694         if (dmar_init_reserved_ranges()) {
3695                 if (force_on)
3696                         panic("tboot: Failed to reserve iommu ranges\n");
3697                 return  -ENODEV;
3698         }
3699
3700         init_no_remapping_devices();
3701
3702         ret = init_dmars();
3703         if (ret) {
3704                 if (force_on)
3705                         panic("tboot: Failed to initialize DMARs\n");
3706                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3707                 put_iova_domain(&reserved_iova_list);
3708                 iommu_exit_mempool();
3709                 return ret;
3710         }
3711         printk(KERN_INFO
3712         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3713
3714         init_timer(&unmap_timer);
3715 #ifdef CONFIG_SWIOTLB
3716         swiotlb = 0;
3717 #endif
3718         dma_ops = &intel_dma_ops;
3719
3720         init_iommu_pm_ops();
3721
3722         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3723
3724         bus_register_notifier(&pci_bus_type, &device_nb);
3725
3726         intel_iommu_enabled = 1;
3727
3728         return 0;
3729 }
3730
3731 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3732                                            struct pci_dev *pdev)
3733 {
3734         struct pci_dev *tmp, *parent;
3735
3736         if (!iommu || !pdev)
3737                 return;
3738
3739         /* dependent device detach */
3740         tmp = pci_find_upstream_pcie_bridge(pdev);
3741         /* Secondary interface's bus number and devfn 0 */
3742         if (tmp) {
3743                 parent = pdev->bus->self;
3744                 while (parent != tmp) {
3745                         iommu_detach_dev(iommu, parent->bus->number,
3746                                          parent->devfn);
3747                         parent = parent->bus->self;
3748                 }
3749                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3750                         iommu_detach_dev(iommu,
3751                                 tmp->subordinate->number, 0);
3752                 else /* this is a legacy PCI bridge */
3753                         iommu_detach_dev(iommu, tmp->bus->number,
3754                                          tmp->devfn);
3755         }
3756 }
3757
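/*
 * domain_remove_one_dev_info - detach one PCI device from a domain.
 *
 * Removes the device_domain_info, tears down the device's context entry
 * (and those of any upstream PCIe-to-PCI bridges), and, if no other
 * device on the same IOMMU remains in the domain, drops that IOMMU from
 * the domain's bitmap and releases the domain id on non-VM, non-SI
 * domains.
 */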
3758 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3759                                           struct pci_dev *pdev)
3760 {
3761         struct device_domain_info *info;
3762         struct intel_iommu *iommu;
3763         unsigned long flags;
3764         int found = 0;
3765         struct list_head *entry, *tmp;
3766
3767         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3768                                 pdev->devfn);
3769         if (!iommu)
3770                 return;
3771
3772         spin_lock_irqsave(&device_domain_lock, flags);
3773         list_for_each_safe(entry, tmp, &domain->devices) {
3774                 info = list_entry(entry, struct device_domain_info, link);
3775                 if (info->segment == pci_domain_nr(pdev->bus) &&
3776                     info->bus == pdev->bus->number &&
3777                     info->devfn == pdev->devfn) {
3778                         list_del(&info->link);
3779                         list_del(&info->global);
3780                         if (info->dev)
3781                                 info->dev->dev.archdata.iommu = NULL;
3782                         spin_unlock_irqrestore(&device_domain_lock, flags);
3783
3784                         iommu_disable_dev_iotlb(info);
3785                         iommu_detach_dev(iommu, info->bus, info->devfn);
3786                         iommu_detach_dependent_devices(iommu, pdev);
3787                         free_devinfo_mem(info);
3788
3789                         spin_lock_irqsave(&device_domain_lock, flags);
3790
3791                         if (found)
3792                                 break;
3793                         else
3794                                 continue;
3795                 }
3796
3797                 /* if there are no other devices under the same iommu
3798                  * owned by this domain, clear this iommu in iommu_bmp
3799                  * and update the iommu count and coherency
3800                  */
3801                 if (iommu == device_to_iommu(info->segment, info->bus,
3802                                             info->devfn))
3803                         found = 1;
3804         }
3805
3806         spin_unlock_irqrestore(&device_domain_lock, flags);
3807
3808         if (found == 0) {
3809                 unsigned long tmp_flags;
3810                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3811                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3812                 domain->iommu_count--;
3813                 domain_update_iommu_cap(domain);
3814                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3815
3816                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3817                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3818                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3819                         clear_bit(domain->id, iommu->domain_ids);
3820                         iommu->domains[domain->id] = NULL;
3821                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3822                 }
3823         }
3824 }
3825
3826 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3827 {
3828         struct device_domain_info *info;
3829         struct intel_iommu *iommu;
3830         unsigned long flags1, flags2;
3831
3832         spin_lock_irqsave(&device_domain_lock, flags1);
3833         while (!list_empty(&domain->devices)) {
3834                 info = list_entry(domain->devices.next,
3835                         struct device_domain_info, link);
3836                 list_del(&info->link);
3837                 list_del(&info->global);
3838                 if (info->dev)
3839                         info->dev->dev.archdata.iommu = NULL;
3840
3841                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3842
3843                 iommu_disable_dev_iotlb(info);
3844                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3845                 iommu_detach_dev(iommu, info->bus, info->devfn);
3846                 iommu_detach_dependent_devices(iommu, info->dev);
3847
3848                 /* clear this iommu in iommu_bmp, update iommu count
3849                  * and capabilities
3850                  */
3851                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3852                 if (test_and_clear_bit(iommu->seq_id,
3853                                        &domain->iommu_bmp)) {
3854                         domain->iommu_count--;
3855                         domain_update_iommu_cap(domain);
3856                 }
3857                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3858
3859                 free_devinfo_mem(info);
3860                 spin_lock_irqsave(&device_domain_lock, flags1);
3861         }
3862         spin_unlock_irqrestore(&device_domain_lock, flags1);
3863 }
3864
3865 /* domain id for virtual machines; it won't be set in a context entry */
3866 static unsigned long vm_domid;
3867
3868 static struct dmar_domain *iommu_alloc_vm_domain(void)
3869 {
3870         struct dmar_domain *domain;
3871
3872         domain = alloc_domain_mem();
3873         if (!domain)
3874                 return NULL;
3875
3876         domain->id = vm_domid++;
3877         domain->nid = -1;
3878         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3879         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3880
3881         return domain;
3882 }
3883
3884 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3885 {
3886         int adjust_width;
3887
3888         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3889         spin_lock_init(&domain->iommu_lock);
3890
3891         domain_reserve_special_ranges(domain);
3892
3893         /* calculate AGAW */
3894         domain->gaw = guest_width;
3895         adjust_width = guestwidth_to_adjustwidth(guest_width);
3896         domain->agaw = width_to_agaw(adjust_width);
3897
3898         INIT_LIST_HEAD(&domain->devices);
3899
3900         domain->iommu_count = 0;
3901         domain->iommu_coherency = 0;
3902         domain->iommu_snooping = 0;
3903         domain->iommu_superpage = 0;
3904         domain->max_addr = 0;
3905         domain->nid = -1;
3906
3907         /* always allocate the top pgd */
3908         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3909         if (!domain->pgd)
3910                 return -ENOMEM;
3911         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3912         return 0;
3913 }
3914
3915 static void iommu_free_vm_domain(struct dmar_domain *domain)
3916 {
3917         unsigned long flags;
3918         struct dmar_drhd_unit *drhd;
3919         struct intel_iommu *iommu;
3920         unsigned long i;
3921         unsigned long ndomains;
3922
3923         for_each_drhd_unit(drhd) {
3924                 if (drhd->ignored)
3925                         continue;
3926                 iommu = drhd->iommu;
3927
3928                 ndomains = cap_ndoms(iommu->cap);
3929                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3930                         if (iommu->domains[i] == domain) {
3931                                 spin_lock_irqsave(&iommu->lock, flags);
3932                                 clear_bit(i, iommu->domain_ids);
3933                                 iommu->domains[i] = NULL;
3934                                 spin_unlock_irqrestore(&iommu->lock, flags);
3935                                 break;
3936                         }
3937                 }
3938         }
3939 }
3940
3941 static void vm_domain_exit(struct dmar_domain *domain)
3942 {
3943         /* Domain 0 is reserved, so don't process it */
3944         if (!domain)
3945                 return;
3946
3947         vm_domain_remove_all_dev_info(domain);
3948         /* destroy iovas */
3949         put_iova_domain(&domain->iovad);
3950
3951         /* clear ptes */
3952         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3953
3954         /* free page tables */
3955         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3956
3957         iommu_free_vm_domain(domain);
3958         free_domain_mem(domain);
3959 }
3960
3961 static int intel_iommu_domain_init(struct iommu_domain *domain)
3962 {
3963         struct dmar_domain *dmar_domain;
3964
3965         dmar_domain = iommu_alloc_vm_domain();
3966         if (!dmar_domain) {
3967                 printk(KERN_ERR
3968                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3969                 return -ENOMEM;
3970         }
3971         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3972                 printk(KERN_ERR
3973                         "intel_iommu_domain_init() failed\n");
3974                 vm_domain_exit(dmar_domain);
3975                 return -ENOMEM;
3976         }
3977         domain_update_iommu_cap(dmar_domain);
3978         domain->priv = dmar_domain;
3979
3980         return 0;
3981 }
3982
3983 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3984 {
3985         struct dmar_domain *dmar_domain = domain->priv;
3986
3987         domain->priv = NULL;
3988         vm_domain_exit(dmar_domain);
3989 }
3990
3991 static int intel_iommu_attach_device(struct iommu_domain *domain,
3992                                      struct device *dev)
3993 {
3994         struct dmar_domain *dmar_domain = domain->priv;
3995         struct pci_dev *pdev = to_pci_dev(dev);
3996         struct intel_iommu *iommu;
3997         int addr_width;
3998
3999         if (device_is_rmrr_locked(pdev)) {
4000                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4001                 return -EPERM;
4002         }
4003
4004         /* normally pdev is not mapped */
4005         if (unlikely(domain_context_mapped(pdev))) {
4006                 struct dmar_domain *old_domain;
4007
4008                 old_domain = find_domain(pdev);
4009                 if (old_domain) {
4010                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4011                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4012                                 domain_remove_one_dev_info(old_domain, pdev);
4013                         else
4014                                 domain_remove_dev_info(old_domain);
4015                 }
4016         }
4017
4018         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4019                                 pdev->devfn);
4020         if (!iommu)
4021                 return -ENODEV;
4022
4023         /* check if this iommu agaw is sufficient for max mapped address */
4024         addr_width = agaw_to_width(iommu->agaw);
4025         if (addr_width > cap_mgaw(iommu->cap))
4026                 addr_width = cap_mgaw(iommu->cap);
4027
4028         if (dmar_domain->max_addr > (1LL << addr_width)) {
4029                 printk(KERN_ERR "%s: iommu width (%d) is not "
4030                        "sufficient for the mapped address (%llx)\n",
4031                        __func__, addr_width, dmar_domain->max_addr);
4032                 return -EFAULT;
4033         }
4034         dmar_domain->gaw = addr_width;
4035
4036         /*
4037          * Knock out extra levels of page tables if necessary
4038          */
4039         while (iommu->agaw < dmar_domain->agaw) {
4040                 struct dma_pte *pte;
4041
4042                 pte = dmar_domain->pgd;
4043                 if (dma_pte_present(pte)) {
4044                         dmar_domain->pgd = (struct dma_pte *)
4045                                 phys_to_virt(dma_pte_addr(pte));
4046                         free_pgtable_page(pte);
4047                 }
4048                 dmar_domain->agaw--;
4049         }
4050
4051         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4052 }
4053
4054 static void intel_iommu_detach_device(struct iommu_domain *domain,
4055                                       struct device *dev)
4056 {
4057         struct dmar_domain *dmar_domain = domain->priv;
4058         struct pci_dev *pdev = to_pci_dev(dev);
4059
4060         domain_remove_one_dev_info(dmar_domain, pdev);
4061 }
4062
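/*
 * intel_iommu_map - IOMMU API .map callback.
 *
 * Maps a (1 << gfp_order) page region at the given iova.  The requested
 * protection bits are translated to DMA_PTE_READ/WRITE (plus DMA_PTE_SNP
 * when the domain supports snoop control), max_addr is grown and checked
 * against the domain's address width, and the PTEs are written by
 * domain_pfn_mapping().
 */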
4063 static int intel_iommu_map(struct iommu_domain *domain,
4064                            unsigned long iova, phys_addr_t hpa,
4065                            int gfp_order, int iommu_prot)
4066 {
4067         struct dmar_domain *dmar_domain = domain->priv;
4068         u64 max_addr;
4069         int prot = 0;
4070         size_t size;
4071         int ret;
4072
4073         if (iommu_prot & IOMMU_READ)
4074                 prot |= DMA_PTE_READ;
4075         if (iommu_prot & IOMMU_WRITE)
4076                 prot |= DMA_PTE_WRITE;
4077         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4078                 prot |= DMA_PTE_SNP;
4079
4080         size     = PAGE_SIZE << gfp_order;
4081         max_addr = iova + size;
4082         if (dmar_domain->max_addr < max_addr) {
4083                 u64 end;
4084
4085                 /* check if minimum agaw is sufficient for mapped address */
4086                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4087                 if (end < max_addr) {
4088                         printk(KERN_ERR "%s: iommu width (%d) is not "
4089                                "sufficient for the mapped address (%llx)\n",
4090                                __func__, dmar_domain->gaw, max_addr);
4091                         return -EFAULT;
4092                 }
4093                 dmar_domain->max_addr = max_addr;
4094         }
4095         /* Round up size to next multiple of PAGE_SIZE, if it and
4096            the low bits of hpa would take us onto the next page */
4097         size = aligned_nrpages(hpa, size);
4098         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4099                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4100         return ret;
4101 }
4102
4103 static int intel_iommu_unmap(struct iommu_domain *domain,
4104                              unsigned long iova, int gfp_order)
4105 {
4106         struct dmar_domain *dmar_domain = domain->priv;
4107         size_t size = PAGE_SIZE << gfp_order;
4108         int order, iommu_id;
4109
4110         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4111                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4112
4113         if (dmar_domain->max_addr == iova + size)
4114                 dmar_domain->max_addr = iova;
4115
4116         for_each_set_bit(iommu_id, &dmar_domain->iommu_bmp, g_num_of_iommus) {
4117                 struct intel_iommu *iommu = g_iommus[iommu_id];
4118                 int num, ndomains;
4119
4120                 /*
4121                  * find bit position of dmar_domain
4122                  */
4123                 ndomains = cap_ndoms(iommu->cap);
4124                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4125                         if (iommu->domains[num] == dmar_domain)
4126                                 iommu_flush_iotlb_psi(iommu, num,
4127                                                       iova >> VTD_PAGE_SHIFT,
4128                                                       1 << order, 0);
4129                 }
4130         }
4131
4132         return order;
4133 }
4134
4135 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4136                                             unsigned long iova)
4137 {
4138         struct dmar_domain *dmar_domain = domain->priv;
4139         struct dma_pte *pte;
4140         u64 phys = 0;
4141
4142         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4143         if (pte)
4144                 phys = dma_pte_addr(pte);
4145
4146         return phys;
4147 }
4148
4149 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4150                                       unsigned long cap)
4151 {
4152         struct dmar_domain *dmar_domain = domain->priv;
4153
4154         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4155                 return dmar_domain->iommu_snooping;
4156         if (cap == IOMMU_CAP_INTR_REMAP)
4157                 return intr_remapping_enabled;
4158
4159         return 0;
4160 }
4161
4162 static struct iommu_ops intel_iommu_ops = {
4163         .domain_init    = intel_iommu_domain_init,
4164         .domain_destroy = intel_iommu_domain_destroy,
4165         .attach_dev     = intel_iommu_attach_device,
4166         .detach_dev     = intel_iommu_detach_device,
4167         .map            = intel_iommu_map,
4168         .unmap          = intel_iommu_unmap,
4169         .iova_to_phys   = intel_iommu_iova_to_phys,
4170         .domain_has_cap = intel_iommu_domain_has_cap,
4171 };
4172
4173 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4174 {
4175         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4176         printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4177         dmar_map_gfx = 0;
4178 }
4179
4180 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4181 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4182 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4183 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4184 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4185 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4186 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4187
4188 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4189 {
4190         /*
4191          * Mobile 4 Series Chipset neglects to set RWBF capability,
4192          * but needs it. Same seems to hold for the desktop versions.
4193          */
4194         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4195         rwbf_quirk = 1;
4196 }
4197
4198 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4199 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4200 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4201 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4202 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4203 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4204 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4205
4206 #define GGC 0x52
4207 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4208 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4209 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4210 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4211 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4212 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4213 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4214 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4215
4216 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4217 {
4218         unsigned short ggc;
4219
4220         if (pci_read_config_word(dev, GGC, &ggc))
4221                 return;
4222
4223         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4224                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4225                 dmar_map_gfx = 0;
4226         } else if (dmar_map_gfx) {
4227                 /* we have to ensure the gfx device is idle before we flush */
4228                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4229                 intel_iommu_strict = 1;
4230         }
4231 }
4232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4234 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4235 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4236
4237 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4238    ISOCH DMAR unit for the Azalia sound device, but not give it any
4239    TLB entries, which causes it to deadlock. Check for that.  We do
4240    this in a function called from init_dmars(), instead of in a PCI
4241    quirk, because we don't want to print the obnoxious "BIOS broken"
4242    message if VT-d is actually disabled.
4243 */
4244 static void __init check_tylersburg_isoch(void)
4245 {
4246         struct pci_dev *pdev;
4247         uint32_t vtisochctrl;
4248
4249         /* If there's no Azalia in the system anyway, forget it. */
4250         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4251         if (!pdev)
4252                 return;
4253         pci_dev_put(pdev);
4254
4255         /* System Management Registers. Might be hidden, in which case
4256            we can't do the sanity check. But that's OK, because the
4257            known-broken BIOSes _don't_ actually hide it, so far. */
4258         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4259         if (!pdev)
4260                 return;
4261
4262         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4263                 pci_dev_put(pdev);
4264                 return;
4265         }
4266
4267         pci_dev_put(pdev);
4268
4269         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4270         if (vtisochctrl & 1)
4271                 return;
4272
4273         /* Drop all bits other than the number of TLB entries */
4274         vtisochctrl &= 0x1c;
4275
4276         /* If we have the recommended number of TLB entries (16), fine. */
4277         if (vtisochctrl == 0x10)
4278                 return;
4279
4280         /* Zero TLB entries? You get to ride the short bus to school. */
4281         if (!vtisochctrl) {
4282                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4283                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4284                      dmi_get_system_info(DMI_BIOS_VENDOR),
4285                      dmi_get_system_info(DMI_BIOS_VERSION),
4286                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4287                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4288                 return;
4289         }
4290
4291         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4292                vtisochctrl);
4293 }