iommu/vt-d: Fix an off-by-one bug in __domain_mapping()
[pandora-kernel.git] drivers/iommu/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
46
47 #define ROOT_SIZE               VTD_PAGE_SIZE
48 #define CONTEXT_SIZE            VTD_PAGE_SIZE
49
50 #define IS_BRIDGE_HOST_DEVICE(pdev) \
51                             ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63
64 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
65 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66
67 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
68    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
69 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
70                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
71 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
72
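/*
 * For example, with the default 48-bit guest address width:
 *   __DOMAIN_MAX_PFN(48) = (1ULL << 36) - 1 = 0xFFFFFFFFF (2^36 VT-d pages)
 *   DOMAIN_MAX_ADDR(48)  = 0xFFFFFFFFF000 (256TiB - 4KiB)
 * On 64-bit builds DOMAIN_MAX_PFN(48) is the same value; on 32-bit builds it
 * is clamped to ULONG_MAX so that PFNs still fit in an unsigned long.
 */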
73 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
74 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
75 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
76
77 /* page table handling */
78 #define LEVEL_STRIDE            (9)
79 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
80
81 static inline int agaw_to_level(int agaw)
82 {
83         return agaw + 2;
84 }
85
86 static inline int agaw_to_width(int agaw)
87 {
88         return 30 + agaw * LEVEL_STRIDE;
89 }
90
91 static inline int width_to_agaw(int width)
92 {
93         return (width - 30) / LEVEL_STRIDE;
94 }
95
96 static inline unsigned int level_to_offset_bits(int level)
97 {
98         return (level - 1) * LEVEL_STRIDE;
99 }
100
101 static inline int pfn_level_offset(unsigned long pfn, int level)
102 {
103         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
104 }
105
106 static inline unsigned long level_mask(int level)
107 {
108         return -1UL << level_to_offset_bits(level);
109 }
110
111 static inline unsigned long level_size(int level)
112 {
113         return 1UL << level_to_offset_bits(level);
114 }
115
116 static inline unsigned long align_to_level(unsigned long pfn, int level)
117 {
118         return (pfn + level_size(level) - 1) & level_mask(level);
119 }
120
121 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
122 {
123         return  1 << ((lvl - 1) * LEVEL_STRIDE);
124 }
125
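/*
 * Example of how the helpers above decompose a DMA pfn: with agaw == 2
 * (agaw_to_level() == 4, i.e. a 4-level, 48-bit page table), each level
 * indexes LEVEL_STRIDE == 9 bits of the pfn:
 *   pfn_level_offset(pfn, 4) = (pfn >> 27) & 0x1ff   (top level)
 *   pfn_level_offset(pfn, 3) = (pfn >> 18) & 0x1ff
 *   pfn_level_offset(pfn, 2) = (pfn >>  9) & 0x1ff
 *   pfn_level_offset(pfn, 1) =  pfn        & 0x1ff   (leaf level)
 * and lvl_to_nr_pages(2) == 512, the number of 4KiB pages covered by one
 * level-2 (2MiB) entry.
 */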
126 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
127    are never going to work. */
128 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
129 {
130         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
131 }
132
133 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
134 {
135         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
136 }
137 static inline unsigned long page_to_dma_pfn(struct page *pg)
138 {
139         return mm_to_dma_pfn(page_to_pfn(pg));
140 }
141 static inline unsigned long virt_to_dma_pfn(void *p)
142 {
143         return page_to_dma_pfn(virt_to_page(p));
144 }
145
146 /* global iommu list, set NULL for ignored DMAR units */
147 static struct intel_iommu **g_iommus;
148
149 static void __init check_tylersburg_isoch(void);
150 static int rwbf_quirk;
151
152 /*
153  * set to 1 to panic the kernel if VT-d can't be enabled successfully
154  * (used when the kernel is launched with TXT)
155  */
156 static int force_on = 0;
157
158 /*
159  * 0: Present
160  * 1-11: Reserved
161  * 12-63: Context Ptr (12 - (haw-1))
162  * 64-127: Reserved
163  */
164 struct root_entry {
165         u64     val;
166         u64     rsvd1;
167 };
168 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
169 static inline bool root_present(struct root_entry *root)
170 {
171         return (root->val & 1);
172 }
173 static inline void set_root_present(struct root_entry *root)
174 {
175         root->val |= 1;
176 }
177 static inline void set_root_value(struct root_entry *root, unsigned long value)
178 {
179         root->val |= value & VTD_PAGE_MASK;
180 }
181
182 static inline struct context_entry *
183 get_context_addr_from_root(struct root_entry *root)
184 {
185         return (struct context_entry *)
186                 (root_present(root)?phys_to_virt(
187                 root->val & VTD_PAGE_MASK) :
188                 NULL);
189 }
190
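/*
 * The root table thus holds one root_entry per bus (ROOT_ENTRY_NR == 256
 * sixteen-byte entries in one 4KiB page), and each present entry points to a
 * 4KiB context table indexed by devfn; e.g. a device at 00:1f.3 is described
 * by root_entry[0x00] and context[0xfb].
 */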
191 /*
192  * low 64 bits:
193  * 0: present
194  * 1: fault processing disable
195  * 2-3: translation type
196  * 12-63: address space root
197  * high 64 bits:
198  * 0-2: address width
199  * 3-6: aval
200  * 8-23: domain id
201  */
202 struct context_entry {
203         u64 lo;
204         u64 hi;
205 };
206
207 static inline bool context_present(struct context_entry *context)
208 {
209         return (context->lo & 1);
210 }
211 static inline void context_set_present(struct context_entry *context)
212 {
213         context->lo |= 1;
214 }
215
216 static inline void context_set_fault_enable(struct context_entry *context)
217 {
218         context->lo &= (((u64)-1) << 2) | 1;
219 }
220
221 static inline void context_set_translation_type(struct context_entry *context,
222                                                 unsigned long value)
223 {
224         context->lo &= (((u64)-1) << 4) | 3;
225         context->lo |= (value & 3) << 2;
226 }
227
228 static inline void context_set_address_root(struct context_entry *context,
229                                             unsigned long value)
230 {
231         context->lo |= value & VTD_PAGE_MASK;
232 }
233
234 static inline void context_set_address_width(struct context_entry *context,
235                                              unsigned long value)
236 {
237         context->hi |= value & 7;
238 }
239
240 static inline void context_set_domain_id(struct context_entry *context,
241                                          unsigned long value)
242 {
243         context->hi |= (value & ((1 << 16) - 1)) << 8;
244 }
245
246 static inline void context_clear_entry(struct context_entry *context)
247 {
248         context->lo = 0;
249         context->hi = 0;
250 }
251
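/*
 * Putting the helpers above together: a context entry for a (hypothetical)
 * domain id 5 using a 4-level table (agaw == 2) rooted at physical address
 * 0x1234000, with multi-level translation (TT == 00), ends up as
 *   lo = 0x1234000 | 1      (address space root | present)
 *   hi = (5 << 8)  | 2      (domain id 5, address width 2 -> 48-bit)
 * i.e. lo == 0x1234001 and hi == 0x502.
 */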
252 /*
253  * 0: readable
254  * 1: writable
255  * 2-6: reserved
256  * 7: super page
257  * 8-10: available
258  * 11: snoop behavior
259  * 12-63: Host physical address
260  */
261 struct dma_pte {
262         u64 val;
263 };
264
265 static inline void dma_clear_pte(struct dma_pte *pte)
266 {
267         pte->val = 0;
268 }
269
270 static inline void dma_set_pte_readable(struct dma_pte *pte)
271 {
272         pte->val |= DMA_PTE_READ;
273 }
274
275 static inline void dma_set_pte_writable(struct dma_pte *pte)
276 {
277         pte->val |= DMA_PTE_WRITE;
278 }
279
280 static inline void dma_set_pte_snp(struct dma_pte *pte)
281 {
282         pte->val |= DMA_PTE_SNP;
283 }
284
285 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
286 {
287         pte->val = (pte->val & ~3) | (prot & 3);
288 }
289
290 static inline u64 dma_pte_addr(struct dma_pte *pte)
291 {
292 #ifdef CONFIG_64BIT
293         return pte->val & VTD_PAGE_MASK;
294 #else
295         /* Must have a full atomic 64-bit read */
296         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
297 #endif
298 }
299
300 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
301 {
302         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
303 }
304
305 static inline bool dma_pte_present(struct dma_pte *pte)
306 {
307         return (pte->val & 3) != 0;
308 }
309
310 static inline bool dma_pte_superpage(struct dma_pte *pte)
311 {
312         return (pte->val & (1 << 7));
313 }
314
315 static inline int first_pte_in_page(struct dma_pte *pte)
316 {
317         return !((unsigned long)pte & ~VTD_PAGE_MASK);
318 }
319
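/*
 * A leaf PTE is built from the helpers above; for a (hypothetical) page at
 * host pfn 0x12345 mapped read/write the value is
 *   ((u64)0x12345 << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE
 * i.e. 0x12345003, for which dma_pte_present() is true and dma_pte_addr()
 * returns 0x12345000.
 */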
320 /*
321  * This domain is a static identity mapping domain.
322  *      1. This domain creates a static 1:1 mapping to all usable memory.
323  *      2. It maps to each iommu if successful.
324  *      3. Each iommu maps to this domain if successful.
325  */
326 static struct dmar_domain *si_domain;
327 static int hw_pass_through = 1;
328
329 /* devices under the same p2p bridge are owned in one domain */
330 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
331
332 /* domain represents a virtual machine; more than one device
333  * across iommus may be owned by one domain, e.g. a kvm guest.
334  */
335 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
336
337 /* si_domain contains multiple devices */
338 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
339
340 struct dmar_domain {
341         int     id;                     /* domain id */
342         int     nid;                    /* node id */
343         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
344
345         struct list_head devices;       /* all devices' list */
346         struct iova_domain iovad;       /* iova's that belong to this domain */
347
348         struct dma_pte  *pgd;           /* virtual address */
349         int             gaw;            /* max guest address width */
350
351         /* adjusted guest address width, 0 is level 2 30-bit */
352         int             agaw;
353
354         int             flags;          /* flags to find out type of domain */
355
356         int             iommu_coherency;/* indicate coherency of iommu access */
357         int             iommu_snooping; /* indicate snooping control feature*/
358         int             iommu_count;    /* reference count of iommu */
359         int             iommu_superpage;/* Level of superpages supported:
360                                            0 == 4KiB (no superpages), 1 == 2MiB,
361                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
362         spinlock_t      iommu_lock;     /* protect iommu set in domain */
363         u64             max_addr;       /* maximum mapped address */
364 };
365
366 /* PCI domain-device relationship */
367 struct device_domain_info {
368         struct list_head link;  /* link to domain siblings */
369         struct list_head global; /* link to global list */
370         int segment;            /* PCI domain */
371         u8 bus;                 /* PCI bus number */
372         u8 devfn;               /* PCI devfn number */
373         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
374         struct intel_iommu *iommu; /* IOMMU used by this device */
375         struct dmar_domain *domain; /* pointer to domain */
376 };
377
378 static void flush_unmaps_timeout(unsigned long data);
379
380 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
381
382 #define HIGH_WATER_MARK 250
383 struct deferred_flush_tables {
384         int next;
385         struct iova *iova[HIGH_WATER_MARK];
386         struct dmar_domain *domain[HIGH_WATER_MARK];
387 };
388
389 static struct deferred_flush_tables *deferred_flush;
390
391 /* number of iommus; used to size g_iommus and the iommu bitmaps */
392 static int g_num_of_iommus;
393
394 static DEFINE_SPINLOCK(async_umap_flush_lock);
395 static LIST_HEAD(unmaps_to_do);
396
397 static int timer_on;
398 static long list_size;
399
400 static void domain_remove_dev_info(struct dmar_domain *domain);
401
402 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
403 int dmar_disabled = 0;
404 #else
405 int dmar_disabled = 1;
406 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
407
408 int intel_iommu_enabled = 0;
409 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
410
411 static int dmar_map_gfx = 1;
412 static int dmar_forcedac;
413 static int intel_iommu_strict;
414 static int intel_iommu_superpage = 1;
415
416 int intel_iommu_gfx_mapped;
417 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
418
419 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
420 static DEFINE_SPINLOCK(device_domain_lock);
421 static LIST_HEAD(device_domain_list);
422
423 static struct iommu_ops intel_iommu_ops;
424
425 static int __init intel_iommu_setup(char *str)
426 {
427         if (!str)
428                 return -EINVAL;
429         while (*str) {
430                 if (!strncmp(str, "on", 2)) {
431                         dmar_disabled = 0;
432                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
433                 } else if (!strncmp(str, "off", 3)) {
434                         dmar_disabled = 1;
435                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
436                 } else if (!strncmp(str, "igfx_off", 8)) {
437                         dmar_map_gfx = 0;
438                         printk(KERN_INFO
439                                 "Intel-IOMMU: disable GFX device mapping\n");
440                 } else if (!strncmp(str, "forcedac", 8)) {
441                         printk(KERN_INFO
442                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
443                         dmar_forcedac = 1;
444                 } else if (!strncmp(str, "strict", 6)) {
445                         printk(KERN_INFO
446                                 "Intel-IOMMU: disable batched IOTLB flush\n");
447                         intel_iommu_strict = 1;
448                 } else if (!strncmp(str, "sp_off", 6)) {
449                         printk(KERN_INFO
450                                 "Intel-IOMMU: disable super page support\n");
451                         intel_iommu_superpage = 0;
452                 }
453
454                 str += strcspn(str, ",");
455                 while (*str == ',')
456                         str++;
457         }
458         return 0;
459 }
460 __setup("intel_iommu=", intel_iommu_setup);
461
462 static struct kmem_cache *iommu_domain_cache;
463 static struct kmem_cache *iommu_devinfo_cache;
464 static struct kmem_cache *iommu_iova_cache;
465
466 static inline void *alloc_pgtable_page(int node)
467 {
468         struct page *page;
469         void *vaddr = NULL;
470
471         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
472         if (page)
473                 vaddr = page_address(page);
474         return vaddr;
475 }
476
477 static inline void free_pgtable_page(void *vaddr)
478 {
479         free_page((unsigned long)vaddr);
480 }
481
482 static inline void *alloc_domain_mem(void)
483 {
484         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
485 }
486
487 static void free_domain_mem(void *vaddr)
488 {
489         kmem_cache_free(iommu_domain_cache, vaddr);
490 }
491
492 static inline void * alloc_devinfo_mem(void)
493 {
494         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
495 }
496
497 static inline void free_devinfo_mem(void *vaddr)
498 {
499         kmem_cache_free(iommu_devinfo_cache, vaddr);
500 }
501
502 struct iova *alloc_iova_mem(void)
503 {
504         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
505 }
506
507 void free_iova_mem(struct iova *iova)
508 {
509         kmem_cache_free(iommu_iova_cache, iova);
510 }
511
512
513 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
514 {
515         unsigned long sagaw;
516         int agaw = -1;
517
518         sagaw = cap_sagaw(iommu->cap);
519         for (agaw = width_to_agaw(max_gaw);
520              agaw >= 0; agaw--) {
521                 if (test_bit(agaw, &sagaw))
522                         break;
523         }
524
525         return agaw;
526 }
527
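/*
 * For example, with DEFAULT_DOMAIN_ADDRESS_WIDTH == 48 the loop starts at
 * width_to_agaw(48) == 2 (a 4-level table). If bit 2 of the SAGAW field is
 * set, the iommu supports that geometry directly; otherwise the loop steps
 * down to agaw 1 (39-bit, 3-level) or agaw 0 (30-bit, 2-level), and returns
 * -1 if none of them is supported.
 */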
528 /*
529  * Calculate max SAGAW for each iommu.
530  */
531 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
532 {
533         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
534 }
535
536 /*
537  * Calculate agaw for each iommu.
538  * "SAGAW" may be different across iommus; use a default agaw and fall
539  * back to a smaller supported agaw for iommus that don't support it.
540  */
541 int iommu_calculate_agaw(struct intel_iommu *iommu)
542 {
543         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
544 }
545
546 /* This function only returns a single iommu in a domain */
547 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
548 {
549         int iommu_id;
550
551         /* si_domain and vm domain should not get here. */
552         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
553         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
554
555         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
556         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
557                 return NULL;
558
559         return g_iommus[iommu_id];
560 }
561
562 static void domain_update_iommu_coherency(struct dmar_domain *domain)
563 {
564         int i;
565
566         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
567
568         domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
569
570         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
571                 if (!ecap_coherent(g_iommus[i]->ecap)) {
572                         domain->iommu_coherency = 0;
573                         break;
574                 }
575         }
576 }
577
578 static void domain_update_iommu_snooping(struct dmar_domain *domain)
579 {
580         int i;
581
582         domain->iommu_snooping = 1;
583
584         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
585                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
586                         domain->iommu_snooping = 0;
587                         break;
588                 }
589         }
590 }
591
592 static void domain_update_iommu_superpage(struct dmar_domain *domain)
593 {
594         struct dmar_drhd_unit *drhd;
595         struct intel_iommu *iommu = NULL;
596         int mask = 0xf;
597
598         if (!intel_iommu_superpage) {
599                 domain->iommu_superpage = 0;
600                 return;
601         }
602
603         /* set iommu_superpage to the smallest common denominator */
604         for_each_active_iommu(iommu, drhd) {
605                 mask &= cap_super_page_val(iommu->cap);
606                 if (!mask) {
607                         break;
608                 }
609         }
610         domain->iommu_superpage = fls(mask);
611 }
612
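/*
 * Example: if one active iommu reports 2MiB and 1GiB superpage support
 * (cap_super_page_val() == 0x3) and another reports only 2MiB (0x1), the
 * intersection is 0x1 and fls(0x1) == 1, so the domain is limited to 2MiB
 * superpages.  If any iommu reports 0, superpages are disabled entirely.
 */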
613 /* Some capabilities may be different across iommus */
614 static void domain_update_iommu_cap(struct dmar_domain *domain)
615 {
616         domain_update_iommu_coherency(domain);
617         domain_update_iommu_snooping(domain);
618         domain_update_iommu_superpage(domain);
619 }
620
621 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
622 {
623         struct dmar_drhd_unit *drhd = NULL;
624         int i;
625
626         for_each_drhd_unit(drhd) {
627                 if (drhd->ignored)
628                         continue;
629                 if (segment != drhd->segment)
630                         continue;
631
632                 for (i = 0; i < drhd->devices_cnt; i++) {
633                         if (drhd->devices[i] &&
634                             drhd->devices[i]->bus->number == bus &&
635                             drhd->devices[i]->devfn == devfn)
636                                 return drhd->iommu;
637                         if (drhd->devices[i] &&
638                             drhd->devices[i]->subordinate &&
639                             drhd->devices[i]->subordinate->number <= bus &&
640                             drhd->devices[i]->subordinate->subordinate >= bus)
641                                 return drhd->iommu;
642                 }
643
644                 if (drhd->include_all)
645                         return drhd->iommu;
646         }
647
648         return NULL;
649 }
650
651 static void domain_flush_cache(struct dmar_domain *domain,
652                                void *addr, int size)
653 {
654         if (!domain->iommu_coherency)
655                 clflush_cache_range(addr, size);
656 }
657
658 /* Gets context entry for a given bus and devfn */
659 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
660                 u8 bus, u8 devfn)
661 {
662         struct root_entry *root;
663         struct context_entry *context;
664         unsigned long phy_addr;
665         unsigned long flags;
666
667         spin_lock_irqsave(&iommu->lock, flags);
668         root = &iommu->root_entry[bus];
669         context = get_context_addr_from_root(root);
670         if (!context) {
671                 context = (struct context_entry *)
672                                 alloc_pgtable_page(iommu->node);
673                 if (!context) {
674                         spin_unlock_irqrestore(&iommu->lock, flags);
675                         return NULL;
676                 }
677                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
678                 phy_addr = virt_to_phys((void *)context);
679                 set_root_value(root, phy_addr);
680                 set_root_present(root);
681                 __iommu_flush_cache(iommu, root, sizeof(*root));
682         }
683         spin_unlock_irqrestore(&iommu->lock, flags);
684         return &context[devfn];
685 }
686
687 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
688 {
689         struct root_entry *root;
690         struct context_entry *context;
691         int ret;
692         unsigned long flags;
693
694         spin_lock_irqsave(&iommu->lock, flags);
695         root = &iommu->root_entry[bus];
696         context = get_context_addr_from_root(root);
697         if (!context) {
698                 ret = 0;
699                 goto out;
700         }
701         ret = context_present(&context[devfn]);
702 out:
703         spin_unlock_irqrestore(&iommu->lock, flags);
704         return ret;
705 }
706
707 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
708 {
709         struct root_entry *root;
710         struct context_entry *context;
711         unsigned long flags;
712
713         spin_lock_irqsave(&iommu->lock, flags);
714         root = &iommu->root_entry[bus];
715         context = get_context_addr_from_root(root);
716         if (context) {
717                 context_clear_entry(&context[devfn]);
718                 __iommu_flush_cache(iommu, &context[devfn], \
719                         sizeof(*context));
720         }
721         spin_unlock_irqrestore(&iommu->lock, flags);
722 }
723
724 static void free_context_table(struct intel_iommu *iommu)
725 {
726         struct root_entry *root;
727         int i;
728         unsigned long flags;
729         struct context_entry *context;
730
731         spin_lock_irqsave(&iommu->lock, flags);
732         if (!iommu->root_entry) {
733                 goto out;
734         }
735         for (i = 0; i < ROOT_ENTRY_NR; i++) {
736                 root = &iommu->root_entry[i];
737                 context = get_context_addr_from_root(root);
738                 if (context)
739                         free_pgtable_page(context);
740         }
741         free_pgtable_page(iommu->root_entry);
742         iommu->root_entry = NULL;
743 out:
744         spin_unlock_irqrestore(&iommu->lock, flags);
745 }
746
747 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
748                                       unsigned long pfn, int target_level)
749 {
750         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
751         struct dma_pte *parent, *pte = NULL;
752         int level = agaw_to_level(domain->agaw);
753         int offset;
754
755         BUG_ON(!domain->pgd);
756         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
757         parent = domain->pgd;
758
759         while (level > 0) {
760                 void *tmp_page;
761
762                 offset = pfn_level_offset(pfn, level);
763                 pte = &parent[offset];
764                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
765                         break;
766                 if (level == target_level)
767                         break;
768
769                 if (!dma_pte_present(pte)) {
770                         uint64_t pteval;
771
772                         tmp_page = alloc_pgtable_page(domain->nid);
773
774                         if (!tmp_page)
775                                 return NULL;
776
777                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
778                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
779                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
780                                 /* Someone else set it while we were thinking; use theirs. */
781                                 free_pgtable_page(tmp_page);
782                         } else {
783                                 dma_pte_addr(pte);
784                                 domain_flush_cache(domain, pte, sizeof(*pte));
785                         }
786                 }
787                 parent = phys_to_virt(dma_pte_addr(pte));
788                 level--;
789         }
790
791         return pte;
792 }
793
794
795 /* return address's pte at specific level */
796 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
797                                          unsigned long pfn,
798                                          int level, int *large_page)
799 {
800         struct dma_pte *parent, *pte = NULL;
801         int total = agaw_to_level(domain->agaw);
802         int offset;
803
804         parent = domain->pgd;
805         while (level <= total) {
806                 offset = pfn_level_offset(pfn, total);
807                 pte = &parent[offset];
808                 if (level == total)
809                         return pte;
810
811                 if (!dma_pte_present(pte)) {
812                         *large_page = total;
813                         break;
814                 }
815
816                 if (pte->val & DMA_PTE_LARGE_PAGE) {
817                         *large_page = total;
818                         return pte;
819                 }
820
821                 parent = phys_to_virt(dma_pte_addr(pte));
822                 total--;
823         }
824         return NULL;
825 }
826
827 /* clear last level pte; a tlb flush should follow */
828 static int dma_pte_clear_range(struct dmar_domain *domain,
829                                 unsigned long start_pfn,
830                                 unsigned long last_pfn)
831 {
832         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
833         unsigned int large_page = 1;
834         struct dma_pte *first_pte, *pte;
835         int order;
836
837         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
838         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
839         BUG_ON(start_pfn > last_pfn);
840
841         /* we don't need lock here; nobody else touches the iova range */
842         do {
843                 large_page = 1;
844                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
845                 if (!pte) {
846                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
847                         continue;
848                 }
849                 do {
850                         dma_clear_pte(pte);
851                         start_pfn += lvl_to_nr_pages(large_page);
852                         pte++;
853                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
854
855                 domain_flush_cache(domain, first_pte,
856                                    (void *)pte - (void *)first_pte);
857
858         } while (start_pfn && start_pfn <= last_pfn);
859
860         order = (large_page - 1) * 9;
861         return order;
862 }
863
864 static void dma_pte_free_level(struct dmar_domain *domain, int level,
865                                struct dma_pte *pte, unsigned long pfn,
866                                unsigned long start_pfn, unsigned long last_pfn)
867 {
868         pfn = max(start_pfn, pfn);
869         pte = &pte[pfn_level_offset(pfn, level)];
870
871         do {
872                 unsigned long level_pfn;
873                 struct dma_pte *level_pte;
874
875                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
876                         goto next;
877
878                 level_pfn = pfn & level_mask(level - 1);
879                 level_pte = phys_to_virt(dma_pte_addr(pte));
880
881                 if (level > 2)
882                         dma_pte_free_level(domain, level - 1, level_pte,
883                                            level_pfn, start_pfn, last_pfn);
884
885                 /* If range covers entire pagetable, free it */
886                 if (!(start_pfn > level_pfn ||
887                       last_pfn < level_pfn + level_size(level) - 1)) {
888                         dma_clear_pte(pte);
889                         domain_flush_cache(domain, pte, sizeof(*pte));
890                         free_pgtable_page(level_pte);
891                 }
892 next:
893                 pfn += level_size(level);
894         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
895 }
896
897 /* free page table pages. last level pte should already be cleared */
898 static void dma_pte_free_pagetable(struct dmar_domain *domain,
899                                    unsigned long start_pfn,
900                                    unsigned long last_pfn)
901 {
902         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
903
904         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
905         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
906         BUG_ON(start_pfn > last_pfn);
907
908         /* We don't need lock here; nobody else touches the iova range */
909         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
910                            domain->pgd, 0, start_pfn, last_pfn);
911
912         /* free pgd */
913         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
914                 free_pgtable_page(domain->pgd);
915                 domain->pgd = NULL;
916         }
917 }
918
919 /* iommu handling */
920 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
921 {
922         struct root_entry *root;
923         unsigned long flags;
924
925         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
926         if (!root)
927                 return -ENOMEM;
928
929         __iommu_flush_cache(iommu, root, ROOT_SIZE);
930
931         spin_lock_irqsave(&iommu->lock, flags);
932         iommu->root_entry = root;
933         spin_unlock_irqrestore(&iommu->lock, flags);
934
935         return 0;
936 }
937
938 static void iommu_set_root_entry(struct intel_iommu *iommu)
939 {
940         void *addr;
941         u32 sts;
942         unsigned long flag;
943
944         addr = iommu->root_entry;
945
946         raw_spin_lock_irqsave(&iommu->register_lock, flag);
947         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
948
949         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
950
951         /* Make sure hardware completes it */
952         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
953                       readl, (sts & DMA_GSTS_RTPS), sts);
954
955         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
956 }
957
958 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
959 {
960         u32 val;
961         unsigned long flag;
962
963         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
964                 return;
965
966         raw_spin_lock_irqsave(&iommu->register_lock, flag);
967         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
968
969         /* Make sure hardware completes it */
970         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
971                       readl, (!(val & DMA_GSTS_WBFS)), val);
972
973         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
974 }
975
976 /* return value determines if we need a write buffer flush */
977 static void __iommu_flush_context(struct intel_iommu *iommu,
978                                   u16 did, u16 source_id, u8 function_mask,
979                                   u64 type)
980 {
981         u64 val = 0;
982         unsigned long flag;
983
984         switch (type) {
985         case DMA_CCMD_GLOBAL_INVL:
986                 val = DMA_CCMD_GLOBAL_INVL;
987                 break;
988         case DMA_CCMD_DOMAIN_INVL:
989                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
990                 break;
991         case DMA_CCMD_DEVICE_INVL:
992                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
993                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
994                 break;
995         default:
996                 BUG();
997         }
998         val |= DMA_CCMD_ICC;
999
1000         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1001         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1002
1003         /* Make sure hardware completes it */
1004         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1005                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1006
1007         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1008 }
1009
1010 /* return value determines if we need a write buffer flush */
1011 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1012                                 u64 addr, unsigned int size_order, u64 type)
1013 {
1014         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1015         u64 val = 0, val_iva = 0;
1016         unsigned long flag;
1017
1018         switch (type) {
1019         case DMA_TLB_GLOBAL_FLUSH:
1020                 /* global flush doesn't need to set IVA_REG */
1021                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1022                 break;
1023         case DMA_TLB_DSI_FLUSH:
1024                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1025                 break;
1026         case DMA_TLB_PSI_FLUSH:
1027                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1028                 /* Note: always flush non-leaf currently */
1029                 val_iva = size_order | addr;
1030                 break;
1031         default:
1032                 BUG();
1033         }
1034         /* Note: set drain read/write */
1035 #if 0
1036         /*
1037          * This is probably just to be extra safe; it looks like we can
1038          * ignore it without any impact.
1039          */
1040         if (cap_read_drain(iommu->cap))
1041                 val |= DMA_TLB_READ_DRAIN;
1042 #endif
1043         if (cap_write_drain(iommu->cap))
1044                 val |= DMA_TLB_WRITE_DRAIN;
1045
1046         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1047         /* Note: Only uses first TLB reg currently */
1048         if (val_iva)
1049                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1050         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1051
1052         /* Make sure hardware completes it */
1053         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1054                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1055
1056         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1057
1058         /* check IOTLB invalidation granularity */
1059         if (DMA_TLB_IAIG(val) == 0)
1060                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1061         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1062                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1063                         (unsigned long long)DMA_TLB_IIRG(type),
1064                         (unsigned long long)DMA_TLB_IAIG(val));
1065 }
1066
1067 static struct device_domain_info *iommu_support_dev_iotlb(
1068         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1069 {
1070         int found = 0;
1071         unsigned long flags;
1072         struct device_domain_info *info;
1073         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1074
1075         if (!ecap_dev_iotlb_support(iommu->ecap))
1076                 return NULL;
1077
1078         if (!iommu->qi)
1079                 return NULL;
1080
1081         spin_lock_irqsave(&device_domain_lock, flags);
1082         list_for_each_entry(info, &domain->devices, link)
1083                 if (info->bus == bus && info->devfn == devfn) {
1084                         found = 1;
1085                         break;
1086                 }
1087         spin_unlock_irqrestore(&device_domain_lock, flags);
1088
1089         if (!found || !info->dev)
1090                 return NULL;
1091
1092         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1093                 return NULL;
1094
1095         if (!dmar_find_matched_atsr_unit(info->dev))
1096                 return NULL;
1097
1098         info->iommu = iommu;
1099
1100         return info;
1101 }
1102
1103 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1104 {
1105         if (!info)
1106                 return;
1107
1108         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1109 }
1110
1111 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1112 {
1113         if (!info->dev || !pci_ats_enabled(info->dev))
1114                 return;
1115
1116         pci_disable_ats(info->dev);
1117 }
1118
1119 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1120                                   u64 addr, unsigned mask)
1121 {
1122         u16 sid, qdep;
1123         unsigned long flags;
1124         struct device_domain_info *info;
1125
1126         spin_lock_irqsave(&device_domain_lock, flags);
1127         list_for_each_entry(info, &domain->devices, link) {
1128                 if (!info->dev || !pci_ats_enabled(info->dev))
1129                         continue;
1130
1131                 sid = info->bus << 8 | info->devfn;
1132                 qdep = pci_ats_queue_depth(info->dev);
1133                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1134         }
1135         spin_unlock_irqrestore(&device_domain_lock, flags);
1136 }
1137
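/*
 * The source-id passed to qi_flush_dev_iotlb() above is the standard PCI
 * requester id, bus << 8 | devfn; e.g. a function at 03:00.1 yields
 * sid 0x0301.
 */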
1138 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1139                                   unsigned long pfn, unsigned int pages, int map)
1140 {
1141         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1142         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1143
1144         BUG_ON(pages == 0);
1145
1146         /*
1147          * Fall back to domain-selective flush if there is no PSI support or
1148          * the size is too big.
1149          * PSI requires the page size to be 2 ^ x, and the base address to be
1150          * naturally aligned to that size.
1151          */
1152         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1153                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1154                                                 DMA_TLB_DSI_FLUSH);
1155         else
1156                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1157                                                 DMA_TLB_PSI_FLUSH);
1158
1159         /*
1160          * In caching mode, changes of pages from non-present to present require
1161          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1162          */
1163         if (!cap_caching_mode(iommu->cap) || !map)
1164                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1165 }
1166
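/*
 * Example of the mask calculation above: flushing 9 pages gives
 * mask = ilog2(__roundup_pow_of_two(9)) = ilog2(16) = 4, so the hardware
 * invalidates a 16-page (64KiB) region whose base address must be 64KiB
 * aligned, as PSI requires.
 */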
1167 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1168 {
1169         u32 pmen;
1170         unsigned long flags;
1171
1172         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1173         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1174         pmen &= ~DMA_PMEN_EPM;
1175         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1176
1177         /* wait for the protected region status bit to clear */
1178         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1179                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1180
1181         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1182 }
1183
1184 static int iommu_enable_translation(struct intel_iommu *iommu)
1185 {
1186         u32 sts;
1187         unsigned long flags;
1188
1189         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1190         iommu->gcmd |= DMA_GCMD_TE;
1191         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1192
1193         /* Make sure hardware completes it */
1194         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1195                       readl, (sts & DMA_GSTS_TES), sts);
1196
1197         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1198         return 0;
1199 }
1200
1201 static int iommu_disable_translation(struct intel_iommu *iommu)
1202 {
1203         u32 sts;
1204         unsigned long flag;
1205
1206         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1207         iommu->gcmd &= ~DMA_GCMD_TE;
1208         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1209
1210         /* Make sure hardware completes it */
1211         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1212                       readl, (!(sts & DMA_GSTS_TES)), sts);
1213
1214         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1215         return 0;
1216 }
1217
1218
1219 static int iommu_init_domains(struct intel_iommu *iommu)
1220 {
1221         unsigned long ndomains;
1222         unsigned long nlongs;
1223
1224         ndomains = cap_ndoms(iommu->cap);
1225         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1226                         ndomains);
1227         nlongs = BITS_TO_LONGS(ndomains);
1228
1229         spin_lock_init(&iommu->lock);
1230
1231         /* TBD: there might be 64K domains;
1232          * consider a different allocation scheme for future chips
1233          */
1234         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1235         if (!iommu->domain_ids) {
1236                 printk(KERN_ERR "Allocating domain id array failed\n");
1237                 return -ENOMEM;
1238         }
1239         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1240                         GFP_KERNEL);
1241         if (!iommu->domains) {
1242                 printk(KERN_ERR "Allocating domain array failed\n");
1243                 return -ENOMEM;
1244         }
1245
1246         /*
1247          * if Caching mode is set, then invalid translations are tagged
1248          * with domain id 0. Hence we need to pre-allocate it.
1249          */
1250         if (cap_caching_mode(iommu->cap))
1251                 set_bit(0, iommu->domain_ids);
1252         return 0;
1253 }
1254
1255
1256 static void domain_exit(struct dmar_domain *domain);
1257 static void vm_domain_exit(struct dmar_domain *domain);
1258
1259 void free_dmar_iommu(struct intel_iommu *iommu)
1260 {
1261         struct dmar_domain *domain;
1262         int i;
1263         unsigned long flags;
1264
1265         if ((iommu->domains) && (iommu->domain_ids)) {
1266                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1267                         domain = iommu->domains[i];
1268                         clear_bit(i, iommu->domain_ids);
1269
1270                         spin_lock_irqsave(&domain->iommu_lock, flags);
1271                         if (--domain->iommu_count == 0) {
1272                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1273                                         vm_domain_exit(domain);
1274                                 else
1275                                         domain_exit(domain);
1276                         }
1277                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1278                 }
1279         }
1280
1281         if (iommu->gcmd & DMA_GCMD_TE)
1282                 iommu_disable_translation(iommu);
1283
1284         if (iommu->irq) {
1285                 irq_set_handler_data(iommu->irq, NULL);
1286                 /* This will mask the irq */
1287                 free_irq(iommu->irq, iommu);
1288                 destroy_irq(iommu->irq);
1289         }
1290
1291         kfree(iommu->domains);
1292         kfree(iommu->domain_ids);
1293
1294         g_iommus[iommu->seq_id] = NULL;
1295
1296         /* if all iommus are freed, free g_iommus */
1297         for (i = 0; i < g_num_of_iommus; i++) {
1298                 if (g_iommus[i])
1299                         break;
1300         }
1301
1302         if (i == g_num_of_iommus)
1303                 kfree(g_iommus);
1304
1305         /* free context mapping */
1306         free_context_table(iommu);
1307 }
1308
1309 static struct dmar_domain *alloc_domain(void)
1310 {
1311         struct dmar_domain *domain;
1312
1313         domain = alloc_domain_mem();
1314         if (!domain)
1315                 return NULL;
1316
1317         domain->nid = -1;
1318         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1319         domain->flags = 0;
1320
1321         return domain;
1322 }
1323
1324 static int iommu_attach_domain(struct dmar_domain *domain,
1325                                struct intel_iommu *iommu)
1326 {
1327         int num;
1328         unsigned long ndomains;
1329         unsigned long flags;
1330
1331         ndomains = cap_ndoms(iommu->cap);
1332
1333         spin_lock_irqsave(&iommu->lock, flags);
1334
1335         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1336         if (num >= ndomains) {
1337                 spin_unlock_irqrestore(&iommu->lock, flags);
1338                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1339                 return -ENOMEM;
1340         }
1341
1342         domain->id = num;
1343         set_bit(num, iommu->domain_ids);
1344         set_bit(iommu->seq_id, &domain->iommu_bmp);
1345         iommu->domains[num] = domain;
1346         spin_unlock_irqrestore(&iommu->lock, flags);
1347
1348         return 0;
1349 }
1350
1351 static void iommu_detach_domain(struct dmar_domain *domain,
1352                                 struct intel_iommu *iommu)
1353 {
1354         unsigned long flags;
1355         int num, ndomains;
1356         int found = 0;
1357
1358         spin_lock_irqsave(&iommu->lock, flags);
1359         ndomains = cap_ndoms(iommu->cap);
1360         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1361                 if (iommu->domains[num] == domain) {
1362                         found = 1;
1363                         break;
1364                 }
1365         }
1366
1367         if (found) {
1368                 clear_bit(num, iommu->domain_ids);
1369                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1370                 iommu->domains[num] = NULL;
1371         }
1372         spin_unlock_irqrestore(&iommu->lock, flags);
1373 }
1374
1375 static struct iova_domain reserved_iova_list;
1376 static struct lock_class_key reserved_rbtree_key;
1377
1378 static int dmar_init_reserved_ranges(void)
1379 {
1380         struct pci_dev *pdev = NULL;
1381         struct iova *iova;
1382         int i;
1383
1384         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1385
1386         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1387                 &reserved_rbtree_key);
1388
1389         /* IOAPIC ranges shouldn't be accessed by DMA */
1390         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1391                 IOVA_PFN(IOAPIC_RANGE_END));
1392         if (!iova) {
1393                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1394                 return -ENODEV;
1395         }
1396
1397         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1398         for_each_pci_dev(pdev) {
1399                 struct resource *r;
1400
1401                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1402                         r = &pdev->resource[i];
1403                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1404                                 continue;
1405                         iova = reserve_iova(&reserved_iova_list,
1406                                             IOVA_PFN(r->start),
1407                                             IOVA_PFN(r->end));
1408                         if (!iova) {
1409                                 printk(KERN_ERR "Reserve iova failed\n");
1410                                 return -ENODEV;
1411                         }
1412                 }
1413         }
1414         return 0;
1415 }
1416
1417 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1418 {
1419         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1420 }
1421
1422 static inline int guestwidth_to_adjustwidth(int gaw)
1423 {
1424         int agaw;
1425         int r = (gaw - 12) % 9;
1426
1427         if (r == 0)
1428                 agaw = gaw;
1429         else
1430                 agaw = gaw + 9 - r;
1431         if (agaw > 64)
1432                 agaw = 64;
1433         return agaw;
1434 }
1435
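/*
 * Examples: guestwidth_to_adjustwidth(48) == 48 and
 * guestwidth_to_adjustwidth(39) == 39 (both are already 12 plus a multiple
 * of 9), while guestwidth_to_adjustwidth(36) rounds up to 39 because a
 * 36-bit guest width does not land on a page-table level boundary.
 */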
1436 static int domain_init(struct dmar_domain *domain, int guest_width)
1437 {
1438         struct intel_iommu *iommu;
1439         int adjust_width, agaw;
1440         unsigned long sagaw;
1441
1442         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1443         spin_lock_init(&domain->iommu_lock);
1444
1445         domain_reserve_special_ranges(domain);
1446
1447         /* calculate AGAW */
1448         iommu = domain_get_iommu(domain);
1449         if (guest_width > cap_mgaw(iommu->cap))
1450                 guest_width = cap_mgaw(iommu->cap);
1451         domain->gaw = guest_width;
1452         adjust_width = guestwidth_to_adjustwidth(guest_width);
1453         agaw = width_to_agaw(adjust_width);
1454         sagaw = cap_sagaw(iommu->cap);
1455         if (!test_bit(agaw, &sagaw)) {
1456                 /* hardware doesn't support it, choose a bigger one */
1457                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1458                 agaw = find_next_bit(&sagaw, 5, agaw);
1459                 if (agaw >= 5)
1460                         return -ENODEV;
1461         }
1462         domain->agaw = agaw;
1463         INIT_LIST_HEAD(&domain->devices);
1464
1465         if (ecap_coherent(iommu->ecap))
1466                 domain->iommu_coherency = 1;
1467         else
1468                 domain->iommu_coherency = 0;
1469
1470         if (ecap_sc_support(iommu->ecap))
1471                 domain->iommu_snooping = 1;
1472         else
1473                 domain->iommu_snooping = 0;
1474
1475         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1476         domain->iommu_count = 1;
1477         domain->nid = iommu->node;
1478
1479         /* always allocate the top pgd */
1480         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1481         if (!domain->pgd)
1482                 return -ENOMEM;
1483         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1484         return 0;
1485 }
1486
1487 static void domain_exit(struct dmar_domain *domain)
1488 {
1489         struct dmar_drhd_unit *drhd;
1490         struct intel_iommu *iommu;
1491
1492         /* Domain 0 is reserved, so don't process it */
1493         if (!domain)
1494                 return;
1495
1496         /* Flush any lazy unmaps that may reference this domain */
1497         if (!intel_iommu_strict)
1498                 flush_unmaps_timeout(0);
1499
1500         domain_remove_dev_info(domain);
1501         /* destroy iovas */
1502         put_iova_domain(&domain->iovad);
1503
1504         /* clear ptes */
1505         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1506
1507         /* free page tables */
1508         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1509
1510         for_each_active_iommu(iommu, drhd)
1511                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1512                         iommu_detach_domain(domain, iommu);
1513
1514         free_domain_mem(domain);
1515 }
1516
1517 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1518                                  u8 bus, u8 devfn, int translation)
1519 {
1520         struct context_entry *context;
1521         unsigned long flags;
1522         struct intel_iommu *iommu;
1523         struct dma_pte *pgd;
1524         unsigned long num;
1525         unsigned long ndomains;
1526         int id;
1527         int agaw;
1528         struct device_domain_info *info = NULL;
1529
1530         pr_debug("Set context mapping for %02x:%02x.%d\n",
1531                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1532
1533         BUG_ON(!domain->pgd);
1534         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1535                translation != CONTEXT_TT_MULTI_LEVEL);
1536
1537         iommu = device_to_iommu(segment, bus, devfn);
1538         if (!iommu)
1539                 return -ENODEV;
1540
1541         context = device_to_context_entry(iommu, bus, devfn);
1542         if (!context)
1543                 return -ENOMEM;
1544         spin_lock_irqsave(&iommu->lock, flags);
1545         if (context_present(context)) {
1546                 spin_unlock_irqrestore(&iommu->lock, flags);
1547                 return 0;
1548         }
1549
1550         id = domain->id;
1551         pgd = domain->pgd;
1552
1553         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1554             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1555                 int found = 0;
1556
1557                 /* find an available domain id for this device in iommu */
1558                 ndomains = cap_ndoms(iommu->cap);
1559                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1560                         if (iommu->domains[num] == domain) {
1561                                 id = num;
1562                                 found = 1;
1563                                 break;
1564                         }
1565                 }
1566
1567                 if (found == 0) {
1568                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1569                         if (num >= ndomains) {
1570                                 spin_unlock_irqrestore(&iommu->lock, flags);
1571                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1572                                 return -EFAULT;
1573                         }
1574
1575                         set_bit(num, iommu->domain_ids);
1576                         iommu->domains[num] = domain;
1577                         id = num;
1578                 }
1579
1580                 /* Skip top levels of page tables for
1581                  * iommus which have a smaller agaw than the default.
1582                  * Unnecessary for PT mode.
1583                  */
1584                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1585                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1586                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1587                                 if (!dma_pte_present(pgd)) {
1588                                         spin_unlock_irqrestore(&iommu->lock, flags);
1589                                         return -ENOMEM;
1590                                 }
1591                         }
1592                 }
1593         }
1594
1595         context_set_domain_id(context, id);
1596
1597         if (translation != CONTEXT_TT_PASS_THROUGH) {
1598                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1599                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1600                                      CONTEXT_TT_MULTI_LEVEL;
1601         }
1602         /*
1603          * In pass through mode, AW must be programmed to indicate the largest
1604          * AGAW value supported by hardware. And ASR is ignored by hardware.
1605          */
1606         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1607                 context_set_address_width(context, iommu->msagaw);
1608         else {
1609                 context_set_address_root(context, virt_to_phys(pgd));
1610                 context_set_address_width(context, iommu->agaw);
1611         }
1612
1613         context_set_translation_type(context, translation);
1614         context_set_fault_enable(context);
1615         context_set_present(context);
1616         domain_flush_cache(domain, context, sizeof(*context));
1617
1618         /*
1619          * It's a non-present to present mapping. If hardware doesn't cache
1620          * non-present entries, we only need to flush the write-buffer. If it
1621          * _does_ cache non-present entries, then it does so in the special
1622          * domain #0, which we have to flush:
1623          */
1624         if (cap_caching_mode(iommu->cap)) {
1625                 iommu->flush.flush_context(iommu, 0,
1626                                            (((u16)bus) << 8) | devfn,
1627                                            DMA_CCMD_MASK_NOBIT,
1628                                            DMA_CCMD_DEVICE_INVL);
1629                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1630         } else {
1631                 iommu_flush_write_buffer(iommu);
1632         }
1633         iommu_enable_dev_iotlb(info);
1634         spin_unlock_irqrestore(&iommu->lock, flags);
1635
1636         spin_lock_irqsave(&domain->iommu_lock, flags);
1637         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1638                 domain->iommu_count++;
1639                 if (domain->iommu_count == 1)
1640                         domain->nid = iommu->node;
1641                 domain_update_iommu_cap(domain);
1642         }
1643         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1644         return 0;
1645 }
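/*
 * Illustrative example of the agaw-skip loop in domain_context_mapping_one()
 * above, assuming agaw_to_width()/agaw_to_level() as defined earlier in this
 * file (width = 30 + agaw * 9, levels = agaw + 2):
 *
 *   domain->agaw == 2  ->  48-bit address width, 4-level page table
 *   iommu->agaw  == 1  ->  39-bit address width, 3-level page table
 *
 * The loop runs once, descending through entry 0 of the 4-level table so the
 * context entry can be programmed with a 3-level table root that this iommu
 * can actually walk. If that entry is not present, the mapping fails with
 * -ENOMEM rather than handing the iommu an unusable root.
 */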
1646
1647 static int
1648 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1649                         int translation)
1650 {
1651         int ret;
1652         struct pci_dev *tmp, *parent;
1653
1654         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1655                                          pdev->bus->number, pdev->devfn,
1656                                          translation);
1657         if (ret)
1658                 return ret;
1659
1660         /* dependent device mapping */
1661         tmp = pci_find_upstream_pcie_bridge(pdev);
1662         if (!tmp)
1663                 return 0;
1664         /* Secondary interface's bus number and devfn 0 */
1665         parent = pdev->bus->self;
1666         while (parent != tmp) {
1667                 ret = domain_context_mapping_one(domain,
1668                                                  pci_domain_nr(parent->bus),
1669                                                  parent->bus->number,
1670                                                  parent->devfn, translation);
1671                 if (ret)
1672                         return ret;
1673                 parent = parent->bus->self;
1674         }
1675         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1676                 return domain_context_mapping_one(domain,
1677                                         pci_domain_nr(tmp->subordinate),
1678                                         tmp->subordinate->number, 0,
1679                                         translation);
1680         else /* this is a legacy PCI bridge */
1681                 return domain_context_mapping_one(domain,
1682                                                   pci_domain_nr(tmp->bus),
1683                                                   tmp->bus->number,
1684                                                   tmp->devfn,
1685                                                   translation);
1686 }
1687
1688 static int domain_context_mapped(struct pci_dev *pdev)
1689 {
1690         int ret;
1691         struct pci_dev *tmp, *parent;
1692         struct intel_iommu *iommu;
1693
1694         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1695                                 pdev->devfn);
1696         if (!iommu)
1697                 return -ENODEV;
1698
1699         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1700         if (!ret)
1701                 return ret;
1702         /* dependent device mapping */
1703         tmp = pci_find_upstream_pcie_bridge(pdev);
1704         if (!tmp)
1705                 return ret;
1706         /* Secondary interface's bus number and devfn 0 */
1707         parent = pdev->bus->self;
1708         while (parent != tmp) {
1709                 ret = device_context_mapped(iommu, parent->bus->number,
1710                                             parent->devfn);
1711                 if (!ret)
1712                         return ret;
1713                 parent = parent->bus->self;
1714         }
1715         if (pci_is_pcie(tmp))
1716                 return device_context_mapped(iommu, tmp->subordinate->number,
1717                                              0);
1718         else
1719                 return device_context_mapped(iommu, tmp->bus->number,
1720                                              tmp->devfn);
1721 }
1722
1723 /* Returns a number of VTD pages, but aligned to MM page size */
1724 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1725                                             size_t size)
1726 {
1727         host_addr &= ~PAGE_MASK;
1728         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1729 }
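/*
 * Worked example for aligned_nrpages(), assuming 4KiB pages so that
 * PAGE_SHIFT == VTD_PAGE_SHIFT (values are illustrative):
 *
 *   aligned_nrpages(0x1234, 0x2000)
 *     host_addr &= ~PAGE_MASK     -> 0x234 (offset within the first page)
 *     PAGE_ALIGN(0x234 + 0x2000)  -> 0x3000
 *     0x3000 >> VTD_PAGE_SHIFT    -> 3 pages
 *
 * i.e. an 8KiB buffer that starts 0x234 bytes into a page straddles three
 * 4KiB VT-d pages. With larger MM pages the result is still expressed in
 * VT-d (4KiB) pages, rounded up to a whole MM page.
 */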
1730
1731 /* Return largest possible superpage level for a given mapping */
1732 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1733                                           unsigned long iov_pfn,
1734                                           unsigned long phy_pfn,
1735                                           unsigned long pages)
1736 {
1737         int support, level = 1;
1738         unsigned long pfnmerge;
1739
1740         support = domain->iommu_superpage;
1741
1742         /* To use a large page, the virtual *and* physical addresses
1743            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1744            of them will mean we have to use smaller pages. So just
1745            merge them and check both at once. */
1746         pfnmerge = iov_pfn | phy_pfn;
1747
1748         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1749                 pages >>= VTD_STRIDE_SHIFT;
1750                 if (!pages)
1751                         break;
1752                 pfnmerge >>= VTD_STRIDE_SHIFT;
1753                 level++;
1754                 support--;
1755         }
1756         return level;
1757 }
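/*
 * Worked example of how the loop above picks a superpage level, assuming a
 * 9-bit stride (VTD_STRIDE_SHIFT == 9, 512 entries per level); the values
 * below are illustrative:
 *
 *   iommu_superpage == 1 (2MiB capable), iov_pfn == 0x200, phy_pfn == 0x800,
 *   pages == 0x400:
 *     pfnmerge = 0x200 | 0x800 = 0xa00, low 9 bits clear -> both 2MiB aligned
 *     pages >>= 9 -> 2 (at least one full 2MiB chunk remains)
 *     level = 2, support drops to 0, loop ends -> use 2MiB superpages
 *
 * If either pfn were not a multiple of 512, or fewer than 512 pages remained,
 * the loop would stop at level 1 and plain 4KiB pages would be used.
 */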
1758
1759 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1760                             struct scatterlist *sg, unsigned long phys_pfn,
1761                             unsigned long nr_pages, int prot)
1762 {
1763         struct dma_pte *first_pte = NULL, *pte = NULL;
1764         phys_addr_t uninitialized_var(pteval);
1765         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1766         unsigned long sg_res = 0;
1767         unsigned int largepage_lvl = 0;
1768         unsigned long lvl_pages = 0;
1769
1770         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1771
1772         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1773                 return -EINVAL;
1774
1775         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1776
1777         if (!sg) {
1778                 sg_res = nr_pages;
1779                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1780         }
1781
1782         while (nr_pages > 0) {
1783                 uint64_t tmp;
1784
1785                 if (!sg_res) {
1786                         sg_res = aligned_nrpages(sg->offset, sg->length);
1787                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1788                         sg->dma_length = sg->length;
1789                         pteval = page_to_phys(sg_page(sg)) | prot;
1790                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1791                 }
1792
1793                 if (!pte) {
1794                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1795
1796                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1797                         if (!pte)
1798                                 return -ENOMEM;
1799                         /* It is a large page */
1800                         if (largepage_lvl > 1) {
1801                                 pteval |= DMA_PTE_LARGE_PAGE;
1802                                 /* Ensure that old small page tables are removed to make room
1803                                    for superpage, if they exist. */
1804                                 dma_pte_clear_range(domain, iov_pfn,
1805                                                     iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1806                                 dma_pte_free_pagetable(domain, iov_pfn,
1807                                                        iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1808                         } else {
1809                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1810                         }
1811
1812                 }
1813                 /* We don't need a lock here; nobody else
1814                  * touches this iova range
1815                  */
1816                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1817                 if (tmp) {
1818                         static int dumps = 5;
1819                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1820                                iov_pfn, tmp, (unsigned long long)pteval);
1821                         if (dumps) {
1822                                 dumps--;
1823                                 debug_dma_dump_mappings(NULL);
1824                         }
1825                         WARN_ON(1);
1826                 }
1827
1828                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1829
1830                 BUG_ON(nr_pages < lvl_pages);
1831                 BUG_ON(sg_res < lvl_pages);
1832
1833                 nr_pages -= lvl_pages;
1834                 iov_pfn += lvl_pages;
1835                 phys_pfn += lvl_pages;
1836                 pteval += lvl_pages * VTD_PAGE_SIZE;
1837                 sg_res -= lvl_pages;
1838
1839                 /* If the next PTE would be the first in a new page, then we
1840                    need to flush the cache on the entries we've just written.
1841                    And then we'll need to recalculate 'pte', so clear it and
1842                    let it get set again in the if (!pte) block above.
1843
1844                    If we're done (!nr_pages) we need to flush the cache too.
1845
1846                    Also if we've been setting superpages, we may need to
1847                    recalculate 'pte' and switch back to smaller pages for the
1848                    end of the mapping, if the trailing size is not enough to
1849                    use another superpage (i.e. sg_res < lvl_pages). */
1850                 pte++;
1851                 if (!nr_pages || first_pte_in_page(pte) ||
1852                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1853                         domain_flush_cache(domain, first_pte,
1854                                            (void *)pte - (void *)first_pte);
1855                         pte = NULL;
1856                 }
1857
1858                 if (!sg_res && nr_pages)
1859                         sg = sg_next(sg);
1860         }
1861         return 0;
1862 }
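/*
 * Note on __domain_mapping() above: dma_pte_clear_range() and
 * dma_pte_free_pagetable() take an *inclusive* last pfn, which is why the
 * superpage path clears up to iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1
 * (512 4KiB pages for a 2MiB superpage, i.e. lvl_to_nr_pages(2) == 512).
 * Each loop iteration then advances iov_pfn/phys_pfn/pteval by lvl_pages,
 * and the accumulated PTEs are flushed whenever the next PTE would start a
 * new page-table page, the mapping is complete, or the tail of the range no
 * longer fills a whole superpage and we drop back to smaller pages.
 */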
1863
1864 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1865                                     struct scatterlist *sg, unsigned long nr_pages,
1866                                     int prot)
1867 {
1868         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1869 }
1870
1871 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1872                                      unsigned long phys_pfn, unsigned long nr_pages,
1873                                      int prot)
1874 {
1875         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1876 }
1877
1878 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1879 {
1880         if (!iommu)
1881                 return;
1882
1883         clear_context_table(iommu, bus, devfn);
1884         iommu->flush.flush_context(iommu, 0, 0, 0,
1885                                            DMA_CCMD_GLOBAL_INVL);
1886         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1887 }
1888
1889 static void domain_remove_dev_info(struct dmar_domain *domain)
1890 {
1891         struct device_domain_info *info;
1892         unsigned long flags;
1893         struct intel_iommu *iommu;
1894
1895         spin_lock_irqsave(&device_domain_lock, flags);
1896         while (!list_empty(&domain->devices)) {
1897                 info = list_entry(domain->devices.next,
1898                         struct device_domain_info, link);
1899                 list_del(&info->link);
1900                 list_del(&info->global);
1901                 if (info->dev)
1902                         info->dev->dev.archdata.iommu = NULL;
1903                 spin_unlock_irqrestore(&device_domain_lock, flags);
1904
1905                 iommu_disable_dev_iotlb(info);
1906                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1907                 iommu_detach_dev(iommu, info->bus, info->devfn);
1908                 free_devinfo_mem(info);
1909
1910                 spin_lock_irqsave(&device_domain_lock, flags);
1911         }
1912         spin_unlock_irqrestore(&device_domain_lock, flags);
1913 }
1914
1915 /*
1916  * find_domain
1917  * Note: struct pci_dev->dev.archdata.iommu is used to store the info
1918  */
1919 static struct dmar_domain *
1920 find_domain(struct pci_dev *pdev)
1921 {
1922         struct device_domain_info *info;
1923
1924         /* No lock here, assumes no domain exit in normal case */
1925         info = pdev->dev.archdata.iommu;
1926         if (info)
1927                 return info->domain;
1928         return NULL;
1929 }
1930
1931 /* domain is initialized */
1932 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1933 {
1934         struct dmar_domain *domain, *found = NULL;
1935         struct intel_iommu *iommu;
1936         struct dmar_drhd_unit *drhd;
1937         struct device_domain_info *info, *tmp;
1938         struct pci_dev *dev_tmp;
1939         unsigned long flags;
1940         int bus = 0, devfn = 0;
1941         int segment;
1942         int ret;
1943
1944         domain = find_domain(pdev);
1945         if (domain)
1946                 return domain;
1947
1948         segment = pci_domain_nr(pdev->bus);
1949
1950         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1951         if (dev_tmp) {
1952                 if (pci_is_pcie(dev_tmp)) {
1953                         bus = dev_tmp->subordinate->number;
1954                         devfn = 0;
1955                 } else {
1956                         bus = dev_tmp->bus->number;
1957                         devfn = dev_tmp->devfn;
1958                 }
1959                 spin_lock_irqsave(&device_domain_lock, flags);
1960                 list_for_each_entry(info, &device_domain_list, global) {
1961                         if (info->segment == segment &&
1962                             info->bus == bus && info->devfn == devfn) {
1963                                 found = info->domain;
1964                                 break;
1965                         }
1966                 }
1967                 spin_unlock_irqrestore(&device_domain_lock, flags);
1968                 /* pcie-pci bridge already has a domain, use it */
1969                 if (found) {
1970                         domain = found;
1971                         goto found_domain;
1972                 }
1973         }
1974
1975         domain = alloc_domain();
1976         if (!domain)
1977                 goto error;
1978
1979         /* Allocate new domain for the device */
1980         drhd = dmar_find_matched_drhd_unit(pdev);
1981         if (!drhd) {
1982                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1983                         pci_name(pdev));
1984                 return NULL;
1985         }
1986         iommu = drhd->iommu;
1987
1988         ret = iommu_attach_domain(domain, iommu);
1989         if (ret) {
1990                 free_domain_mem(domain);
1991                 goto error;
1992         }
1993
1994         if (domain_init(domain, gaw)) {
1995                 domain_exit(domain);
1996                 goto error;
1997         }
1998
1999         /* register pcie-to-pci device */
2000         if (dev_tmp) {
2001                 info = alloc_devinfo_mem();
2002                 if (!info) {
2003                         domain_exit(domain);
2004                         goto error;
2005                 }
2006                 info->segment = segment;
2007                 info->bus = bus;
2008                 info->devfn = devfn;
2009                 info->dev = NULL;
2010                 info->domain = domain;
2011                 /* This domain is shared by devices under p2p bridge */
2012                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2013
2014                 /* pcie-to-pci bridge already has a domain, use it */
2015                 found = NULL;
2016                 spin_lock_irqsave(&device_domain_lock, flags);
2017                 list_for_each_entry(tmp, &device_domain_list, global) {
2018                         if (tmp->segment == segment &&
2019                             tmp->bus == bus && tmp->devfn == devfn) {
2020                                 found = tmp->domain;
2021                                 break;
2022                         }
2023                 }
2024                 if (found) {
2025                         spin_unlock_irqrestore(&device_domain_lock, flags);
2026                         free_devinfo_mem(info);
2027                         domain_exit(domain);
2028                         domain = found;
2029                 } else {
2030                         list_add(&info->link, &domain->devices);
2031                         list_add(&info->global, &device_domain_list);
2032                         spin_unlock_irqrestore(&device_domain_lock, flags);
2033                 }
2034         }
2035
2036 found_domain:
2037         info = alloc_devinfo_mem();
2038         if (!info)
2039                 goto error;
2040         info->segment = segment;
2041         info->bus = pdev->bus->number;
2042         info->devfn = pdev->devfn;
2043         info->dev = pdev;
2044         info->domain = domain;
2045         spin_lock_irqsave(&device_domain_lock, flags);
2046         /* somebody is fast */
2047         found = find_domain(pdev);
2048         if (found != NULL) {
2049                 spin_unlock_irqrestore(&device_domain_lock, flags);
2050                 if (found != domain) {
2051                         domain_exit(domain);
2052                         domain = found;
2053                 }
2054                 free_devinfo_mem(info);
2055                 return domain;
2056         }
2057         list_add(&info->link, &domain->devices);
2058         list_add(&info->global, &device_domain_list);
2059         pdev->dev.archdata.iommu = info;
2060         spin_unlock_irqrestore(&device_domain_lock, flags);
2061         return domain;
2062 error:
2063         /* recheck it here, maybe others set it */
2064         return find_domain(pdev);
2065 }
2066
2067 static int iommu_identity_mapping;
2068 #define IDENTMAP_ALL            1
2069 #define IDENTMAP_GFX            2
2070 #define IDENTMAP_AZALIA         4
2071
2072 static int iommu_domain_identity_map(struct dmar_domain *domain,
2073                                      unsigned long long start,
2074                                      unsigned long long end)
2075 {
2076         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2077         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2078
2079         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2080                           dma_to_mm_pfn(last_vpfn))) {
2081                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2082                 return -ENOMEM;
2083         }
2084
2085         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2086                  start, end, domain->id);
2087         /*
2088          * RMRR range might have overlap with physical memory range,
2089          * clear it first
2090          */
2091         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2092
2093         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2094                                   last_vpfn - first_vpfn + 1,
2095                                   DMA_PTE_READ|DMA_PTE_WRITE);
2096 }
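/*
 * Worked example for iommu_domain_identity_map(), assuming 4KiB pages so
 * that dma_to_mm_pfn() is an identity conversion; the RMRR values are
 * illustrative, not taken from real firmware. For a 1MiB region at
 * 0xd0000000:
 *
 *   start = 0xd0000000, end = 0xd00fffff
 *   first_vpfn = 0xd0000, last_vpfn = 0xd00ff
 *   pages mapped = last_vpfn - first_vpfn + 1 = 0x100 (256 4KiB pages)
 *
 * The iova range is reserved first so the DMA API allocator can never hand
 * out addresses that overlap the identity-mapped region.
 */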
2097
2098 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2099                                       unsigned long long start,
2100                                       unsigned long long end)
2101 {
2102         struct dmar_domain *domain;
2103         int ret;
2104
2105         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2106         if (!domain)
2107                 return -ENOMEM;
2108
2109         /* For _hardware_ passthrough, don't bother. But for software
2110            passthrough, we do it anyway -- it may indicate a memory
2111            range which is reserved in E820 and so didn't get set
2112            up to start with in si_domain */
2113         if (domain == si_domain && hw_pass_through) {
2114                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2115                        pci_name(pdev), start, end);
2116                 return 0;
2117         }
2118
2119         printk(KERN_INFO
2120                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2121                pci_name(pdev), start, end);
2122
2123         if (end < start) {
2124                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2125                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2126                         dmi_get_system_info(DMI_BIOS_VENDOR),
2127                         dmi_get_system_info(DMI_BIOS_VERSION),
2128                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2129                 ret = -EIO;
2130                 goto error;
2131         }
2132
2133         if (end >> agaw_to_width(domain->agaw)) {
2134                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2135                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2136                      agaw_to_width(domain->agaw),
2137                      dmi_get_system_info(DMI_BIOS_VENDOR),
2138                      dmi_get_system_info(DMI_BIOS_VERSION),
2139                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2140                 ret = -EIO;
2141                 goto error;
2142         }
2143
2144         ret = iommu_domain_identity_map(domain, start, end);
2145         if (ret)
2146                 goto error;
2147
2148         /* context entry init */
2149         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2150         if (ret)
2151                 goto error;
2152
2153         return 0;
2154
2155  error:
2156         domain_exit(domain);
2157         return ret;
2158 }
2159
2160 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2161         struct pci_dev *pdev)
2162 {
2163         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2164                 return 0;
2165         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2166                 rmrr->end_address);
2167 }
2168
2169 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2170 static inline void iommu_prepare_isa(void)
2171 {
2172         struct pci_dev *pdev;
2173         int ret;
2174
2175         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2176         if (!pdev)
2177                 return;
2178
2179         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2180         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2181
2182         if (ret)
2183                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2184                        "floppy might not work\n");
2185
2186 }
2187 #else
2188 static inline void iommu_prepare_isa(void)
2189 {
2190         return;
2191 }
2192 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
2193
2194 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2195
2196 static int __init si_domain_work_fn(unsigned long start_pfn,
2197                                     unsigned long end_pfn, void *datax)
2198 {
2199         int *ret = datax;
2200
2201         *ret = iommu_domain_identity_map(si_domain,
2202                                          (uint64_t)start_pfn << PAGE_SHIFT,
2203                                          (uint64_t)end_pfn << PAGE_SHIFT);
2204         return *ret;
2205
2206 }
2207
2208 static int __init si_domain_init(int hw)
2209 {
2210         struct dmar_drhd_unit *drhd;
2211         struct intel_iommu *iommu;
2212         int nid, ret = 0;
2213
2214         si_domain = alloc_domain();
2215         if (!si_domain)
2216                 return -EFAULT;
2217
2218         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2219
2220         for_each_active_iommu(iommu, drhd) {
2221                 ret = iommu_attach_domain(si_domain, iommu);
2222                 if (ret) {
2223                         domain_exit(si_domain);
2224                         return -EFAULT;
2225                 }
2226         }
2227
2228         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2229                 domain_exit(si_domain);
2230                 return -EFAULT;
2231         }
2232
2233         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2234
2235         if (hw)
2236                 return 0;
2237
2238         for_each_online_node(nid) {
2239                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2240                 if (ret)
2241                         return ret;
2242         }
2243
2244         return 0;
2245 }
2246
2247 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2248                                           struct pci_dev *pdev);
2249 static int identity_mapping(struct pci_dev *pdev)
2250 {
2251         struct device_domain_info *info;
2252
2253         if (likely(!iommu_identity_mapping))
2254                 return 0;
2255
2256         info = pdev->dev.archdata.iommu;
2257         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2258                 return (info->domain == si_domain);
2259
2260         return 0;
2261 }
2262
2263 static int domain_add_dev_info(struct dmar_domain *domain,
2264                                struct pci_dev *pdev,
2265                                int translation)
2266 {
2267         struct device_domain_info *info;
2268         unsigned long flags;
2269         int ret;
2270
2271         info = alloc_devinfo_mem();
2272         if (!info)
2273                 return -ENOMEM;
2274
2275         info->segment = pci_domain_nr(pdev->bus);
2276         info->bus = pdev->bus->number;
2277         info->devfn = pdev->devfn;
2278         info->dev = pdev;
2279         info->domain = domain;
2280
2281         spin_lock_irqsave(&device_domain_lock, flags);
2282         list_add(&info->link, &domain->devices);
2283         list_add(&info->global, &device_domain_list);
2284         pdev->dev.archdata.iommu = info;
2285         spin_unlock_irqrestore(&device_domain_lock, flags);
2286
2287         ret = domain_context_mapping(domain, pdev, translation);
2288         if (ret) {
2289                 spin_lock_irqsave(&device_domain_lock, flags);
2290                 list_del(&info->link);
2291                 list_del(&info->global);
2292                 pdev->dev.archdata.iommu = NULL;
2293                 spin_unlock_irqrestore(&device_domain_lock, flags);
2294                 free_devinfo_mem(info);
2295                 return ret;
2296         }
2297
2298         return 0;
2299 }
2300
2301 static bool device_has_rmrr(struct pci_dev *dev)
2302 {
2303         struct dmar_rmrr_unit *rmrr;
2304         int i;
2305
2306         for_each_rmrr_units(rmrr) {
2307                 for (i = 0; i < rmrr->devices_cnt; i++) {
2308                         /*
2309                          * Return TRUE if this RMRR contains the device that
2310                          * is passed in.
2311                          */
2312                         if (rmrr->devices[i] == dev)
2313                                 return true;
2314                 }
2315         }
2316         return false;
2317 }
2318
2319 /*
2320  * There are a couple cases where we need to restrict the functionality of
2321  * devices associated with RMRRs.  The first is when evaluating a device for
2322  * identity mapping because problems exist when devices are moved in and out
2323  * of domains and their respective RMRR information is lost.  This means that
2324  * a device with associated RMRRs will never be in a "passthrough" domain.
2325  * The second is use of the device through the IOMMU API.  This interface
2326  * expects to have full control of the IOVA space for the device.  We cannot
2327  * satisfy both the requirement that RMRR access is maintained and have an
2328  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2329  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2330  * We therefore prevent devices associated with an RMRR from participating in
2331  * the IOMMU API, which eliminates them from device assignment.
2332  *
2333  * In both cases we assume that PCI USB devices with RMRRs have them largely
2334  * for historical reasons and that the RMRR space is not actively used post
2335  * boot.  This exclusion may change if vendors begin to abuse it.
2336  */
2337 static bool device_is_rmrr_locked(struct pci_dev *pdev)
2338 {
2339         return device_has_rmrr(pdev) &&
2340                 (pdev->class >> 8) != PCI_CLASS_SERIAL_USB;
2341 }
2342
2343 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2344 {
2345
2346         if (device_is_rmrr_locked(pdev))
2347                 return 0;
2348
2349         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2350                 return 1;
2351
2352         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2353                 return 1;
2354
2355         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2356                 return 0;
2357
2358         /*
2359          * We want to start off with all devices in the 1:1 domain, and
2360          * take them out later if we find they can't access all of memory.
2361          *
2362          * However, we can't do this for PCI devices behind bridges,
2363          * because all PCI devices behind the same bridge will end up
2364          * with the same source-id on their transactions.
2365          *
2366          * Practically speaking, we can't change things around for these
2367          * devices at run-time, because we can't be sure there'll be no
2368          * DMA transactions in flight for any of their siblings.
2369          *
2370          * So PCI devices (unless they're on the root bus) as well as
2371          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2372          * the 1:1 domain, just in _case_ one of their siblings turns out
2373          * not to be able to map all of memory.
2374          */
2375         if (!pci_is_pcie(pdev)) {
2376                 if (!pci_is_root_bus(pdev->bus))
2377                         return 0;
2378                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2379                         return 0;
2380         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2381                 return 0;
2382
2383         /*
2384          * At boot time, we don't yet know if devices will be 64-bit capable.
2385          * Assume that they will -- if they turn out not to be, then we can
2386          * take them out of the 1:1 domain later.
2387          */
2388         if (!startup) {
2389                 /*
2390                  * If the device's dma_mask is less than the system's memory
2391                  * size then this is not a candidate for identity mapping.
2392                  */
2393                 u64 dma_mask = pdev->dma_mask;
2394
2395                 if (pdev->dev.coherent_dma_mask &&
2396                     pdev->dev.coherent_dma_mask < dma_mask)
2397                         dma_mask = pdev->dev.coherent_dma_mask;
2398
2399                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2400         }
2401
2402         return 1;
2403 }
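/*
 * Illustration of the run-time (!startup) check above, with assumed numbers:
 * on a machine with, say, 8GiB of RAM, dma_get_required_mask() exceeds
 * 32 bits; a device whose dma_mask (or coherent_dma_mask) is only
 * DMA_BIT_MASK(32) therefore returns 0 here and is later taken out of the
 * 1:1 domain by iommu_no_mapping(), while a fully 64-bit-capable device
 * stays identity mapped.
 */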
2404
2405 static int __init iommu_prepare_static_identity_mapping(int hw)
2406 {
2407         struct pci_dev *pdev = NULL;
2408         int ret;
2409
2410         ret = si_domain_init(hw);
2411         if (ret)
2412                 return -EFAULT;
2413
2414         for_each_pci_dev(pdev) {
2415                 /* Skip Host/PCI Bridge devices */
2416                 if (IS_BRIDGE_HOST_DEVICE(pdev))
2417                         continue;
2418                 if (iommu_should_identity_map(pdev, 1)) {
2419                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2420                                hw ? "hardware" : "software", pci_name(pdev));
2421
2422                         ret = domain_add_dev_info(si_domain, pdev,
2423                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2424                                                      CONTEXT_TT_MULTI_LEVEL);
2425                         if (ret)
2426                                 return ret;
2427                 }
2428         }
2429
2430         return 0;
2431 }
2432
2433 static int __init init_dmars(void)
2434 {
2435         struct dmar_drhd_unit *drhd;
2436         struct dmar_rmrr_unit *rmrr;
2437         struct pci_dev *pdev;
2438         struct intel_iommu *iommu;
2439         int i, ret;
2440
2441         /*
2442          * for each drhd
2443          *    allocate root
2444          *    initialize and program root entry to not present
2445          * endfor
2446          */
2447         for_each_drhd_unit(drhd) {
2448                 g_num_of_iommus++;
2449                 /*
2450                  * lock not needed as this is only incremented in the single
2451                  * threaded kernel __init code path all other access are read
2452                  * only
2453                  */
2454         }
2455
2456         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2457                         GFP_KERNEL);
2458         if (!g_iommus) {
2459                 printk(KERN_ERR "Allocating global iommu array failed\n");
2460                 ret = -ENOMEM;
2461                 goto error;
2462         }
2463
2464         deferred_flush = kzalloc(g_num_of_iommus *
2465                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2466         if (!deferred_flush) {
2467                 ret = -ENOMEM;
2468                 goto error;
2469         }
2470
2471         for_each_drhd_unit(drhd) {
2472                 if (drhd->ignored)
2473                         continue;
2474
2475                 iommu = drhd->iommu;
2476                 g_iommus[iommu->seq_id] = iommu;
2477
2478                 ret = iommu_init_domains(iommu);
2479                 if (ret)
2480                         goto error;
2481
2482                 /*
2483                  * TBD:
2484                  * we could share the same root & context tables
2485                  * among all IOMMU's. Need to Split it later.
2486                  */
2487                 ret = iommu_alloc_root_entry(iommu);
2488                 if (ret) {
2489                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2490                         goto error;
2491                 }
2492                 if (!ecap_pass_through(iommu->ecap))
2493                         hw_pass_through = 0;
2494         }
2495
2496         /*
2497          * Start from the sane iommu hardware state.
2498          * Start from a sane iommu hardware state.
2499         for_each_drhd_unit(drhd) {
2500                 if (drhd->ignored)
2501                         continue;
2502
2503                 iommu = drhd->iommu;
2504
2505                 /*
2506                  * If the queued invalidation is already initialized by us
2507                  * (for example, while enabling interrupt-remapping) then
2508          * things are already rolling from a sane state.
2509                  */
2510                 if (iommu->qi)
2511                         continue;
2512
2513                 /*
2514                  * Clear any previous faults.
2515                  */
2516                 dmar_fault(-1, iommu);
2517                 /*
2518                  * Disable queued invalidation if supported and already enabled
2519                  * before OS handover.
2520                  */
2521                 dmar_disable_qi(iommu);
2522         }
2523
2524         for_each_drhd_unit(drhd) {
2525                 if (drhd->ignored)
2526                         continue;
2527
2528                 iommu = drhd->iommu;
2529
2530                 if (dmar_enable_qi(iommu)) {
2531                         /*
2532                          * Queued Invalidate not enabled, use Register Based
2533                          * Invalidate
2534                          */
2535                         iommu->flush.flush_context = __iommu_flush_context;
2536                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2537                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2538                                "invalidation\n",
2539                                 iommu->seq_id,
2540                                (unsigned long long)drhd->reg_base_addr);
2541                 } else {
2542                         iommu->flush.flush_context = qi_flush_context;
2543                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2544                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2545                                "invalidation\n",
2546                                 iommu->seq_id,
2547                                (unsigned long long)drhd->reg_base_addr);
2548                 }
2549         }
2550
2551         if (iommu_pass_through)
2552                 iommu_identity_mapping |= IDENTMAP_ALL;
2553
2554 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2555         iommu_identity_mapping |= IDENTMAP_GFX;
2556 #endif
2557
2558         check_tylersburg_isoch();
2559
2560          * If pass through is not set or not enabled, set up context entries for
2561          * identity mappings for rmrr, gfx and isa, possibly falling back to
2562          * static identity mapping if iommu_identity_mapping is set.
2563          * identity mapping if iommu_identity_mapping is set.
2564          */
2565         if (iommu_identity_mapping) {
2566                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2567                 if (ret) {
2568                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2569                         goto error;
2570                 }
2571         }
2572         /*
2573          * For each rmrr
2574          *   for each dev attached to rmrr
2575          *   do
2576          *     locate drhd for dev, alloc domain for dev
2577          *     allocate free domain
2578          *     allocate page table entries for rmrr
2579          *     if context not allocated for bus
2580          *           allocate and init context
2581          *           set present in root table for this bus
2582          *     init context with domain, translation etc
2583          *    endfor
2584          * endfor
2585          */
2586         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2587         for_each_rmrr_units(rmrr) {
2588                 for (i = 0; i < rmrr->devices_cnt; i++) {
2589                         pdev = rmrr->devices[i];
2590                         /*
2591                          * Some BIOSes list non-existent devices in the
2592                          * DMAR table.
2593                          */
2594                         if (!pdev)
2595                                 continue;
2596                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2597                         if (ret)
2598                                 printk(KERN_ERR
2599                                        "IOMMU: mapping reserved region failed\n");
2600                 }
2601         }
2602
2603         iommu_prepare_isa();
2604
2605         /*
2606          * for each drhd
2607          *   enable fault log
2608          *   global invalidate context cache
2609          *   global invalidate iotlb
2610          *   enable translation
2611          */
2612         for_each_drhd_unit(drhd) {
2613                 if (drhd->ignored) {
2614                         /*
2615                          * we always have to disable PMRs or DMA may fail on
2616                          * this device
2617                          */
2618                         if (force_on)
2619                                 iommu_disable_protect_mem_regions(drhd->iommu);
2620                         continue;
2621                 }
2622                 iommu = drhd->iommu;
2623
2624                 iommu_flush_write_buffer(iommu);
2625
2626                 ret = dmar_set_interrupt(iommu);
2627                 if (ret)
2628                         goto error;
2629
2630                 iommu_set_root_entry(iommu);
2631
2632                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2633                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2634
2635                 ret = iommu_enable_translation(iommu);
2636                 if (ret)
2637                         goto error;
2638
2639                 iommu_disable_protect_mem_regions(iommu);
2640         }
2641
2642         return 0;
2643 error:
2644         for_each_drhd_unit(drhd) {
2645                 if (drhd->ignored)
2646                         continue;
2647                 iommu = drhd->iommu;
2648                 free_iommu(iommu);
2649         }
2650         kfree(g_iommus);
2651         return ret;
2652 }
2653
2654 /* This takes a number of _MM_ pages, not VTD pages */
2655 static struct iova *intel_alloc_iova(struct device *dev,
2656                                      struct dmar_domain *domain,
2657                                      unsigned long nrpages, uint64_t dma_mask)
2658 {
2659         struct pci_dev *pdev = to_pci_dev(dev);
2660         struct iova *iova = NULL;
2661
2662         /* Restrict dma_mask to the width that the iommu can handle */
2663         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2664
2665         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2666                 /*
2667                  * First try to allocate an io virtual address in
2668                  * DMA_BIT_MASK(32) and if that fails then try allocating
2669                  * from higher range
2670                  */
2671                 iova = alloc_iova(&domain->iovad, nrpages,
2672                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2673                 if (iova)
2674                         return iova;
2675         }
2676         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2677         if (unlikely(!iova)) {
2678                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2679                        nrpages, pci_name(pdev));
2680                 return NULL;
2681         }
2682
2683         return iova;
2684 }
2685
2686 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2687 {
2688         struct dmar_domain *domain;
2689         int ret;
2690
2691         domain = get_domain_for_dev(pdev,
2692                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2693         if (!domain) {
2694                 printk(KERN_ERR
2695                         "Allocating domain for %s failed", pci_name(pdev));
2696                 return NULL;
2697         }
2698
2699         /* make sure context mapping is ok */
2700         if (unlikely(!domain_context_mapped(pdev))) {
2701                 ret = domain_context_mapping(domain, pdev,
2702                                              CONTEXT_TT_MULTI_LEVEL);
2703                 if (ret) {
2704                         printk(KERN_ERR
2705                                 "Domain context map for %s failed",
2706                                 pci_name(pdev));
2707                         return NULL;
2708                 }
2709         }
2710
2711         return domain;
2712 }
2713
2714 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2715 {
2716         struct device_domain_info *info;
2717
2718         /* No lock here, assumes no domain exit in normal case */
2719         info = dev->dev.archdata.iommu;
2720         if (likely(info))
2721                 return info->domain;
2722
2723         return __get_valid_domain_for_dev(dev);
2724 }
2725
2726 static int iommu_dummy(struct pci_dev *pdev)
2727 {
2728         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2729 }
2730
2731 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2732 static int iommu_no_mapping(struct device *dev)
2733 {
2734         struct pci_dev *pdev;
2735         int found;
2736
2737         if (unlikely(dev->bus != &pci_bus_type))
2738                 return 1;
2739
2740         pdev = to_pci_dev(dev);
2741         if (iommu_dummy(pdev))
2742                 return 1;
2743
2744         if (!iommu_identity_mapping)
2745                 return 0;
2746
2747         found = identity_mapping(pdev);
2748         if (found) {
2749                 if (iommu_should_identity_map(pdev, 0))
2750                         return 1;
2751                 else {
2752                         /*
2753                          * The 32 bit DMA device is removed from si_domain
2754                          * and falls back to non-identity mapping.
2755                          */
2756                         domain_remove_one_dev_info(si_domain, pdev);
2757                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2758                                pci_name(pdev));
2759                         return 0;
2760                 }
2761         } else {
2762                 /*
2763                  * In case a 64 bit DMA device is detached from a VM, the
2764                  * device is put into si_domain for identity mapping.
2765                  */
2766                 if (iommu_should_identity_map(pdev, 0)) {
2767                         int ret;
2768                         ret = domain_add_dev_info(si_domain, pdev,
2769                                                   hw_pass_through ?
2770                                                   CONTEXT_TT_PASS_THROUGH :
2771                                                   CONTEXT_TT_MULTI_LEVEL);
2772                         if (!ret) {
2773                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2774                                        pci_name(pdev));
2775                                 return 1;
2776                         }
2777                 }
2778         }
2779
2780         return 0;
2781 }
2782
2783 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2784                                      size_t size, int dir, u64 dma_mask)
2785 {
2786         struct pci_dev *pdev = to_pci_dev(hwdev);
2787         struct dmar_domain *domain;
2788         phys_addr_t start_paddr;
2789         struct iova *iova;
2790         int prot = 0;
2791         int ret;
2792         struct intel_iommu *iommu;
2793         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2794
2795         BUG_ON(dir == DMA_NONE);
2796
2797         if (iommu_no_mapping(hwdev))
2798                 return paddr;
2799
2800         domain = get_valid_domain_for_dev(pdev);
2801         if (!domain)
2802                 return 0;
2803
2804         iommu = domain_get_iommu(domain);
2805         size = aligned_nrpages(paddr, size);
2806
2807         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2808         if (!iova)
2809                 goto error;
2810
2811         /*
2812          * Check if DMAR supports zero-length reads on write only
2813          * mappings.
2814          */
2815         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2816                         !cap_zlr(iommu->cap))
2817                 prot |= DMA_PTE_READ;
2818         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2819                 prot |= DMA_PTE_WRITE;
2820         /*
2821          * paddr to (paddr + size) might be a partial page; we should map the
2822          * whole page.  Note: if two parts of one page are mapped separately,
2823          * we might have two guest_addr mappings to the same host paddr, but
2824          * this is not a big problem.
2825          */
2826         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2827                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2828         if (ret)
2829                 goto error;
2830
2831         /* it's a non-present to present mapping. Only flush if caching mode */
2832         if (cap_caching_mode(iommu->cap))
2833                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2834         else
2835                 iommu_flush_write_buffer(iommu);
2836
2837         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2838         start_paddr += paddr & ~PAGE_MASK;
2839         return start_paddr;
2840
2841 error:
2842         if (iova)
2843                 __free_iova(&domain->iovad, iova);
2844         printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2845                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2846         return 0;
2847 }
2848
2849 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2850                                  unsigned long offset, size_t size,
2851                                  enum dma_data_direction dir,
2852                                  struct dma_attrs *attrs)
2853 {
2854         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2855                                   dir, to_pci_dev(dev)->dma_mask);
2856 }
2857
2858 static void flush_unmaps(void)
2859 {
2860         int i, j;
2861
2862         timer_on = 0;
2863
2864         /* just flush them all */
2865         for (i = 0; i < g_num_of_iommus; i++) {
2866                 struct intel_iommu *iommu = g_iommus[i];
2867                 if (!iommu)
2868                         continue;
2869
2870                 if (!deferred_flush[i].next)
2871                         continue;
2872
2873                 /* In caching mode, global flushes make emulation expensive */
2874                 if (!cap_caching_mode(iommu->cap))
2875                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2876                                          DMA_TLB_GLOBAL_FLUSH);
2877                 for (j = 0; j < deferred_flush[i].next; j++) {
2878                         unsigned long mask;
2879                         struct iova *iova = deferred_flush[i].iova[j];
2880                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2881
2882                         /* On real hardware multiple invalidations are expensive */
2883                         if (cap_caching_mode(iommu->cap))
2884                                 iommu_flush_iotlb_psi(iommu, domain->id,
2885                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2886                         else {
2887                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2888                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2889                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2890                         }
2891                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2892                 }
2893                 deferred_flush[i].next = 0;
2894         }
2895
2896         list_size = 0;
2897 }
2898
2899 static void flush_unmaps_timeout(unsigned long data)
2900 {
2901         unsigned long flags;
2902
2903         spin_lock_irqsave(&async_umap_flush_lock, flags);
2904         flush_unmaps();
2905         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2906 }
2907
2908 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2909 {
2910         unsigned long flags;
2911         int next, iommu_id;
2912         struct intel_iommu *iommu;
2913
2914         spin_lock_irqsave(&async_umap_flush_lock, flags);
2915         if (list_size == HIGH_WATER_MARK)
2916                 flush_unmaps();
2917
2918         iommu = domain_get_iommu(dom);
2919         iommu_id = iommu->seq_id;
2920
2921         next = deferred_flush[iommu_id].next;
2922         deferred_flush[iommu_id].domain[next] = dom;
2923         deferred_flush[iommu_id].iova[next] = iova;
2924         deferred_flush[iommu_id].next++;
2925
2926         if (!timer_on) {
2927                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2928                 timer_on = 1;
2929         }
2930         list_size++;
2931         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2932 }
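/*
 * Note on add_unmap() above: this is the deferred-unmap path used when
 * intel_iommu_strict is not set. Freed iovas are queued per-iommu in
 * deferred_flush[]; the queue is drained by flush_unmaps() either when
 * list_size reaches HIGH_WATER_MARK or when the 10ms unmap_timer armed
 * above fires, so one IOTLB flush covers many unmaps instead of one flush
 * per dma_unmap call.
 */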
2933
2934 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2935                              size_t size, enum dma_data_direction dir,
2936                              struct dma_attrs *attrs)
2937 {
2938         struct pci_dev *pdev = to_pci_dev(dev);
2939         struct dmar_domain *domain;
2940         unsigned long start_pfn, last_pfn;
2941         struct iova *iova;
2942         struct intel_iommu *iommu;
2943
2944         if (iommu_no_mapping(dev))
2945                 return;
2946
2947         domain = find_domain(pdev);
2948         BUG_ON(!domain);
2949
2950         iommu = domain_get_iommu(domain);
2951
2952         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2953         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2954                       (unsigned long long)dev_addr))
2955                 return;
2956
2957         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2958         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2959
2960         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2961                  pci_name(pdev), start_pfn, last_pfn);
2962
2963         /*  clear the whole page */
2964         dma_pte_clear_range(domain, start_pfn, last_pfn);
2965
2966         /* free page tables */
2967         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2968
2969         if (intel_iommu_strict) {
2970                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2971                                       last_pfn - start_pfn + 1, 0);
2972                 /* free iova */
2973                 __free_iova(&domain->iovad, iova);
2974         } else {
2975                 add_unmap(domain, iova);
2976                 /*
2977                  * queue up the release of the unmap to save the ~1/6th of the
2978                  * CPU time used up by the iotlb flush operation...
2979                  */
2980         }
2981 }
2982
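/*
 * .alloc_coherent: allocate zeroed, page-aligned memory and map it
 * DMA_BIDIRECTIONAL through __intel_map_single().  GFP_DMA/GFP_DMA32 are
 * only forced when the device bypasses the IOMMU and its coherent mask
 * cannot reach all of memory; with translation in place the allocation
 * can come from anywhere.
 */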
2983 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2984                                   dma_addr_t *dma_handle, gfp_t flags)
2985 {
2986         void *vaddr;
2987         int order;
2988
2989         size = PAGE_ALIGN(size);
2990         order = get_order(size);
2991
2992         if (!iommu_no_mapping(hwdev))
2993                 flags &= ~(GFP_DMA | GFP_DMA32);
2994         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2995                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2996                         flags |= GFP_DMA;
2997                 else
2998                         flags |= GFP_DMA32;
2999         }
3000
3001         vaddr = (void *)__get_free_pages(flags, order);
3002         if (!vaddr)
3003                 return NULL;
3004         memset(vaddr, 0, size);
3005
3006         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3007                                          DMA_BIDIRECTIONAL,
3008                                          hwdev->coherent_dma_mask);
3009         if (*dma_handle)
3010                 return vaddr;
3011         free_pages((unsigned long)vaddr, order);
3012         return NULL;
3013 }
3014
3015 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3016                                 dma_addr_t dma_handle)
3017 {
3018         int order;
3019
3020         size = PAGE_ALIGN(size);
3021         order = get_order(size);
3022
3023         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3024         free_pages((unsigned long)vaddr, order);
3025 }
3026
3027 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3028                            int nelems, enum dma_data_direction dir,
3029                            struct dma_attrs *attrs)
3030 {
3031         struct pci_dev *pdev = to_pci_dev(hwdev);
3032         struct dmar_domain *domain;
3033         unsigned long start_pfn, last_pfn;
3034         struct iova *iova;
3035         struct intel_iommu *iommu;
3036
3037         if (iommu_no_mapping(hwdev))
3038                 return;
3039
3040         domain = find_domain(pdev);
3041         BUG_ON(!domain);
3042
3043         iommu = domain_get_iommu(domain);
3044
3045         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3046         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3047                       (unsigned long long)sglist[0].dma_address))
3048                 return;
3049
3050         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3051         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3052
3053         /*  clear the whole page */
3054         dma_pte_clear_range(domain, start_pfn, last_pfn);
3055
3056         /* free page tables */
3057         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3058
3059         if (intel_iommu_strict) {
3060                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3061                                       last_pfn - start_pfn + 1, 0);
3062                 /* free iova */
3063                 __free_iova(&domain->iovad, iova);
3064         } else {
3065                 add_unmap(domain, iova);
3066                 /*
3067                  * queue up the release of the unmap to save the ~1/6th of the
3068                  * CPU time used up by the iotlb flush operation...
3069                  */
3070         }
3071 }
3072
3073 static int intel_nontranslate_map_sg(struct device *hddev,
3074         struct scatterlist *sglist, int nelems, int dir)
3075 {
3076         int i;
3077         struct scatterlist *sg;
3078
3079         for_each_sg(sglist, sg, nelems, i) {
3080                 BUG_ON(!sg_page(sg));
3081                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3082                 sg->dma_length = sg->length;
3083         }
3084         return nelems;
3085 }
3086
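/*
 * .map_sg: add up the pages needed by every scatterlist entry, allocate a
 * single IOVA range big enough for the whole list and map it with
 * domain_sg_mapping().  As this is a non-present to present change, only
 * caching-mode hardware needs an IOTLB flush here; otherwise a
 * write-buffer flush is enough.
 */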
3087 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3088                         enum dma_data_direction dir, struct dma_attrs *attrs)
3089 {
3090         int i;
3091         struct pci_dev *pdev = to_pci_dev(hwdev);
3092         struct dmar_domain *domain;
3093         size_t size = 0;
3094         int prot = 0;
3095         struct iova *iova = NULL;
3096         int ret;
3097         struct scatterlist *sg;
3098         unsigned long start_vpfn;
3099         struct intel_iommu *iommu;
3100
3101         BUG_ON(dir == DMA_NONE);
3102         if (iommu_no_mapping(hwdev))
3103                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3104
3105         domain = get_valid_domain_for_dev(pdev);
3106         if (!domain)
3107                 return 0;
3108
3109         iommu = domain_get_iommu(domain);
3110
3111         for_each_sg(sglist, sg, nelems, i)
3112                 size += aligned_nrpages(sg->offset, sg->length);
3113
3114         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3115                                 pdev->dma_mask);
3116         if (!iova) {
3117                 sglist->dma_length = 0;
3118                 return 0;
3119         }
3120
3121         /*
3122          * Check if DMAR supports zero-length reads on write-only
3123          * mappings.
3124          */
3125         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3126                         !cap_zlr(iommu->cap))
3127                 prot |= DMA_PTE_READ;
3128         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3129                 prot |= DMA_PTE_WRITE;
3130
3131         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3132
3133         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3134         if (unlikely(ret)) {
3135                 /*  clear the page */
3136                 dma_pte_clear_range(domain, start_vpfn,
3137                                     start_vpfn + size - 1);
3138                 /* free page tables */
3139                 dma_pte_free_pagetable(domain, start_vpfn,
3140                                        start_vpfn + size - 1);
3141                 /* free iova */
3142                 __free_iova(&domain->iovad, iova);
3143                 return 0;
3144         }
3145
3146         /* it's a non-present to present mapping. Only flush if caching mode */
3147         if (cap_caching_mode(iommu->cap))
3148                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3149         else
3150                 iommu_flush_write_buffer(iommu);
3151
3152         return nelems;
3153 }
3154
3155 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3156 {
3157         return !dma_addr;
3158 }
3159
3160 struct dma_map_ops intel_dma_ops = {
3161         .alloc_coherent = intel_alloc_coherent,
3162         .free_coherent = intel_free_coherent,
3163         .map_sg = intel_map_sg,
3164         .unmap_sg = intel_unmap_sg,
3165         .map_page = intel_map_page,
3166         .unmap_page = intel_unmap_page,
3167         .mapping_error = intel_mapping_error,
3168 };
3169
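/*
 * Slab caches for the objects this driver allocates most often:
 * dmar_domain, device_domain_info and iova.  They are created together
 * from iommu_init_mempool() and destroyed by iommu_exit_mempool().
 */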
3170 static inline int iommu_domain_cache_init(void)
3171 {
3172         int ret = 0;
3173
3174         iommu_domain_cache = kmem_cache_create("iommu_domain",
3175                                          sizeof(struct dmar_domain),
3176                                          0,
3177                                          SLAB_HWCACHE_ALIGN,
3178
3179                                          NULL);
3180         if (!iommu_domain_cache) {
3181                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3182                 ret = -ENOMEM;
3183         }
3184
3185         return ret;
3186 }
3187
3188 static inline int iommu_devinfo_cache_init(void)
3189 {
3190         int ret = 0;
3191
3192         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3193                                          sizeof(struct device_domain_info),
3194                                          0,
3195                                          SLAB_HWCACHE_ALIGN,
3196                                          NULL);
3197         if (!iommu_devinfo_cache) {
3198                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3199                 ret = -ENOMEM;
3200         }
3201
3202         return ret;
3203 }
3204
3205 static inline int iommu_iova_cache_init(void)
3206 {
3207         int ret = 0;
3208
3209         iommu_iova_cache = kmem_cache_create("iommu_iova",
3210                                          sizeof(struct iova),
3211                                          0,
3212                                          SLAB_HWCACHE_ALIGN,
3213                                          NULL);
3214         if (!iommu_iova_cache) {
3215                 printk(KERN_ERR "Couldn't create iova cache\n");
3216                 ret = -ENOMEM;
3217         }
3218
3219         return ret;
3220 }
3221
3222 static int __init iommu_init_mempool(void)
3223 {
3224         int ret;
3225         ret = iommu_iova_cache_init();
3226         if (ret)
3227                 return ret;
3228
3229         ret = iommu_domain_cache_init();
3230         if (ret)
3231                 goto domain_error;
3232
3233         ret = iommu_devinfo_cache_init();
3234         if (!ret)
3235                 return ret;
3236
3237         kmem_cache_destroy(iommu_domain_cache);
3238 domain_error:
3239         kmem_cache_destroy(iommu_iova_cache);
3240
3241         return -ENOMEM;
3242 }
3243
3244 static void __init iommu_exit_mempool(void)
3245 {
3246         kmem_cache_destroy(iommu_devinfo_cache);
3247         kmem_cache_destroy(iommu_domain_cache);
3248         kmem_cache_destroy(iommu_iova_cache);
3249
3250 }
3251
3252 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3253 {
3254         struct dmar_drhd_unit *drhd;
3255         u32 vtbar;
3256         int rc;
3257
3258         /* We know that this device on this chipset has its own IOMMU.
3259          * If we find it under a different IOMMU, then the BIOS is lying
3260          * to us. Hope that the IOMMU for this device is actually
3261          * disabled, and it needs no translation...
3262          */
3263         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3264         if (rc) {
3265                 /* "can't" happen */
3266                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3267                 return;
3268         }
3269         vtbar &= 0xffff0000;
3270
3271         /* we know that this iommu should be at offset 0xa000 from vtbar */
3272         drhd = dmar_find_matched_drhd_unit(pdev);
3273         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3274                             TAINT_FIRMWARE_WORKAROUND,
3275                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3276                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3277 }
3278 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3279
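/*
 * Mark DRHD units that will never be used: units whose device scope
 * matches no present PCI device and, when dmar_map_gfx is clear, units
 * that cover only graphics devices.  Devices under an ignored
 * graphics-only unit get DUMMY_DEVICE_DOMAIN_INFO so they bypass
 * translation entirely.
 */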
3280 static void __init init_no_remapping_devices(void)
3281 {
3282         struct dmar_drhd_unit *drhd;
3283
3284         for_each_drhd_unit(drhd) {
3285                 if (!drhd->include_all) {
3286                         int i;
3287                         for (i = 0; i < drhd->devices_cnt; i++)
3288                                 if (drhd->devices[i] != NULL)
3289                                         break;
3290                         /* ignore DMAR unit if no pci devices exist */
3291                         if (i == drhd->devices_cnt)
3292                                 drhd->ignored = 1;
3293                 }
3294         }
3295
3296         for_each_drhd_unit(drhd) {
3297                 int i;
3298                 if (drhd->ignored || drhd->include_all)
3299                         continue;
3300
3301                 for (i = 0; i < drhd->devices_cnt; i++)
3302                         if (drhd->devices[i] &&
3303                             !IS_GFX_DEVICE(drhd->devices[i]))
3304                                 break;
3305
3306                 if (i < drhd->devices_cnt)
3307                         continue;
3308
3309                 /* This IOMMU has *only* gfx devices. Either bypass it or
3310                    set the gfx_mapped flag, as appropriate */
3311                 if (dmar_map_gfx) {
3312                         intel_iommu_gfx_mapped = 1;
3313                 } else {
3314                         drhd->ignored = 1;
3315                         for (i = 0; i < drhd->devices_cnt; i++) {
3316                                 if (!drhd->devices[i])
3317                                         continue;
3318                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3319                         }
3320                 }
3321         }
3322 }
3323
3324 #ifdef CONFIG_SUSPEND
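/*
 * Suspend/resume support: iommu_suspend() saves the fault-event registers
 * and disables translation on every active IOMMU; iommu_resume() calls
 * init_iommu_hw() to re-enable queued invalidation, reprogram the root
 * entries and turn translation back on, then restores the saved
 * registers.
 */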
3325 static int init_iommu_hw(void)
3326 {
3327         struct dmar_drhd_unit *drhd;
3328         struct intel_iommu *iommu = NULL;
3329
3330         for_each_active_iommu(iommu, drhd)
3331                 if (iommu->qi)
3332                         dmar_reenable_qi(iommu);
3333
3334         for_each_iommu(iommu, drhd) {
3335                 if (drhd->ignored) {
3336                         /*
3337                          * we always have to disable PMRs or DMA may fail on
3338                          * this device
3339                          */
3340                         if (force_on)
3341                                 iommu_disable_protect_mem_regions(iommu);
3342                         continue;
3343                 }
3344
3345                 iommu_flush_write_buffer(iommu);
3346
3347                 iommu_set_root_entry(iommu);
3348
3349                 iommu->flush.flush_context(iommu, 0, 0, 0,
3350                                            DMA_CCMD_GLOBAL_INVL);
3351                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3352                                          DMA_TLB_GLOBAL_FLUSH);
3353                 if (iommu_enable_translation(iommu))
3354                         return 1;
3355                 iommu_disable_protect_mem_regions(iommu);
3356         }
3357
3358         return 0;
3359 }
3360
3361 static void iommu_flush_all(void)
3362 {
3363         struct dmar_drhd_unit *drhd;
3364         struct intel_iommu *iommu;
3365
3366         for_each_active_iommu(iommu, drhd) {
3367                 iommu->flush.flush_context(iommu, 0, 0, 0,
3368                                            DMA_CCMD_GLOBAL_INVL);
3369                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3370                                          DMA_TLB_GLOBAL_FLUSH);
3371         }
3372 }
3373
3374 static int iommu_suspend(void)
3375 {
3376         struct dmar_drhd_unit *drhd;
3377         struct intel_iommu *iommu = NULL;
3378         unsigned long flag;
3379
3380         for_each_active_iommu(iommu, drhd) {
3381                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3382                                                  GFP_ATOMIC);
3383                 if (!iommu->iommu_state)
3384                         goto nomem;
3385         }
3386
3387         iommu_flush_all();
3388
3389         for_each_active_iommu(iommu, drhd) {
3390                 iommu_disable_translation(iommu);
3391
3392                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3393
3394                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3395                         readl(iommu->reg + DMAR_FECTL_REG);
3396                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3397                         readl(iommu->reg + DMAR_FEDATA_REG);
3398                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3399                         readl(iommu->reg + DMAR_FEADDR_REG);
3400                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3401                         readl(iommu->reg + DMAR_FEUADDR_REG);
3402
3403                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3404         }
3405         return 0;
3406
3407 nomem:
3408         for_each_active_iommu(iommu, drhd)
3409                 kfree(iommu->iommu_state);
3410
3411         return -ENOMEM;
3412 }
3413
3414 static void iommu_resume(void)
3415 {
3416         struct dmar_drhd_unit *drhd;
3417         struct intel_iommu *iommu = NULL;
3418         unsigned long flag;
3419
3420         if (init_iommu_hw()) {
3421                 if (force_on)
3422                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3423                 else
3424                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3425                 return;
3426         }
3427
3428         for_each_active_iommu(iommu, drhd) {
3429
3430                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3431
3432                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3433                         iommu->reg + DMAR_FECTL_REG);
3434                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3435                         iommu->reg + DMAR_FEDATA_REG);
3436                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3437                         iommu->reg + DMAR_FEADDR_REG);
3438                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3439                         iommu->reg + DMAR_FEUADDR_REG);
3440
3441                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3442         }
3443
3444         for_each_active_iommu(iommu, drhd)
3445                 kfree(iommu->iommu_state);
3446 }
3447
3448 static struct syscore_ops iommu_syscore_ops = {
3449         .resume         = iommu_resume,
3450         .suspend        = iommu_suspend,
3451 };
3452
3453 static void __init init_iommu_pm_ops(void)
3454 {
3455         register_syscore_ops(&iommu_syscore_ops);
3456 }
3457
3458 #else
3459 static inline void init_iommu_pm_ops(void) {}
3460 #endif  /* CONFIG_SUSPEND */
3461
3462 LIST_HEAD(dmar_rmrr_units);
3463
3464 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3465 {
3466         list_add(&rmrr->list, &dmar_rmrr_units);
3467 }
3468
3469
3470 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3471 {
3472         struct acpi_dmar_reserved_memory *rmrr;
3473         struct dmar_rmrr_unit *rmrru;
3474
3475         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3476         if (!rmrru)
3477                 return -ENOMEM;
3478
3479         rmrru->hdr = header;
3480         rmrr = (struct acpi_dmar_reserved_memory *)header;
3481         rmrru->base_address = rmrr->base_address;
3482         rmrru->end_address = rmrr->end_address;
3483
3484         dmar_register_rmrr_unit(rmrru);
3485         return 0;
3486 }
3487
3488 static int __init
3489 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3490 {
3491         struct acpi_dmar_reserved_memory *rmrr;
3492         int ret;
3493
3494         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3495         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3496                 ((void *)rmrr) + rmrr->header.length,
3497                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3498
3499         if (ret || (rmrru->devices_cnt == 0)) {
3500                 list_del(&rmrru->list);
3501                 kfree(rmrru);
3502         }
3503         return ret;
3504 }
3505
3506 static LIST_HEAD(dmar_atsr_units);
3507
3508 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3509 {
3510         struct acpi_dmar_atsr *atsr;
3511         struct dmar_atsr_unit *atsru;
3512
3513         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3514         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3515         if (!atsru)
3516                 return -ENOMEM;
3517
3518         atsru->hdr = hdr;
3519         atsru->include_all = atsr->flags & 0x1;
3520
3521         list_add(&atsru->list, &dmar_atsr_units);
3522
3523         return 0;
3524 }
3525
3526 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3527 {
3528         int rc;
3529         struct acpi_dmar_atsr *atsr;
3530
3531         if (atsru->include_all)
3532                 return 0;
3533
3534         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3535         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3536                                 (void *)atsr + atsr->header.length,
3537                                 &atsru->devices_cnt, &atsru->devices,
3538                                 atsr->segment);
3539         if (rc || !atsru->devices_cnt) {
3540                 list_del(&atsru->list);
3541                 kfree(atsru);
3542         }
3543
3544         return rc;
3545 }
3546
3547 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3548 {
3549         int i;
3550         struct pci_bus *bus;
3551         struct acpi_dmar_atsr *atsr;
3552         struct dmar_atsr_unit *atsru;
3553
3554         dev = pci_physfn(dev);
3555
3556         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3557                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3558                 if (atsr->segment == pci_domain_nr(dev->bus))
3559                         goto found;
3560         }
3561
3562         return 0;
3563
3564 found:
3565         for (bus = dev->bus; bus; bus = bus->parent) {
3566                 struct pci_dev *bridge = bus->self;
3567
3568                 if (!bridge || !pci_is_pcie(bridge) ||
3569                     bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3570                         return 0;
3571
3572                 if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3573                         for (i = 0; i < atsru->devices_cnt; i++)
3574                                 if (atsru->devices[i] == bridge)
3575                                         return 1;
3576                         break;
3577                 }
3578         }
3579
3580         if (atsru->include_all)
3581                 return 1;
3582
3583         return 0;
3584 }
3585
3586 int __init dmar_parse_rmrr_atsr_dev(void)
3587 {
3588         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3589         struct dmar_atsr_unit *atsr, *atsr_n;
3590         int ret = 0;
3591
3592         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3593                 ret = rmrr_parse_dev(rmrr);
3594                 if (ret)
3595                         return ret;
3596         }
3597
3598         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3599                 ret = atsr_parse_dev(atsr);
3600                 if (ret)
3601                         return ret;
3602         }
3603
3604         return ret;
3605 }
3606
3607 /*
3608  * Here we only respond to the action of a device being unbound from its driver.
3609  *
3610  * A newly added device is not attached to its DMAR domain here yet; that
3611  * happens when the device is first mapped to an iova.
3612  */
3613 static int device_notifier(struct notifier_block *nb,
3614                                   unsigned long action, void *data)
3615 {
3616         struct device *dev = data;
3617         struct pci_dev *pdev = to_pci_dev(dev);
3618         struct dmar_domain *domain;
3619
3620         if (iommu_no_mapping(dev))
3621                 return 0;
3622
3623         domain = find_domain(pdev);
3624         if (!domain)
3625                 return 0;
3626
3627         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3628                 domain_remove_one_dev_info(domain, pdev);
3629
3630                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3631                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3632                     list_empty(&domain->devices))
3633                         domain_exit(domain);
3634         }
3635
3636         return 0;
3637 }
3638
3639 static struct notifier_block device_nb = {
3640         .notifier_call = device_notifier,
3641 };
3642
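/*
 * Driver entry point: parse the DMAR table and device scopes, set up the
 * memory pools and reserved IOVA ranges, program every IOMMU through
 * init_dmars(), then install intel_dma_ops as the DMA API backend and
 * register the IOMMU API ops plus the bus notifier used to clean up
 * domains when a driver is unbound.
 */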
3643 int __init intel_iommu_init(void)
3644 {
3645         int ret = 0;
3646         struct dmar_drhd_unit *drhd;
3647
3648         /* VT-d is required for a TXT/tboot launch, so enforce that */
3649         force_on = tboot_force_iommu();
3650
3651         if (dmar_table_init()) {
3652                 if (force_on)
3653                         panic("tboot: Failed to initialize DMAR table\n");
3654                 return  -ENODEV;
3655         }
3656
3657         /*
3658          * Disable translation if already enabled prior to OS handover.
3659          */
3660         for_each_drhd_unit(drhd) {
3661                 struct intel_iommu *iommu;
3662
3663                 if (drhd->ignored)
3664                         continue;
3665
3666                 iommu = drhd->iommu;
3667                 if (iommu->gcmd & DMA_GCMD_TE)
3668                         iommu_disable_translation(iommu);
3669         }
3670
3671         if (dmar_dev_scope_init() < 0) {
3672                 if (force_on)
3673                         panic("tboot: Failed to initialize DMAR device scope\n");
3674                 return  -ENODEV;
3675         }
3676
3677         if (no_iommu || dmar_disabled)
3678                 return -ENODEV;
3679
3680         if (iommu_init_mempool()) {
3681                 if (force_on)
3682                         panic("tboot: Failed to initialize iommu memory\n");
3683                 return  -ENODEV;
3684         }
3685
3686         if (list_empty(&dmar_rmrr_units))
3687                 printk(KERN_INFO "DMAR: No RMRR found\n");
3688
3689         if (list_empty(&dmar_atsr_units))
3690                 printk(KERN_INFO "DMAR: No ATSR found\n");
3691
3692         if (dmar_init_reserved_ranges()) {
3693                 if (force_on)
3694                         panic("tboot: Failed to reserve iommu ranges\n");
3695                 return  -ENODEV;
3696         }
3697
3698         init_no_remapping_devices();
3699
3700         ret = init_dmars();
3701         if (ret) {
3702                 if (force_on)
3703                         panic("tboot: Failed to initialize DMARs\n");
3704                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3705                 put_iova_domain(&reserved_iova_list);
3706                 iommu_exit_mempool();
3707                 return ret;
3708         }
3709         printk(KERN_INFO
3710         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3711
3712         init_timer(&unmap_timer);
3713 #ifdef CONFIG_SWIOTLB
3714         swiotlb = 0;
3715 #endif
3716         dma_ops = &intel_dma_ops;
3717
3718         init_iommu_pm_ops();
3719
3720         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3721
3722         bus_register_notifier(&pci_bus_type, &device_nb);
3723
3724         intel_iommu_enabled = 1;
3725
3726         return 0;
3727 }
3728
3729 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3730                                            struct pci_dev *pdev)
3731 {
3732         struct pci_dev *tmp, *parent;
3733
3734         if (!iommu || !pdev)
3735                 return;
3736
3737         /* dependent device detach */
3738         tmp = pci_find_upstream_pcie_bridge(pdev);
3739         /* Secondary interface's bus number and devfn 0 */
3740         if (tmp) {
3741                 parent = pdev->bus->self;
3742                 while (parent != tmp) {
3743                         iommu_detach_dev(iommu, parent->bus->number,
3744                                          parent->devfn);
3745                         parent = parent->bus->self;
3746                 }
3747                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3748                         iommu_detach_dev(iommu,
3749                                 tmp->subordinate->number, 0);
3750                 else /* this is a legacy PCI bridge */
3751                         iommu_detach_dev(iommu, tmp->bus->number,
3752                                          tmp->devfn);
3753         }
3754 }
3755
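/*
 * Detach one PCI device from @domain: drop its device_domain_info, tear
 * down the context entries for the device and any intermediate bridges,
 * and, if no other device on the same IOMMU still belongs to the domain,
 * clear that IOMMU from the domain's bitmap (and release the domain id
 * for non-VM, non-identity domains).
 */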
3756 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3757                                           struct pci_dev *pdev)
3758 {
3759         struct device_domain_info *info;
3760         struct intel_iommu *iommu;
3761         unsigned long flags;
3762         int found = 0;
3763         struct list_head *entry, *tmp;
3764
3765         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3766                                 pdev->devfn);
3767         if (!iommu)
3768                 return;
3769
3770         spin_lock_irqsave(&device_domain_lock, flags);
3771         list_for_each_safe(entry, tmp, &domain->devices) {
3772                 info = list_entry(entry, struct device_domain_info, link);
3773                 if (info->segment == pci_domain_nr(pdev->bus) &&
3774                     info->bus == pdev->bus->number &&
3775                     info->devfn == pdev->devfn) {
3776                         list_del(&info->link);
3777                         list_del(&info->global);
3778                         if (info->dev)
3779                                 info->dev->dev.archdata.iommu = NULL;
3780                         spin_unlock_irqrestore(&device_domain_lock, flags);
3781
3782                         iommu_disable_dev_iotlb(info);
3783                         iommu_detach_dev(iommu, info->bus, info->devfn);
3784                         iommu_detach_dependent_devices(iommu, pdev);
3785                         free_devinfo_mem(info);
3786
3787                         spin_lock_irqsave(&device_domain_lock, flags);
3788
3789                         if (found)
3790                                 break;
3791                         else
3792                                 continue;
3793                 }
3794
3795                 /* if there are no other devices under the same iommu
3796                  * owned by this domain, clear this iommu in iommu_bmp and
3797                  * update the iommu count and coherency
3798                  */
3799                 if (iommu == device_to_iommu(info->segment, info->bus,
3800                                             info->devfn))
3801                         found = 1;
3802         }
3803
3804         spin_unlock_irqrestore(&device_domain_lock, flags);
3805
3806         if (found == 0) {
3807                 unsigned long tmp_flags;
3808                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3809                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3810                 domain->iommu_count--;
3811                 domain_update_iommu_cap(domain);
3812                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3813
3814                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3815                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3816                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3817                         clear_bit(domain->id, iommu->domain_ids);
3818                         iommu->domains[domain->id] = NULL;
3819                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3820                 }
3821         }
3822 }
3823
3824 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3825 {
3826         struct device_domain_info *info;
3827         struct intel_iommu *iommu;
3828         unsigned long flags1, flags2;
3829
3830         spin_lock_irqsave(&device_domain_lock, flags1);
3831         while (!list_empty(&domain->devices)) {
3832                 info = list_entry(domain->devices.next,
3833                         struct device_domain_info, link);
3834                 list_del(&info->link);
3835                 list_del(&info->global);
3836                 if (info->dev)
3837                         info->dev->dev.archdata.iommu = NULL;
3838
3839                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3840
3841                 iommu_disable_dev_iotlb(info);
3842                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3843                 iommu_detach_dev(iommu, info->bus, info->devfn);
3844                 iommu_detach_dependent_devices(iommu, info->dev);
3845
3846                 /* clear this iommu in iommu_bmp, update iommu count
3847                  * and capabilities
3848                  */
3849                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3850                 if (test_and_clear_bit(iommu->seq_id,
3851                                        &domain->iommu_bmp)) {
3852                         domain->iommu_count--;
3853                         domain_update_iommu_cap(domain);
3854                 }
3855                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3856
3857                 free_devinfo_mem(info);
3858                 spin_lock_irqsave(&device_domain_lock, flags1);
3859         }
3860         spin_unlock_irqrestore(&device_domain_lock, flags1);
3861 }
3862
3863 /* domain id for a virtual machine; it won't be set in a context entry */
3864 static unsigned long vm_domid;
3865
3866 static struct dmar_domain *iommu_alloc_vm_domain(void)
3867 {
3868         struct dmar_domain *domain;
3869
3870         domain = alloc_domain_mem();
3871         if (!domain)
3872                 return NULL;
3873
3874         domain->id = vm_domid++;
3875         domain->nid = -1;
3876         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3877         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3878
3879         return domain;
3880 }
3881
3882 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3883 {
3884         int adjust_width;
3885
3886         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3887         spin_lock_init(&domain->iommu_lock);
3888
3889         domain_reserve_special_ranges(domain);
3890
3891         /* calculate AGAW */
3892         domain->gaw = guest_width;
3893         adjust_width = guestwidth_to_adjustwidth(guest_width);
3894         domain->agaw = width_to_agaw(adjust_width);
3895
3896         INIT_LIST_HEAD(&domain->devices);
3897
3898         domain->iommu_count = 0;
3899         domain->iommu_coherency = 0;
3900         domain->iommu_snooping = 0;
3901         domain->iommu_superpage = 0;
3902         domain->max_addr = 0;
3903         domain->nid = -1;
3904
3905         /* always allocate the top pgd */
3906         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3907         if (!domain->pgd)
3908                 return -ENOMEM;
3909         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3910         return 0;
3911 }
3912
3913 static void iommu_free_vm_domain(struct dmar_domain *domain)
3914 {
3915         unsigned long flags;
3916         struct dmar_drhd_unit *drhd;
3917         struct intel_iommu *iommu;
3918         unsigned long i;
3919         unsigned long ndomains;
3920
3921         for_each_drhd_unit(drhd) {
3922                 if (drhd->ignored)
3923                         continue;
3924                 iommu = drhd->iommu;
3925
3926                 ndomains = cap_ndoms(iommu->cap);
3927                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3928                         if (iommu->domains[i] == domain) {
3929                                 spin_lock_irqsave(&iommu->lock, flags);
3930                                 clear_bit(i, iommu->domain_ids);
3931                                 iommu->domains[i] = NULL;
3932                                 spin_unlock_irqrestore(&iommu->lock, flags);
3933                                 break;
3934                         }
3935                 }
3936         }
3937 }
3938
3939 static void vm_domain_exit(struct dmar_domain *domain)
3940 {
3941         /* Domain 0 is reserved, so don't process it */
3942         if (!domain)
3943                 return;
3944
3945         vm_domain_remove_all_dev_info(domain);
3946         /* destroy iovas */
3947         put_iova_domain(&domain->iovad);
3948
3949         /* clear ptes */
3950         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3951
3952         /* free page tables */
3953         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3954
3955         iommu_free_vm_domain(domain);
3956         free_domain_mem(domain);
3957 }
3958
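/*
 * IOMMU API glue (see intel_iommu_ops below): domains created through
 * this interface are "virtual machine" domains flagged
 * DOMAIN_FLAG_VIRTUAL_MACHINE and initialized with a
 * DEFAULT_DOMAIN_ADDRESS_WIDTH (48-bit) guest address width.
 */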
3959 static int intel_iommu_domain_init(struct iommu_domain *domain)
3960 {
3961         struct dmar_domain *dmar_domain;
3962
3963         dmar_domain = iommu_alloc_vm_domain();
3964         if (!dmar_domain) {
3965                 printk(KERN_ERR
3966                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3967                 return -ENOMEM;
3968         }
3969         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3970                 printk(KERN_ERR
3971                         "intel_iommu_domain_init() failed\n");
3972                 vm_domain_exit(dmar_domain);
3973                 return -ENOMEM;
3974         }
3975         domain_update_iommu_cap(dmar_domain);
3976         domain->priv = dmar_domain;
3977
3978         return 0;
3979 }
3980
3981 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3982 {
3983         struct dmar_domain *dmar_domain = domain->priv;
3984
3985         domain->priv = NULL;
3986         vm_domain_exit(dmar_domain);
3987 }
3988
3989 static int intel_iommu_attach_device(struct iommu_domain *domain,
3990                                      struct device *dev)
3991 {
3992         struct dmar_domain *dmar_domain = domain->priv;
3993         struct pci_dev *pdev = to_pci_dev(dev);
3994         struct intel_iommu *iommu;
3995         int addr_width;
3996
3997         if (device_is_rmrr_locked(pdev)) {
3998                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
3999                 return -EPERM;
4000         }
4001
4002         /* normally pdev is not mapped */
4003         if (unlikely(domain_context_mapped(pdev))) {
4004                 struct dmar_domain *old_domain;
4005
4006                 old_domain = find_domain(pdev);
4007                 if (old_domain) {
4008                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4009                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4010                                 domain_remove_one_dev_info(old_domain, pdev);
4011                         else
4012                                 domain_remove_dev_info(old_domain);
4013                 }
4014         }
4015
4016         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4017                                 pdev->devfn);
4018         if (!iommu)
4019                 return -ENODEV;
4020
4021         /* check if this iommu agaw is sufficient for max mapped address */
4022         addr_width = agaw_to_width(iommu->agaw);
4023         if (addr_width > cap_mgaw(iommu->cap))
4024                 addr_width = cap_mgaw(iommu->cap);
4025
4026         if (dmar_domain->max_addr > (1LL << addr_width)) {
4027                 printk(KERN_ERR "%s: iommu width (%d) is not "
4028                        "sufficient for the mapped address (%llx)\n",
4029                        __func__, addr_width, dmar_domain->max_addr);
4030                 return -EFAULT;
4031         }
4032         dmar_domain->gaw = addr_width;
4033
4034         /*
4035          * Knock out extra levels of page tables if necessary
4036          */
4037         while (iommu->agaw < dmar_domain->agaw) {
4038                 struct dma_pte *pte;
4039
4040                 pte = dmar_domain->pgd;
4041                 if (dma_pte_present(pte)) {
4042                         dmar_domain->pgd = (struct dma_pte *)
4043                                 phys_to_virt(dma_pte_addr(pte));
4044                         free_pgtable_page(pte);
4045                 }
4046                 dmar_domain->agaw--;
4047         }
4048
4049         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4050 }
4051
4052 static void intel_iommu_detach_device(struct iommu_domain *domain,
4053                                       struct device *dev)
4054 {
4055         struct dmar_domain *dmar_domain = domain->priv;
4056         struct pci_dev *pdev = to_pci_dev(dev);
4057
4058         domain_remove_one_dev_info(dmar_domain, pdev);
4059 }
4060
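/*
 * IOMMU API map: translate IOMMU_READ/WRITE/CACHE into PTE protection
 * bits, grow the domain's max_addr (verifying it still fits within the
 * domain's gaw), and install PTEs for 2^gfp_order pages with
 * domain_pfn_mapping().
 */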
4061 static int intel_iommu_map(struct iommu_domain *domain,
4062                            unsigned long iova, phys_addr_t hpa,
4063                            int gfp_order, int iommu_prot)
4064 {
4065         struct dmar_domain *dmar_domain = domain->priv;
4066         u64 max_addr;
4067         int prot = 0;
4068         size_t size;
4069         int ret;
4070
4071         if (iommu_prot & IOMMU_READ)
4072                 prot |= DMA_PTE_READ;
4073         if (iommu_prot & IOMMU_WRITE)
4074                 prot |= DMA_PTE_WRITE;
4075         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4076                 prot |= DMA_PTE_SNP;
4077
4078         size     = PAGE_SIZE << gfp_order;
4079         max_addr = iova + size;
4080         if (dmar_domain->max_addr < max_addr) {
4081                 u64 end;
4082
4083                 /* check if minimum agaw is sufficient for mapped address */
4084                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4085                 if (end < max_addr) {
4086                         printk(KERN_ERR "%s: iommu width (%d) is not "
4087                                "sufficient for the mapped address (%llx)\n",
4088                                __func__, dmar_domain->gaw, max_addr);
4089                         return -EFAULT;
4090                 }
4091                 dmar_domain->max_addr = max_addr;
4092         }
4093         /* Round up size to next multiple of PAGE_SIZE, if it and
4094            the low bits of hpa would take us onto the next page */
4095         size = aligned_nrpages(hpa, size);
4096         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4097                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4098         return ret;
4099 }
4100
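/*
 * IOMMU API unmap: clear the PTEs for the requested range and issue a
 * page-selective IOTLB flush on every IOMMU that holds this domain.  The
 * order actually cleared, as returned by dma_pte_clear_range(), sizes the
 * flush and is passed back to the caller.
 */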
4101 static int intel_iommu_unmap(struct iommu_domain *domain,
4102                              unsigned long iova, int gfp_order)
4103 {
4104         struct dmar_domain *dmar_domain = domain->priv;
4105         size_t size = PAGE_SIZE << gfp_order;
4106         int order, iommu_id;
4107
4108         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4109                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4110
4111         if (dmar_domain->max_addr == iova + size)
4112                 dmar_domain->max_addr = iova;
4113
4114         for_each_set_bit(iommu_id, &dmar_domain->iommu_bmp, g_num_of_iommus) {
4115                 struct intel_iommu *iommu = g_iommus[iommu_id];
4116                 int num, ndomains;
4117
4118                 /*
4119                  * find bit position of dmar_domain
4120                  */
4121                 ndomains = cap_ndoms(iommu->cap);
4122                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4123                         if (iommu->domains[num] == dmar_domain)
4124                                 iommu_flush_iotlb_psi(iommu, num,
4125                                                       iova >> VTD_PAGE_SHIFT,
4126                                                       1 << order, 0);
4127                 }
4128         }
4129
4130         return order;
4131 }
4132
4133 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4134                                             unsigned long iova)
4135 {
4136         struct dmar_domain *dmar_domain = domain->priv;
4137         struct dma_pte *pte;
4138         u64 phys = 0;
4139
4140         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4141         if (pte)
4142                 phys = dma_pte_addr(pte);
4143
4144         return phys;
4145 }
4146
4147 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4148                                       unsigned long cap)
4149 {
4150         struct dmar_domain *dmar_domain = domain->priv;
4151
4152         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4153                 return dmar_domain->iommu_snooping;
4154         if (cap == IOMMU_CAP_INTR_REMAP)
4155                 return intr_remapping_enabled;
4156
4157         return 0;
4158 }
4159
4160 static struct iommu_ops intel_iommu_ops = {
4161         .domain_init    = intel_iommu_domain_init,
4162         .domain_destroy = intel_iommu_domain_destroy,
4163         .attach_dev     = intel_iommu_attach_device,
4164         .detach_dev     = intel_iommu_detach_device,
4165         .map            = intel_iommu_map,
4166         .unmap          = intel_iommu_unmap,
4167         .iova_to_phys   = intel_iommu_iova_to_phys,
4168         .domain_has_cap = intel_iommu_domain_has_cap,
4169 };
4170
4171 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4172 {
4173         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4174         printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4175         dmar_map_gfx = 0;
4176 }
4177
4178 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4179 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4180 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4181 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4182 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4183 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4184 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4185
4186 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4187 {
4188         /*
4189          * Mobile 4 Series Chipset neglects to set RWBF capability,
4190          * but needs it. Same seems to hold for the desktop versions.
4191          */
4192         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4193         rwbf_quirk = 1;
4194 }
4195
4196 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4197 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4198 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4199 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4200 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4201 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4202 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4203
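/*
 * GGC is the graphics control register read by
 * quirk_calpella_no_shadow_gtt() below: without GGC_MEMORY_VT_ENABLED the
 * BIOS allocated no shadow GTT, and graphics translation has to be
 * disabled on these chipsets.
 */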
4204 #define GGC 0x52
4205 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4206 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4207 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4208 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4209 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4210 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4211 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4212 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4213
4214 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4215 {
4216         unsigned short ggc;
4217
4218         if (pci_read_config_word(dev, GGC, &ggc))
4219                 return;
4220
4221         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4222                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4223                 dmar_map_gfx = 0;
4224         } else if (dmar_map_gfx) {
4225                 /* we have to ensure the gfx device is idle before we flush */
4226                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4227                 intel_iommu_strict = 1;
4228         }
4229 }
4230 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4234
4235 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4236    ISOCH DMAR unit for the Azalia sound device, but not give it any
4237    TLB entries, which causes it to deadlock. Check for that.  We do
4238    this in a function called from init_dmars(), instead of in a PCI
4239    quirk, because we don't want to print the obnoxious "BIOS broken"
4240    message if VT-d is actually disabled.
4241 */
4242 static void __init check_tylersburg_isoch(void)
4243 {
4244         struct pci_dev *pdev;
4245         uint32_t vtisochctrl;
4246
4247         /* If there's no Azalia in the system anyway, forget it. */
4248         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4249         if (!pdev)
4250                 return;
4251         pci_dev_put(pdev);
4252
4253         /* System Management Registers. Might be hidden, in which case
4254            we can't do the sanity check. But that's OK, because the
4255            known-broken BIOSes _don't_ actually hide it, so far. */
4256         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4257         if (!pdev)
4258                 return;
4259
4260         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4261                 pci_dev_put(pdev);
4262                 return;
4263         }
4264
4265         pci_dev_put(pdev);
4266
4267         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4268         if (vtisochctrl & 1)
4269                 return;
4270
4271         /* Drop all bits other than the number of TLB entries */
4272         vtisochctrl &= 0x1c;
4273
4274         /* If we have the recommended number of TLB entries (16), fine. */
4275         if (vtisochctrl == 0x10)
4276                 return;
4277
4278         /* Zero TLB entries? You get to ride the short bus to school. */
4279         if (!vtisochctrl) {
4280                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4281                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4282                      dmi_get_system_info(DMI_BIOS_VENDOR),
4283                      dmi_get_system_info(DMI_BIOS_VERSION),
4284                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4285                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4286                 return;
4287         }
4288
4289         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4290                vtisochctrl);
4291 }