drivers/iommu/intel-iommu.c (pandora-kernel.git)
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
46
47 #define ROOT_SIZE               VTD_PAGE_SIZE
48 #define CONTEXT_SIZE            VTD_PAGE_SIZE
49
50 #define IS_BRIDGE_HOST_DEVICE(pdev) \
51                             ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63
64 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
65 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66
67 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
68    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
69 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
70                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
71 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
72
73 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
74 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
75 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
76
77 /* page table handling */
78 #define LEVEL_STRIDE            (9)
79 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
80
81 static inline int agaw_to_level(int agaw)
82 {
83         return agaw + 2;
84 }
85
86 static inline int agaw_to_width(int agaw)
87 {
88         return 30 + agaw * LEVEL_STRIDE;
89 }
90
91 static inline int width_to_agaw(int width)
92 {
93         return (width - 30) / LEVEL_STRIDE;
94 }
95
96 static inline unsigned int level_to_offset_bits(int level)
97 {
98         return (level - 1) * LEVEL_STRIDE;
99 }
100
101 static inline int pfn_level_offset(unsigned long pfn, int level)
102 {
103         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
104 }
105
106 static inline unsigned long level_mask(int level)
107 {
108         return -1UL << level_to_offset_bits(level);
109 }
110
111 static inline unsigned long level_size(int level)
112 {
113         return 1UL << level_to_offset_bits(level);
114 }
115
116 static inline unsigned long align_to_level(unsigned long pfn, int level)
117 {
118         return (pfn + level_size(level) - 1) & level_mask(level);
119 }
120
121 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
122 {
123         return  1 << ((lvl - 1) * LEVEL_STRIDE);
124 }
125
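/*
 * Rough worked example of the helpers above: each page-table level
 * resolves LEVEL_STRIDE (9) bits of the DMA pfn.  An agaw of 0 is a
 * 2-level, 30-bit table; the default 48-bit width gives
 * width_to_agaw(48) == 2 and agaw_to_level(2) == 4 levels.
 * lvl_to_nr_pages(2) covers 512 4KiB pages (2MiB) and
 * lvl_to_nr_pages(3) covers 1GiB worth of 4KiB pages.
 */
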
126 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
127    are never going to work. */
128 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
129 {
130         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
131 }
132
133 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
134 {
135         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
136 }
137 static inline unsigned long page_to_dma_pfn(struct page *pg)
138 {
139         return mm_to_dma_pfn(page_to_pfn(pg));
140 }
141 static inline unsigned long virt_to_dma_pfn(void *p)
142 {
143         return page_to_dma_pfn(virt_to_page(p));
144 }
145
146 /* global iommu list, set NULL for ignored DMAR units */
147 static struct intel_iommu **g_iommus;
148
149 static void __init check_tylersburg_isoch(void);
150 static int rwbf_quirk;
151
152 /*
153  * Set to 1 to panic the kernel if VT-d can't be enabled successfully
154  * (used when the kernel is launched with TXT).
155  */
156 static int force_on = 0;
157
158 /*
159  * 0: Present
160  * 1-11: Reserved
161  * 12-63: Context Ptr (12 - (haw-1))
162  * 64-127: Reserved
163  */
164 struct root_entry {
165         u64     val;
166         u64     rsvd1;
167 };
168 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
169 static inline bool root_present(struct root_entry *root)
170 {
171         return (root->val & 1);
172 }
173 static inline void set_root_present(struct root_entry *root)
174 {
175         root->val |= 1;
176 }
177 static inline void set_root_value(struct root_entry *root, unsigned long value)
178 {
179         root->val |= value & VTD_PAGE_MASK;
180 }
181
182 static inline struct context_entry *
183 get_context_addr_from_root(struct root_entry *root)
184 {
185         return (struct context_entry *)
186                 (root_present(root)?phys_to_virt(
187                 root->val & VTD_PAGE_MASK) :
188                 NULL);
189 }
190
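/*
 * Lookup is two-level: the per-iommu root table holds one root_entry
 * per bus (ROOT_ENTRY_NR == 4096/16 == 256), and each present root
 * entry points to a context table holding one context_entry per devfn
 * on that bus (256 entries, again exactly one 4KiB page).
 */
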
191 /*
192  * low 64 bits:
193  * 0: present
194  * 1: fault processing disable
195  * 2-3: translation type
196  * 12-63: address space root
197  * high 64 bits:
198  * 0-2: address width
199  * 3-6: avail
200  * 8-23: domain id
201  */
202 struct context_entry {
203         u64 lo;
204         u64 hi;
205 };
206
207 static inline bool context_present(struct context_entry *context)
208 {
209         return (context->lo & 1);
210 }
211 static inline void context_set_present(struct context_entry *context)
212 {
213         context->lo |= 1;
214 }
215
216 static inline void context_set_fault_enable(struct context_entry *context)
217 {
218         context->lo &= (((u64)-1) << 2) | 1;
219 }
220
221 static inline void context_set_translation_type(struct context_entry *context,
222                                                 unsigned long value)
223 {
224         context->lo &= (((u64)-1) << 4) | 3;
225         context->lo |= (value & 3) << 2;
226 }
227
228 static inline void context_set_address_root(struct context_entry *context,
229                                             unsigned long value)
230 {
231         context->lo |= value & VTD_PAGE_MASK;
232 }
233
234 static inline void context_set_address_width(struct context_entry *context,
235                                              unsigned long value)
236 {
237         context->hi |= value & 7;
238 }
239
240 static inline void context_set_domain_id(struct context_entry *context,
241                                          unsigned long value)
242 {
243         context->hi |= (value & ((1 << 16) - 1)) << 8;
244 }
245
246 static inline void context_clear_entry(struct context_entry *context)
247 {
248         context->lo = 0;
249         context->hi = 0;
250 }
251
252 /*
253  * 0: readable
254  * 1: writable
255  * 2-6: reserved
256  * 7: super page
257  * 8-10: available
258  * 11: snoop behavior
259  * 12-63: Host physical address
260  */
261 struct dma_pte {
262         u64 val;
263 };
264
265 static inline void dma_clear_pte(struct dma_pte *pte)
266 {
267         pte->val = 0;
268 }
269
270 static inline void dma_set_pte_readable(struct dma_pte *pte)
271 {
272         pte->val |= DMA_PTE_READ;
273 }
274
275 static inline void dma_set_pte_writable(struct dma_pte *pte)
276 {
277         pte->val |= DMA_PTE_WRITE;
278 }
279
280 static inline void dma_set_pte_snp(struct dma_pte *pte)
281 {
282         pte->val |= DMA_PTE_SNP;
283 }
284
285 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
286 {
287         pte->val = (pte->val & ~3) | (prot & 3);
288 }
289
290 static inline u64 dma_pte_addr(struct dma_pte *pte)
291 {
292 #ifdef CONFIG_64BIT
293         return pte->val & VTD_PAGE_MASK;
294 #else
295         /* Must have a full atomic 64-bit read */
296         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
297 #endif
298 }
299
300 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
301 {
302         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
303 }
304
305 static inline bool dma_pte_present(struct dma_pte *pte)
306 {
307         return (pte->val & 3) != 0;
308 }
309
310 static inline bool dma_pte_superpage(struct dma_pte *pte)
311 {
312         return (pte->val & (1 << 7));
313 }
314
315 static inline int first_pte_in_page(struct dma_pte *pte)
316 {
317         return !((unsigned long)pte & ~VTD_PAGE_MASK);
318 }
319
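/*
 * Each page-table page is a single 4KiB VT-d page holding 512
 * dma_ptes (LEVEL_STRIDE == 9 bits per level).  Bit 7 of a pte marks
 * a superpage, and the read/write bits 0-1 double as the present
 * check in dma_pte_present().
 */
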
320 /*
321  * This domain is a static identity mapping domain.
322  *      1. This domain creates a static 1:1 mapping of all usable memory.
323  *      2. It maps to each iommu if successful.
324  *      3. Each iommu maps to this domain if successful.
325  */
326 static struct dmar_domain *si_domain;
327 static int hw_pass_through = 1;
328
329 /* devices under the same p2p bridge are owned in one domain */
330 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
331
332 /* domain represents a virtual machine; more than one device
333  * across iommus may be owned by one domain, e.g. a kvm guest.
334  */
335 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
336
337 /* si_domain contains multiple devices */
338 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
339
340 struct dmar_domain {
341         int     id;                     /* domain id */
342         int     nid;                    /* node id */
343         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
344
345         struct list_head devices;       /* all devices' list */
346         struct iova_domain iovad;       /* iova's that belong to this domain */
347
348         struct dma_pte  *pgd;           /* virtual address */
349         int             gaw;            /* max guest address width */
350
351         /* adjusted guest address width, 0 is level 2 30-bit */
352         int             agaw;
353
354         int             flags;          /* flags to find out type of domain */
355
356         int             iommu_coherency;/* indicate coherency of iommu access */
357         int             iommu_snooping; /* indicate snooping control feature*/
358         int             iommu_count;    /* reference count of iommu */
359         int             iommu_superpage;/* Level of superpages supported:
360                                            0 == 4KiB (no superpages), 1 == 2MiB,
361                                            2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
362         spinlock_t      iommu_lock;     /* protect iommu set in domain */
363         u64             max_addr;       /* maximum mapped address */
364 };
365
366 /* PCI domain-device relationship */
367 struct device_domain_info {
368         struct list_head link;  /* link to domain siblings */
369         struct list_head global; /* link to global list */
370         int segment;            /* PCI domain */
371         u8 bus;                 /* PCI bus number */
372         u8 devfn;               /* PCI devfn number */
373         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
374         struct intel_iommu *iommu; /* IOMMU used by this device */
375         struct dmar_domain *domain; /* pointer to domain */
376 };
377
378 static void flush_unmaps_timeout(unsigned long data);
379
380 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
381
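/*
 * Deferred ("lazy") unmap state: unless intel_iommu_strict is set,
 * unmapped IOVAs are queued in deferred_flush and released later from
 * flush_unmaps_timeout() via unmap_timer, presumably to batch IOTLB
 * invalidations; HIGH_WATER_MARK bounds how many entries each table
 * can hold.
 */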
382 #define HIGH_WATER_MARK 250
383 struct deferred_flush_tables {
384         int next;
385         struct iova *iova[HIGH_WATER_MARK];
386         struct dmar_domain *domain[HIGH_WATER_MARK];
387 };
388
389 static struct deferred_flush_tables *deferred_flush;
390
391 /* number of registered iommus; sizes g_iommus and the per-domain iommu bitmaps */
392 static int g_num_of_iommus;
393
394 static DEFINE_SPINLOCK(async_umap_flush_lock);
395 static LIST_HEAD(unmaps_to_do);
396
397 static int timer_on;
398 static long list_size;
399
400 static void domain_remove_dev_info(struct dmar_domain *domain);
401
402 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
403 int dmar_disabled = 0;
404 #else
405 int dmar_disabled = 1;
406 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
407
408 int intel_iommu_enabled = 0;
409 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
410
411 static int dmar_map_gfx = 1;
412 static int dmar_forcedac;
413 static int intel_iommu_strict;
414 static int intel_iommu_superpage = 1;
415
416 int intel_iommu_gfx_mapped;
417 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
418
419 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
420 static DEFINE_SPINLOCK(device_domain_lock);
421 static LIST_HEAD(device_domain_list);
422
423 static struct iommu_ops intel_iommu_ops;
424
425 static int __init intel_iommu_setup(char *str)
426 {
427         if (!str)
428                 return -EINVAL;
429         while (*str) {
430                 if (!strncmp(str, "on", 2)) {
431                         dmar_disabled = 0;
432                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
433                 } else if (!strncmp(str, "off", 3)) {
434                         dmar_disabled = 1;
435                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
436                 } else if (!strncmp(str, "igfx_off", 8)) {
437                         dmar_map_gfx = 0;
438                         printk(KERN_INFO
439                                 "Intel-IOMMU: disable GFX device mapping\n");
440                 } else if (!strncmp(str, "forcedac", 8)) {
441                         printk(KERN_INFO
442                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
443                         dmar_forcedac = 1;
444                 } else if (!strncmp(str, "strict", 6)) {
445                         printk(KERN_INFO
446                                 "Intel-IOMMU: disable batched IOTLB flush\n");
447                         intel_iommu_strict = 1;
448                 } else if (!strncmp(str, "sp_off", 6)) {
449                         printk(KERN_INFO
450                                 "Intel-IOMMU: disable supported super page\n");
451                         intel_iommu_superpage = 0;
452                 }
453
454                 str += strcspn(str, ",");
455                 while (*str == ',')
456                         str++;
457         }
458         return 0;
459 }
460 __setup("intel_iommu=", intel_iommu_setup);
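/*
 * The options above are comma-separated on the kernel command line,
 * e.g. "intel_iommu=on,strict" or "intel_iommu=igfx_off,sp_off".
 */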
461
462 static struct kmem_cache *iommu_domain_cache;
463 static struct kmem_cache *iommu_devinfo_cache;
464 static struct kmem_cache *iommu_iova_cache;
465
466 static inline void *alloc_pgtable_page(int node)
467 {
468         struct page *page;
469         void *vaddr = NULL;
470
471         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
472         if (page)
473                 vaddr = page_address(page);
474         return vaddr;
475 }
476
477 static inline void free_pgtable_page(void *vaddr)
478 {
479         free_page((unsigned long)vaddr);
480 }
481
482 static inline void *alloc_domain_mem(void)
483 {
484         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
485 }
486
487 static void free_domain_mem(void *vaddr)
488 {
489         kmem_cache_free(iommu_domain_cache, vaddr);
490 }
491
492 static inline void * alloc_devinfo_mem(void)
493 {
494         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
495 }
496
497 static inline void free_devinfo_mem(void *vaddr)
498 {
499         kmem_cache_free(iommu_devinfo_cache, vaddr);
500 }
501
502 struct iova *alloc_iova_mem(void)
503 {
504         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
505 }
506
507 void free_iova_mem(struct iova *iova)
508 {
509         kmem_cache_free(iommu_iova_cache, iova);
510 }
511
512
513 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
514 {
515         unsigned long sagaw;
516         int agaw = -1;
517
518         sagaw = cap_sagaw(iommu->cap);
519         for (agaw = width_to_agaw(max_gaw);
520              agaw >= 0; agaw--) {
521                 if (test_bit(agaw, &sagaw))
522                         break;
523         }
524
525         return agaw;
526 }
527
528 /*
529  * Calculate max SAGAW for each iommu.
530  */
531 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
532 {
533         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
534 }
535
536 /*
537  * calculate agaw for each iommu.
538  * "SAGAW" may be different across iommus, use a default agaw, and
539  * get a supported less agaw for iommus that don't support the default agaw.
540  */
541 int iommu_calculate_agaw(struct intel_iommu *iommu)
542 {
543         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
544 }
545
546 /* This function only returns a single iommu for a domain */
547 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
548 {
549         int iommu_id;
550
551         /* si_domain and vm domain should not get here. */
552         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
553         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
554
555         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
556         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
557                 return NULL;
558
559         return g_iommus[iommu_id];
560 }
561
562 static void domain_update_iommu_coherency(struct dmar_domain *domain)
563 {
564         int i;
565
566         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
567
568         domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
569
570         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
571                 if (!ecap_coherent(g_iommus[i]->ecap)) {
572                         domain->iommu_coherency = 0;
573                         break;
574                 }
575         }
576 }
577
578 static void domain_update_iommu_snooping(struct dmar_domain *domain)
579 {
580         int i;
581
582         domain->iommu_snooping = 1;
583
584         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
585                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
586                         domain->iommu_snooping = 0;
587                         break;
588                 }
589         }
590 }
591
592 static void domain_update_iommu_superpage(struct dmar_domain *domain)
593 {
594         struct dmar_drhd_unit *drhd;
595         struct intel_iommu *iommu = NULL;
596         int mask = 0xf;
597
598         if (!intel_iommu_superpage) {
599                 domain->iommu_superpage = 0;
600                 return;
601         }
602
603         /* set iommu_superpage to the smallest common denominator */
604         for_each_active_iommu(iommu, drhd) {
605                 mask &= cap_super_page_val(iommu->cap);
606                 if (!mask) {
607                         break;
608                 }
609         }
610         domain->iommu_superpage = fls(mask);
611 }
612
613 /* Some capabilities may be different across iommus */
614 static void domain_update_iommu_cap(struct dmar_domain *domain)
615 {
616         domain_update_iommu_coherency(domain);
617         domain_update_iommu_snooping(domain);
618         domain_update_iommu_superpage(domain);
619 }
620
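/*
 * Map a PCI (segment, bus, devfn) to the DRHD unit covering it: the
 * device is either listed explicitly in the unit's scope, sits behind
 * a listed bridge whose secondary..subordinate bus range includes
 * 'bus', or falls under an INCLUDE_ALL unit for the segment.
 */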
621 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
622 {
623         struct dmar_drhd_unit *drhd = NULL;
624         int i;
625
626         for_each_drhd_unit(drhd) {
627                 if (drhd->ignored)
628                         continue;
629                 if (segment != drhd->segment)
630                         continue;
631
632                 for (i = 0; i < drhd->devices_cnt; i++) {
633                         if (drhd->devices[i] &&
634                             drhd->devices[i]->bus->number == bus &&
635                             drhd->devices[i]->devfn == devfn)
636                                 return drhd->iommu;
637                         if (drhd->devices[i] &&
638                             drhd->devices[i]->subordinate &&
639                             drhd->devices[i]->subordinate->number <= bus &&
640                             drhd->devices[i]->subordinate->subordinate >= bus)
641                                 return drhd->iommu;
642                 }
643
644                 if (drhd->include_all)
645                         return drhd->iommu;
646         }
647
648         return NULL;
649 }
650
651 static void domain_flush_cache(struct dmar_domain *domain,
652                                void *addr, int size)
653 {
654         if (!domain->iommu_coherency)
655                 clflush_cache_range(addr, size);
656 }
657
658 /* Gets context entry for a given bus and devfn */
659 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
660                 u8 bus, u8 devfn)
661 {
662         struct root_entry *root;
663         struct context_entry *context;
664         unsigned long phy_addr;
665         unsigned long flags;
666
667         spin_lock_irqsave(&iommu->lock, flags);
668         root = &iommu->root_entry[bus];
669         context = get_context_addr_from_root(root);
670         if (!context) {
671                 context = (struct context_entry *)
672                                 alloc_pgtable_page(iommu->node);
673                 if (!context) {
674                         spin_unlock_irqrestore(&iommu->lock, flags);
675                         return NULL;
676                 }
677                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
678                 phy_addr = virt_to_phys((void *)context);
679                 set_root_value(root, phy_addr);
680                 set_root_present(root);
681                 __iommu_flush_cache(iommu, root, sizeof(*root));
682         }
683         spin_unlock_irqrestore(&iommu->lock, flags);
684         return &context[devfn];
685 }
686
687 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
688 {
689         struct root_entry *root;
690         struct context_entry *context;
691         int ret;
692         unsigned long flags;
693
694         spin_lock_irqsave(&iommu->lock, flags);
695         root = &iommu->root_entry[bus];
696         context = get_context_addr_from_root(root);
697         if (!context) {
698                 ret = 0;
699                 goto out;
700         }
701         ret = context_present(&context[devfn]);
702 out:
703         spin_unlock_irqrestore(&iommu->lock, flags);
704         return ret;
705 }
706
707 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
708 {
709         struct root_entry *root;
710         struct context_entry *context;
711         unsigned long flags;
712
713         spin_lock_irqsave(&iommu->lock, flags);
714         root = &iommu->root_entry[bus];
715         context = get_context_addr_from_root(root);
716         if (context) {
717                 context_clear_entry(&context[devfn]);
718                 __iommu_flush_cache(iommu, &context[devfn], \
719                         sizeof(*context));
720         }
721         spin_unlock_irqrestore(&iommu->lock, flags);
722 }
723
724 static void free_context_table(struct intel_iommu *iommu)
725 {
726         struct root_entry *root;
727         int i;
728         unsigned long flags;
729         struct context_entry *context;
730
731         spin_lock_irqsave(&iommu->lock, flags);
732         if (!iommu->root_entry) {
733                 goto out;
734         }
735         for (i = 0; i < ROOT_ENTRY_NR; i++) {
736                 root = &iommu->root_entry[i];
737                 context = get_context_addr_from_root(root);
738                 if (context)
739                         free_pgtable_page(context);
740         }
741         free_pgtable_page(iommu->root_entry);
742         iommu->root_entry = NULL;
743 out:
744         spin_unlock_irqrestore(&iommu->lock, flags);
745 }
746
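/*
 * Walk (and, where needed, allocate) the page table down to the pte
 * for @pfn at @target_level.  A target_level of 0 means "walk to the
 * lowest existing level", stopping early at a superpage or a
 * non-present entry.  Intermediate tables are installed with
 * cmpxchg64() so concurrent walkers need no lock.
 */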
747 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
748                                       unsigned long pfn, int target_level)
749 {
750         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
751         struct dma_pte *parent, *pte = NULL;
752         int level = agaw_to_level(domain->agaw);
753         int offset;
754
755         BUG_ON(!domain->pgd);
756         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
757         parent = domain->pgd;
758
759         while (level > 0) {
760                 void *tmp_page;
761
762                 offset = pfn_level_offset(pfn, level);
763                 pte = &parent[offset];
764                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
765                         break;
766                 if (level == target_level)
767                         break;
768
769                 if (!dma_pte_present(pte)) {
770                         uint64_t pteval;
771
772                         tmp_page = alloc_pgtable_page(domain->nid);
773
774                         if (!tmp_page)
775                                 return NULL;
776
777                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
778                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
779                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
780                                 /* Someone else set it while we were thinking; use theirs. */
781                                 free_pgtable_page(tmp_page);
782                         } else {
783                                 dma_pte_addr(pte);
784                                 domain_flush_cache(domain, pte, sizeof(*pte));
785                         }
786                 }
787                 parent = phys_to_virt(dma_pte_addr(pte));
788                 level--;
789         }
790
791         return pte;
792 }
793
794
795 /* return address's pte at specific level */
796 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
797                                          unsigned long pfn,
798                                          int level, int *large_page)
799 {
800         struct dma_pte *parent, *pte = NULL;
801         int total = agaw_to_level(domain->agaw);
802         int offset;
803
804         parent = domain->pgd;
805         while (level <= total) {
806                 offset = pfn_level_offset(pfn, total);
807                 pte = &parent[offset];
808                 if (level == total)
809                         return pte;
810
811                 if (!dma_pte_present(pte)) {
812                         *large_page = total;
813                         break;
814                 }
815
816                 if (pte->val & DMA_PTE_LARGE_PAGE) {
817                         *large_page = total;
818                         return pte;
819                 }
820
821                 parent = phys_to_virt(dma_pte_addr(pte));
822                 total--;
823         }
824         return NULL;
825 }
826
827 /* clear last level pte; a tlb flush should follow */
828 static int dma_pte_clear_range(struct dmar_domain *domain,
829                                 unsigned long start_pfn,
830                                 unsigned long last_pfn)
831 {
832         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
833         unsigned int large_page = 1;
834         struct dma_pte *first_pte, *pte;
835         int order;
836
837         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
838         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
839         BUG_ON(start_pfn > last_pfn);
840
841         /* we don't need a lock here; nobody else touches the iova range */
842         do {
843                 large_page = 1;
844                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
845                 if (!pte) {
846                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
847                         continue;
848                 }
849                 do {
850                         dma_clear_pte(pte);
851                         start_pfn += lvl_to_nr_pages(large_page);
852                         pte++;
853                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
854
855                 domain_flush_cache(domain, first_pte,
856                                    (void *)pte - (void *)first_pte);
857
858         } while (start_pfn && start_pfn <= last_pfn);
859
860         order = (large_page - 1) * 9;
861         return order;
862 }
863
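/*
 * Recursively free the page-table pages backing [start_pfn, last_pfn].
 * A table page is freed (and its parent pte cleared) only when the
 * range covers everything that page maps; the leaf ptes themselves
 * are expected to have been cleared by dma_pte_clear_range() already.
 */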
864 static void dma_pte_free_level(struct dmar_domain *domain, int level,
865                                struct dma_pte *pte, unsigned long pfn,
866                                unsigned long start_pfn, unsigned long last_pfn)
867 {
868         pfn = max(start_pfn, pfn);
869         pte = &pte[pfn_level_offset(pfn, level)];
870
871         do {
872                 unsigned long level_pfn;
873                 struct dma_pte *level_pte;
874
875                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
876                         goto next;
877
878                 level_pfn = pfn & level_mask(level - 1);
879                 level_pte = phys_to_virt(dma_pte_addr(pte));
880
881                 if (level > 2)
882                         dma_pte_free_level(domain, level - 1, level_pte,
883                                            level_pfn, start_pfn, last_pfn);
884
885                 /* If range covers entire pagetable, free it */
886                 if (!(start_pfn > level_pfn ||
887                       last_pfn < level_pfn + level_size(level) - 1)) {
888                         dma_clear_pte(pte);
889                         domain_flush_cache(domain, pte, sizeof(*pte));
890                         free_pgtable_page(level_pte);
891                 }
892 next:
893                 pfn += level_size(level);
894         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
895 }
896
897 /* free page table pages. last level pte should already be cleared */
898 static void dma_pte_free_pagetable(struct dmar_domain *domain,
899                                    unsigned long start_pfn,
900                                    unsigned long last_pfn)
901 {
902         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
903
904         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
905         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
906         BUG_ON(start_pfn > last_pfn);
907
908         /* We don't need a lock here; nobody else touches the iova range */
909         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
910                            domain->pgd, 0, start_pfn, last_pfn);
911
912         /* free pgd */
913         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
914                 free_pgtable_page(domain->pgd);
915                 domain->pgd = NULL;
916         }
917 }
918
919 /* iommu handling */
920 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
921 {
922         struct root_entry *root;
923         unsigned long flags;
924
925         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
926         if (!root)
927                 return -ENOMEM;
928
929         __iommu_flush_cache(iommu, root, ROOT_SIZE);
930
931         spin_lock_irqsave(&iommu->lock, flags);
932         iommu->root_entry = root;
933         spin_unlock_irqrestore(&iommu->lock, flags);
934
935         return 0;
936 }
937
938 static void iommu_set_root_entry(struct intel_iommu *iommu)
939 {
940         void *addr;
941         u32 sts;
942         unsigned long flag;
943
944         addr = iommu->root_entry;
945
946         raw_spin_lock_irqsave(&iommu->register_lock, flag);
947         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
948
949         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
950
951         /* Make sure hardware completes it */
952         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
953                       readl, (sts & DMA_GSTS_RTPS), sts);
954
955         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
956 }
957
958 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
959 {
960         u32 val;
961         unsigned long flag;
962
963         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
964                 return;
965
966         raw_spin_lock_irqsave(&iommu->register_lock, flag);
967         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
968
969         /* Make sure hardware completes it */
970         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
971                       readl, (!(val & DMA_GSTS_WBFS)), val);
972
973         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
974 }
975
976 /* the return value determines whether we need a write buffer flush */
977 static void __iommu_flush_context(struct intel_iommu *iommu,
978                                   u16 did, u16 source_id, u8 function_mask,
979                                   u64 type)
980 {
981         u64 val = 0;
982         unsigned long flag;
983
984         switch (type) {
985         case DMA_CCMD_GLOBAL_INVL:
986                 val = DMA_CCMD_GLOBAL_INVL;
987                 break;
988         case DMA_CCMD_DOMAIN_INVL:
989                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
990                 break;
991         case DMA_CCMD_DEVICE_INVL:
992                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
993                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
994                 break;
995         default:
996                 BUG();
997         }
998         val |= DMA_CCMD_ICC;
999
1000         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1001         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1002
1003         /* Make sure hardware completes it */
1004         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1005                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1006
1007         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1008 }
1009
1010 /* the return value determines whether we need a write buffer flush */
1011 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1012                                 u64 addr, unsigned int size_order, u64 type)
1013 {
1014         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1015         u64 val = 0, val_iva = 0;
1016         unsigned long flag;
1017
1018         switch (type) {
1019         case DMA_TLB_GLOBAL_FLUSH:
1020                 /* global flush doesn't need to set IVA_REG */
1021                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1022                 break;
1023         case DMA_TLB_DSI_FLUSH:
1024                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1025                 break;
1026         case DMA_TLB_PSI_FLUSH:
1027                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1028                 /* Note: always flush non-leaf currently */
1029                 val_iva = size_order | addr;
1030                 break;
1031         default:
1032                 BUG();
1033         }
1034         /* Note: set drain read/write */
1035 #if 0
1036         /*
1037          * This is probably only here to be extra safe; it looks like we
1038          * can ignore it without any impact.
1039          */
1040         if (cap_read_drain(iommu->cap))
1041                 val |= DMA_TLB_READ_DRAIN;
1042 #endif
1043         if (cap_write_drain(iommu->cap))
1044                 val |= DMA_TLB_WRITE_DRAIN;
1045
1046         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1047         /* Note: Only uses first TLB reg currently */
1048         if (val_iva)
1049                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1050         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1051
1052         /* Make sure hardware completes it */
1053         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1054                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1055
1056         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1057
1058         /* check IOTLB invalidation granularity */
1059         if (DMA_TLB_IAIG(val) == 0)
1060                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1061         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1062                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1063                         (unsigned long long)DMA_TLB_IIRG(type),
1064                         (unsigned long long)DMA_TLB_IAIG(val));
1065 }
1066
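/*
 * A device IOTLB (ATS) can only be used when the iommu advertises
 * device-IOTLB support, queued invalidation is active, the device
 * exposes the PCIe ATS capability and a matching ATSR unit exists;
 * otherwise this returns NULL and we fall back to plain IOTLB flushes.
 */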
1067 static struct device_domain_info *iommu_support_dev_iotlb(
1068         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1069 {
1070         int found = 0;
1071         unsigned long flags;
1072         struct device_domain_info *info;
1073         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1074
1075         if (!ecap_dev_iotlb_support(iommu->ecap))
1076                 return NULL;
1077
1078         if (!iommu->qi)
1079                 return NULL;
1080
1081         spin_lock_irqsave(&device_domain_lock, flags);
1082         list_for_each_entry(info, &domain->devices, link)
1083                 if (info->bus == bus && info->devfn == devfn) {
1084                         found = 1;
1085                         break;
1086                 }
1087         spin_unlock_irqrestore(&device_domain_lock, flags);
1088
1089         if (!found || !info->dev)
1090                 return NULL;
1091
1092         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1093                 return NULL;
1094
1095         if (!dmar_find_matched_atsr_unit(info->dev))
1096                 return NULL;
1097
1098         info->iommu = iommu;
1099
1100         return info;
1101 }
1102
1103 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1104 {
1105         if (!info)
1106                 return;
1107
1108         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1109 }
1110
1111 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1112 {
1113         if (!info->dev || !pci_ats_enabled(info->dev))
1114                 return;
1115
1116         pci_disable_ats(info->dev);
1117 }
1118
1119 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1120                                   u64 addr, unsigned mask)
1121 {
1122         u16 sid, qdep;
1123         unsigned long flags;
1124         struct device_domain_info *info;
1125
1126         spin_lock_irqsave(&device_domain_lock, flags);
1127         list_for_each_entry(info, &domain->devices, link) {
1128                 if (!info->dev || !pci_ats_enabled(info->dev))
1129                         continue;
1130
1131                 sid = info->bus << 8 | info->devfn;
1132                 qdep = pci_ats_queue_depth(info->dev);
1133                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1134         }
1135         spin_unlock_irqrestore(&device_domain_lock, flags);
1136 }
1137
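/*
 * The invalidation mask computed below is the number of low-order pfn
 * bits to ignore, i.e. ilog2() of the page count rounded up to a
 * power of two, which is what page-selective invalidation expects.
 */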
1138 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1139                                   unsigned long pfn, unsigned int pages, int map)
1140 {
1141         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1142         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1143
1144         BUG_ON(pages == 0);
1145
1146         /*
1147          * Fall back to a domain-selective flush if there is no PSI support
1148          * or the size is too big.
1149          * PSI requires the region size to be a power-of-two number of pages,
1150          * and the base address to be naturally aligned to that size.
1151          */
1152         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1153                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1154                                                 DMA_TLB_DSI_FLUSH);
1155         else
1156                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1157                                                 DMA_TLB_PSI_FLUSH);
1158
1159         /*
1160          * In caching mode, changes of pages from non-present to present require
1161          * a flush. However, the device IOTLB doesn't need to be flushed here.
1162          */
1163         if (!cap_caching_mode(iommu->cap) || !map)
1164                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1165 }
1166
1167 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1168 {
1169         u32 pmen;
1170         unsigned long flags;
1171
1172         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1173         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1174         pmen &= ~DMA_PMEN_EPM;
1175         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1176
1177         /* wait for the protected region status bit to clear */
1178         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1179                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1180
1181         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1182 }
1183
1184 static int iommu_enable_translation(struct intel_iommu *iommu)
1185 {
1186         u32 sts;
1187         unsigned long flags;
1188
1189         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1190         iommu->gcmd |= DMA_GCMD_TE;
1191         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1192
1193         /* Make sure hardware completes it */
1194         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1195                       readl, (sts & DMA_GSTS_TES), sts);
1196
1197         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1198         return 0;
1199 }
1200
1201 static int iommu_disable_translation(struct intel_iommu *iommu)
1202 {
1203         u32 sts;
1204         unsigned long flag;
1205
1206         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1207         iommu->gcmd &= ~DMA_GCMD_TE;
1208         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1209
1210         /* Make sure hardware completes it */
1211         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1212                       readl, (!(sts & DMA_GSTS_TES)), sts);
1213
1214         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1215         return 0;
1216 }
1217
1218
1219 static int iommu_init_domains(struct intel_iommu *iommu)
1220 {
1221         unsigned long ndomains;
1222         unsigned long nlongs;
1223
1224         ndomains = cap_ndoms(iommu->cap);
1225         pr_debug("IOMMU %d: Number of Domains supportd <%ld>\n", iommu->seq_id,
1226                         ndomains);
1227         nlongs = BITS_TO_LONGS(ndomains);
1228
1229         spin_lock_init(&iommu->lock);
1230
1231         /* TBD: there might be 64K domains,
1232          * consider other allocation for future chip
1233          */
1234         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1235         if (!iommu->domain_ids) {
1236                 printk(KERN_ERR "Allocating domain id array failed\n");
1237                 return -ENOMEM;
1238         }
1239         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1240                         GFP_KERNEL);
1241         if (!iommu->domains) {
1242                 printk(KERN_ERR "Allocating domain array failed\n");
1243                 return -ENOMEM;
1244         }
1245
1246         /*
1247          * If caching mode is set, then invalid translations are tagged
1248          * with domain id 0. Hence we need to pre-allocate it.
1249          */
1250         if (cap_caching_mode(iommu->cap))
1251                 set_bit(0, iommu->domain_ids);
1252         return 0;
1253 }
1254
1255
1256 static void domain_exit(struct dmar_domain *domain);
1257 static void vm_domain_exit(struct dmar_domain *domain);
1258
1259 void free_dmar_iommu(struct intel_iommu *iommu)
1260 {
1261         struct dmar_domain *domain;
1262         int i;
1263         unsigned long flags;
1264
1265         if ((iommu->domains) && (iommu->domain_ids)) {
1266                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1267                         domain = iommu->domains[i];
1268                         clear_bit(i, iommu->domain_ids);
1269
1270                         spin_lock_irqsave(&domain->iommu_lock, flags);
1271                         if (--domain->iommu_count == 0) {
1272                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1273                                         vm_domain_exit(domain);
1274                                 else
1275                                         domain_exit(domain);
1276                         }
1277                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1278                 }
1279         }
1280
1281         if (iommu->gcmd & DMA_GCMD_TE)
1282                 iommu_disable_translation(iommu);
1283
1284         if (iommu->irq) {
1285                 irq_set_handler_data(iommu->irq, NULL);
1286                 /* This will mask the irq */
1287                 free_irq(iommu->irq, iommu);
1288                 destroy_irq(iommu->irq);
1289         }
1290
1291         kfree(iommu->domains);
1292         kfree(iommu->domain_ids);
1293
1294         g_iommus[iommu->seq_id] = NULL;
1295
1296         /* if all iommus are freed, free g_iommus */
1297         for (i = 0; i < g_num_of_iommus; i++) {
1298                 if (g_iommus[i])
1299                         break;
1300         }
1301
1302         if (i == g_num_of_iommus)
1303                 kfree(g_iommus);
1304
1305         /* free context mapping */
1306         free_context_table(iommu);
1307 }
1308
1309 static struct dmar_domain *alloc_domain(void)
1310 {
1311         struct dmar_domain *domain;
1312
1313         domain = alloc_domain_mem();
1314         if (!domain)
1315                 return NULL;
1316
1317         domain->nid = -1;
1318         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1319         domain->flags = 0;
1320
1321         return domain;
1322 }
1323
1324 static int iommu_attach_domain(struct dmar_domain *domain,
1325                                struct intel_iommu *iommu)
1326 {
1327         int num;
1328         unsigned long ndomains;
1329         unsigned long flags;
1330
1331         ndomains = cap_ndoms(iommu->cap);
1332
1333         spin_lock_irqsave(&iommu->lock, flags);
1334
1335         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1336         if (num >= ndomains) {
1337                 spin_unlock_irqrestore(&iommu->lock, flags);
1338                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1339                 return -ENOMEM;
1340         }
1341
1342         domain->id = num;
1343         set_bit(num, iommu->domain_ids);
1344         set_bit(iommu->seq_id, &domain->iommu_bmp);
1345         iommu->domains[num] = domain;
1346         spin_unlock_irqrestore(&iommu->lock, flags);
1347
1348         return 0;
1349 }
1350
1351 static void iommu_detach_domain(struct dmar_domain *domain,
1352                                 struct intel_iommu *iommu)
1353 {
1354         unsigned long flags;
1355         int num, ndomains;
1356         int found = 0;
1357
1358         spin_lock_irqsave(&iommu->lock, flags);
1359         ndomains = cap_ndoms(iommu->cap);
1360         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1361                 if (iommu->domains[num] == domain) {
1362                         found = 1;
1363                         break;
1364                 }
1365         }
1366
1367         if (found) {
1368                 clear_bit(num, iommu->domain_ids);
1369                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1370                 iommu->domains[num] = NULL;
1371         }
1372         spin_unlock_irqrestore(&iommu->lock, flags);
1373 }
1374
1375 static struct iova_domain reserved_iova_list;
1376 static struct lock_class_key reserved_rbtree_key;
1377
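/*
 * Pre-reserve IOVA ranges that no domain should ever hand out to a
 * device: the IOAPIC MMIO window and every PCI MMIO resource (so DMA
 * cannot alias peer device BARs).  Each new domain copies these via
 * domain_reserve_special_ranges().
 */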
1378 static int dmar_init_reserved_ranges(void)
1379 {
1380         struct pci_dev *pdev = NULL;
1381         struct iova *iova;
1382         int i;
1383
1384         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1385
1386         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1387                 &reserved_rbtree_key);
1388
1389         /* IOAPIC ranges shouldn't be accessed by DMA */
1390         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1391                 IOVA_PFN(IOAPIC_RANGE_END));
1392         if (!iova) {
1393                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1394                 return -ENODEV;
1395         }
1396
1397         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1398         for_each_pci_dev(pdev) {
1399                 struct resource *r;
1400
1401                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1402                         r = &pdev->resource[i];
1403                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1404                                 continue;
1405                         iova = reserve_iova(&reserved_iova_list,
1406                                             IOVA_PFN(r->start),
1407                                             IOVA_PFN(r->end));
1408                         if (!iova) {
1409                                 printk(KERN_ERR "Reserve iova failed\n");
1410                                 return -ENODEV;
1411                         }
1412                 }
1413         }
1414         return 0;
1415 }
1416
1417 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1418 {
1419         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1420 }
1421
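/*
 * Round a guest address width up to the next width the page-table
 * geometry can represent (12 + 9*n bits: 30, 39, 48, 57), capped at
 * 64.  For example, gaw == 36 gives r == 6 and an adjusted width of
 * 39, while gaw == 48 is already aligned and stays 48.
 */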
1422 static inline int guestwidth_to_adjustwidth(int gaw)
1423 {
1424         int agaw;
1425         int r = (gaw - 12) % 9;
1426
1427         if (r == 0)
1428                 agaw = gaw;
1429         else
1430                 agaw = gaw + 9 - r;
1431         if (agaw > 64)
1432                 agaw = 64;
1433         return agaw;
1434 }
1435
1436 static int domain_init(struct dmar_domain *domain, int guest_width)
1437 {
1438         struct intel_iommu *iommu;
1439         int adjust_width, agaw;
1440         unsigned long sagaw;
1441
1442         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1443         spin_lock_init(&domain->iommu_lock);
1444
1445         domain_reserve_special_ranges(domain);
1446
1447         /* calculate AGAW */
1448         iommu = domain_get_iommu(domain);
1449         if (guest_width > cap_mgaw(iommu->cap))
1450                 guest_width = cap_mgaw(iommu->cap);
1451         domain->gaw = guest_width;
1452         adjust_width = guestwidth_to_adjustwidth(guest_width);
1453         agaw = width_to_agaw(adjust_width);
1454         sagaw = cap_sagaw(iommu->cap);
1455         if (!test_bit(agaw, &sagaw)) {
1456                 /* hardware doesn't support it, choose a bigger one */
1457                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1458                 agaw = find_next_bit(&sagaw, 5, agaw);
1459                 if (agaw >= 5)
1460                         return -ENODEV;
1461         }
1462         domain->agaw = agaw;
1463         INIT_LIST_HEAD(&domain->devices);
1464
1465         if (ecap_coherent(iommu->ecap))
1466                 domain->iommu_coherency = 1;
1467         else
1468                 domain->iommu_coherency = 0;
1469
1470         if (ecap_sc_support(iommu->ecap))
1471                 domain->iommu_snooping = 1;
1472         else
1473                 domain->iommu_snooping = 0;
1474
1475         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1476         domain->iommu_count = 1;
1477         domain->nid = iommu->node;
1478
1479         /* always allocate the top pgd */
1480         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1481         if (!domain->pgd)
1482                 return -ENOMEM;
1483         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1484         return 0;
1485 }
1486
1487 static void domain_exit(struct dmar_domain *domain)
1488 {
1489         struct dmar_drhd_unit *drhd;
1490         struct intel_iommu *iommu;
1491
1492         /* Domain 0 is reserved, so don't process it */
1493         if (!domain)
1494                 return;
1495
1496         /* Flush any lazy unmaps that may reference this domain */
1497         if (!intel_iommu_strict)
1498                 flush_unmaps_timeout(0);
1499
1500         domain_remove_dev_info(domain);
1501         /* destroy iovas */
1502         put_iova_domain(&domain->iovad);
1503
1504         /* clear ptes */
1505         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1506
1507         /* free page tables */
1508         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1509
1510         for_each_active_iommu(iommu, drhd)
1511                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1512                         iommu_detach_domain(domain, iommu);
1513
1514         free_domain_mem(domain);
1515 }
1516
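/*
 * Install the context entry for (segment, bus, devfn) so the device
 * translates through @domain: pick (or reuse) a domain id on this
 * iommu, point the entry at the domain's page tables (or program
 * pass-through), enable the device IOTLB if available, and flush the
 * context/IOTLB caches as caching mode requires.
 */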
1517 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1518                                  u8 bus, u8 devfn, int translation)
1519 {
1520         struct context_entry *context;
1521         unsigned long flags;
1522         struct intel_iommu *iommu;
1523         struct dma_pte *pgd;
1524         unsigned long num;
1525         unsigned long ndomains;
1526         int id;
1527         int agaw;
1528         struct device_domain_info *info = NULL;
1529
1530         pr_debug("Set context mapping for %02x:%02x.%d\n",
1531                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1532
1533         BUG_ON(!domain->pgd);
1534         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1535                translation != CONTEXT_TT_MULTI_LEVEL);
1536
1537         iommu = device_to_iommu(segment, bus, devfn);
1538         if (!iommu)
1539                 return -ENODEV;
1540
1541         context = device_to_context_entry(iommu, bus, devfn);
1542         if (!context)
1543                 return -ENOMEM;
1544         spin_lock_irqsave(&iommu->lock, flags);
1545         if (context_present(context)) {
1546                 spin_unlock_irqrestore(&iommu->lock, flags);
1547                 return 0;
1548         }
1549
1550         id = domain->id;
1551         pgd = domain->pgd;
1552
1553         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1554             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1555                 int found = 0;
1556
1557                 /* find an available domain id for this device in iommu */
1558                 ndomains = cap_ndoms(iommu->cap);
1559                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1560                         if (iommu->domains[num] == domain) {
1561                                 id = num;
1562                                 found = 1;
1563                                 break;
1564                         }
1565                 }
1566
1567                 if (found == 0) {
1568                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1569                         if (num >= ndomains) {
1570                                 spin_unlock_irqrestore(&iommu->lock, flags);
1571                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1572                                 return -EFAULT;
1573                         }
1574
1575                         set_bit(num, iommu->domain_ids);
1576                         iommu->domains[num] = domain;
1577                         id = num;
1578                 }
1579
1580                 /* Skip top levels of page tables for
1581                  * iommus which have a smaller agaw than the default.
1582                  * Unnecessary for PT mode.
1583                  */
1584                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1585                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1586                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1587                                 if (!dma_pte_present(pgd)) {
1588                                         spin_unlock_irqrestore(&iommu->lock, flags);
1589                                         return -ENOMEM;
1590                                 }
1591                         }
1592                 }
1593         }
1594
1595         context_set_domain_id(context, id);
1596
1597         if (translation != CONTEXT_TT_PASS_THROUGH) {
1598                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1599                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1600                                      CONTEXT_TT_MULTI_LEVEL;
1601         }
1602         /*
1603          * In pass-through mode, AW must be programmed to indicate the largest
1604          * AGAW value supported by hardware, and ASR is ignored by hardware.
1605          */
1606         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1607                 context_set_address_width(context, iommu->msagaw);
1608         else {
1609                 context_set_address_root(context, virt_to_phys(pgd));
1610                 context_set_address_width(context, iommu->agaw);
1611         }
1612
1613         context_set_translation_type(context, translation);
1614         context_set_fault_enable(context);
1615         context_set_present(context);
1616         domain_flush_cache(domain, context, sizeof(*context));
1617
1618         /*
1619          * It's a non-present to present mapping. If hardware doesn't cache
1620          * non-present entries, we only need to flush the write-buffer. If it
1621          * _does_ cache non-present entries, then it does so in the special
1622          * domain #0, which we have to flush:
1623          */
1624         if (cap_caching_mode(iommu->cap)) {
1625                 iommu->flush.flush_context(iommu, 0,
1626                                            (((u16)bus) << 8) | devfn,
1627                                            DMA_CCMD_MASK_NOBIT,
1628                                            DMA_CCMD_DEVICE_INVL);
1629                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1630         } else {
1631                 iommu_flush_write_buffer(iommu);
1632         }
1633         iommu_enable_dev_iotlb(info);
1634         spin_unlock_irqrestore(&iommu->lock, flags);
1635
1636         spin_lock_irqsave(&domain->iommu_lock, flags);
1637         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1638                 domain->iommu_count++;
1639                 if (domain->iommu_count == 1)
1640                         domain->nid = iommu->node;
1641                 domain_update_iommu_cap(domain);
1642         }
1643         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1644         return 0;
1645 }
1646
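/*
 * Program context entries for the device itself and, when it sits behind a
 * PCIe-to-PCI bridge, for every bridge on the path to the root as well,
 * since DMA from such a device may arrive tagged with the bridge's
 * source-id.
 */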
1647 static int
1648 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1649                         int translation)
1650 {
1651         int ret;
1652         struct pci_dev *tmp, *parent;
1653
1654         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1655                                          pdev->bus->number, pdev->devfn,
1656                                          translation);
1657         if (ret)
1658                 return ret;
1659
1660         /* dependent device mapping */
1661         tmp = pci_find_upstream_pcie_bridge(pdev);
1662         if (!tmp)
1663                 return 0;
1664         /* Secondary interface's bus number and devfn 0 */
1665         parent = pdev->bus->self;
1666         while (parent != tmp) {
1667                 ret = domain_context_mapping_one(domain,
1668                                                  pci_domain_nr(parent->bus),
1669                                                  parent->bus->number,
1670                                                  parent->devfn, translation);
1671                 if (ret)
1672                         return ret;
1673                 parent = parent->bus->self;
1674         }
1675         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1676                 return domain_context_mapping_one(domain,
1677                                         pci_domain_nr(tmp->subordinate),
1678                                         tmp->subordinate->number, 0,
1679                                         translation);
1680         else /* this is a legacy PCI bridge */
1681                 return domain_context_mapping_one(domain,
1682                                                   pci_domain_nr(tmp->bus),
1683                                                   tmp->bus->number,
1684                                                   tmp->devfn,
1685                                                   translation);
1686 }
1687
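/*
 * Check whether the device (and, if it sits behind a PCIe-to-PCI bridge,
 * every bridge on the path) already has a context entry present.
 */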
1688 static int domain_context_mapped(struct pci_dev *pdev)
1689 {
1690         int ret;
1691         struct pci_dev *tmp, *parent;
1692         struct intel_iommu *iommu;
1693
1694         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1695                                 pdev->devfn);
1696         if (!iommu)
1697                 return -ENODEV;
1698
1699         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1700         if (!ret)
1701                 return ret;
1702         /* dependent device mapping */
1703         tmp = pci_find_upstream_pcie_bridge(pdev);
1704         if (!tmp)
1705                 return ret;
1706         /* Secondary interface's bus number and devfn 0 */
1707         parent = pdev->bus->self;
1708         while (parent != tmp) {
1709                 ret = device_context_mapped(iommu, parent->bus->number,
1710                                             parent->devfn);
1711                 if (!ret)
1712                         return ret;
1713                 parent = parent->bus->self;
1714         }
1715         if (pci_is_pcie(tmp))
1716                 return device_context_mapped(iommu, tmp->subordinate->number,
1717                                              0);
1718         else
1719                 return device_context_mapped(iommu, tmp->bus->number,
1720                                              tmp->devfn);
1721 }
1722
1723 /* Returns a number of VTD pages, but aligned to MM page size */
1724 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1725                                             size_t size)
1726 {
1727         host_addr &= ~PAGE_MASK;
1728         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1729 }
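/*
 * Example (assuming 4KiB MM pages): host_addr = 0x1234, size = 0x2000
 * gives PAGE_ALIGN(0x234 + 0x2000) = 0x3000, i.e. 3 VT-d pages.
 */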
1730
1731 /* Return largest possible superpage level for a given mapping */
1732 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1733                                           unsigned long iov_pfn,
1734                                           unsigned long phy_pfn,
1735                                           unsigned long pages)
1736 {
1737         int support, level = 1;
1738         unsigned long pfnmerge;
1739
1740         support = domain->iommu_superpage;
1741
1742         /* To use a large page, the virtual *and* physical addresses
1743            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1744            of them will mean we have to use smaller pages. So just
1745            merge them and check both at once. */
1746         pfnmerge = iov_pfn | phy_pfn;
1747
1748         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1749                 pages >>= VTD_STRIDE_SHIFT;
1750                 if (!pages)
1751                         break;
1752                 pfnmerge >>= VTD_STRIDE_SHIFT;
1753                 level++;
1754                 support--;
1755         }
1756         return level;
1757 }
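/*
 * For instance, if both iov_pfn and phy_pfn are multiples of 512 and at
 * least 512 pages are being mapped, level 2 (2MiB superpages) is returned,
 * provided domain->iommu_superpage >= 1.
 */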
1758
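/*
 * __domain_mapping - install PTEs for nr_pages starting at iov_pfn
 *
 * The physical pages come either from the scatterlist sg or from a
 * contiguous range starting at phys_pfn.  Superpage PTEs are used whenever
 * the alignment, the remaining length and the hardware allow it.
 */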
1759 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1760                             struct scatterlist *sg, unsigned long phys_pfn,
1761                             unsigned long nr_pages, int prot)
1762 {
1763         struct dma_pte *first_pte = NULL, *pte = NULL;
1764         phys_addr_t uninitialized_var(pteval);
1765         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1766         unsigned long sg_res = 0;
1767         unsigned int largepage_lvl = 0;
1768         unsigned long lvl_pages = 0;
1769
1770         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1771
1772         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1773                 return -EINVAL;
1774
1775         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1776
1777         if (!sg) {
1778                 sg_res = nr_pages;
1779                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1780         }
1781
1782         while (nr_pages > 0) {
1783                 uint64_t tmp;
1784
1785                 if (!sg_res) {
1786                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
1787
1788                         sg_res = aligned_nrpages(sg->offset, sg->length);
1789                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
1790                         sg->dma_length = sg->length;
1791                         pteval = (sg_phys(sg) - pgoff) | prot;
1792                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1793                 }
1794
1795                 if (!pte) {
1796                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1797
1798                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1799                         if (!pte)
1800                                 return -ENOMEM;
1801                         /* It is a large page */
1802                         if (largepage_lvl > 1) {
1803                                 unsigned long nr_superpages, end_pfn, lvl_pages;
1804
1805                                 pteval |= DMA_PTE_LARGE_PAGE;
1806                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1807
1808                                 nr_superpages = sg_res / lvl_pages;
1809                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
1810
1811                                 /*
1812                                  * Ensure that old small page tables are
1813                                  * removed to make room for superpage(s).
1814                                  */
1815                                 dma_pte_clear_range(domain, iov_pfn, end_pfn);
1816                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn);
1817                         } else {
1818                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1819                         }
1820
1821                 }
1822                 /* We don't need a lock here; nobody else
1823                  * touches this iova range.
1824                  */
1825                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1826                 if (tmp) {
1827                         static int dumps = 5;
1828                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1829                                iov_pfn, tmp, (unsigned long long)pteval);
1830                         if (dumps) {
1831                                 dumps--;
1832                                 debug_dma_dump_mappings(NULL);
1833                         }
1834                         WARN_ON(1);
1835                 }
1836
1837                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1838
1839                 BUG_ON(nr_pages < lvl_pages);
1840                 BUG_ON(sg_res < lvl_pages);
1841
1842                 nr_pages -= lvl_pages;
1843                 iov_pfn += lvl_pages;
1844                 phys_pfn += lvl_pages;
1845                 pteval += lvl_pages * VTD_PAGE_SIZE;
1846                 sg_res -= lvl_pages;
1847
1848                 /* If the next PTE would be the first in a new page, then we
1849                    need to flush the cache on the entries we've just written.
1850                    And then we'll need to recalculate 'pte', so clear it and
1851                    let it get set again in the if (!pte) block above.
1852
1853                    If we're done (!nr_pages) we need to flush the cache too.
1854
1855                    Also if we've been setting superpages, we may need to
1856                    recalculate 'pte' and switch back to smaller pages for the
1857                    end of the mapping, if the trailing size is not enough to
1858                    use another superpage (i.e. sg_res < lvl_pages). */
1859                 pte++;
1860                 if (!nr_pages || first_pte_in_page(pte) ||
1861                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1862                         domain_flush_cache(domain, first_pte,
1863                                            (void *)pte - (void *)first_pte);
1864                         pte = NULL;
1865                 }
1866
1867                 if (!sg_res && nr_pages)
1868                         sg = sg_next(sg);
1869         }
1870         return 0;
1871 }
1872
1873 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1874                                     struct scatterlist *sg, unsigned long nr_pages,
1875                                     int prot)
1876 {
1877         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1878 }
1879
1880 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1881                                      unsigned long phys_pfn, unsigned long nr_pages,
1882                                      int prot)
1883 {
1884         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1885 }
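/*
 * Illustrative call (all values hypothetical): map one page 1:1,
 * readable and writable, at VT-d pfn 0x1000:
 *
 *	ret = domain_pfn_mapping(domain, 0x1000, 0x1000, 1,
 *				 DMA_PTE_READ | DMA_PTE_WRITE);
 */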
1886
1887 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1888 {
1889         if (!iommu)
1890                 return;
1891
1892         clear_context_table(iommu, bus, devfn);
1893         iommu->flush.flush_context(iommu, 0, 0, 0,
1894                                            DMA_CCMD_GLOBAL_INVL);
1895         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1896 }
1897
1898 static void domain_remove_dev_info(struct dmar_domain *domain)
1899 {
1900         struct device_domain_info *info;
1901         unsigned long flags;
1902         struct intel_iommu *iommu;
1903
1904         spin_lock_irqsave(&device_domain_lock, flags);
1905         while (!list_empty(&domain->devices)) {
1906                 info = list_entry(domain->devices.next,
1907                         struct device_domain_info, link);
1908                 list_del(&info->link);
1909                 list_del(&info->global);
1910                 if (info->dev)
1911                         info->dev->dev.archdata.iommu = NULL;
1912                 spin_unlock_irqrestore(&device_domain_lock, flags);
1913
1914                 iommu_disable_dev_iotlb(info);
1915                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1916                 iommu_detach_dev(iommu, info->bus, info->devfn);
1917                 free_devinfo_mem(info);
1918
1919                 spin_lock_irqsave(&device_domain_lock, flags);
1920         }
1921         spin_unlock_irqrestore(&device_domain_lock, flags);
1922 }
1923
1924 /*
1925  * find_domain
1926  * Note: struct pci_dev->dev.archdata.iommu is used to store the info
1927  */
1928 static struct dmar_domain *
1929 find_domain(struct pci_dev *pdev)
1930 {
1931         struct device_domain_info *info;
1932
1933         /* No lock here, assumes no domain exit in normal case */
1934         info = pdev->dev.archdata.iommu;
1935         if (info)
1936                 return info->domain;
1937         return NULL;
1938 }
1939
1940 /* Find or allocate a domain for the device; the returned domain is initialized. */
1941 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1942 {
1943         struct dmar_domain *domain, *found = NULL;
1944         struct intel_iommu *iommu;
1945         struct dmar_drhd_unit *drhd;
1946         struct device_domain_info *info, *tmp;
1947         struct pci_dev *dev_tmp;
1948         unsigned long flags;
1949         int bus = 0, devfn = 0;
1950         int segment;
1951         int ret;
1952
1953         domain = find_domain(pdev);
1954         if (domain)
1955                 return domain;
1956
1957         segment = pci_domain_nr(pdev->bus);
1958
1959         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1960         if (dev_tmp) {
1961                 if (pci_is_pcie(dev_tmp)) {
1962                         bus = dev_tmp->subordinate->number;
1963                         devfn = 0;
1964                 } else {
1965                         bus = dev_tmp->bus->number;
1966                         devfn = dev_tmp->devfn;
1967                 }
1968                 spin_lock_irqsave(&device_domain_lock, flags);
1969                 list_for_each_entry(info, &device_domain_list, global) {
1970                         if (info->segment == segment &&
1971                             info->bus == bus && info->devfn == devfn) {
1972                                 found = info->domain;
1973                                 break;
1974                         }
1975                 }
1976                 spin_unlock_irqrestore(&device_domain_lock, flags);
1977                 /* pcie-pci bridge already has a domain, use it */
1978                 if (found) {
1979                         domain = found;
1980                         goto found_domain;
1981                 }
1982         }
1983
1984         domain = alloc_domain();
1985         if (!domain)
1986                 goto error;
1987
1988         /* Allocate new domain for the device */
1989         drhd = dmar_find_matched_drhd_unit(pdev);
1990         if (!drhd) {
1991                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1992                         pci_name(pdev));
                     free_domain_mem(domain);       /* don't leak the freshly allocated domain */
1993                 return NULL;
1994         }
1995         iommu = drhd->iommu;
1996
1997         ret = iommu_attach_domain(domain, iommu);
1998         if (ret) {
1999                 free_domain_mem(domain);
2000                 goto error;
2001         }
2002
2003         if (domain_init(domain, gaw)) {
2004                 domain_exit(domain);
2005                 goto error;
2006         }
2007
2008         /* register pcie-to-pci device */
2009         if (dev_tmp) {
2010                 info = alloc_devinfo_mem();
2011                 if (!info) {
2012                         domain_exit(domain);
2013                         goto error;
2014                 }
2015                 info->segment = segment;
2016                 info->bus = bus;
2017                 info->devfn = devfn;
2018                 info->dev = NULL;
2019                 info->domain = domain;
2020                 /* This domain is shared by devices under p2p bridge */
2021                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2022
2023                 /* pcie-to-pci bridge already has a domain, use it */
2024                 found = NULL;
2025                 spin_lock_irqsave(&device_domain_lock, flags);
2026                 list_for_each_entry(tmp, &device_domain_list, global) {
2027                         if (tmp->segment == segment &&
2028                             tmp->bus == bus && tmp->devfn == devfn) {
2029                                 found = tmp->domain;
2030                                 break;
2031                         }
2032                 }
2033                 if (found) {
2034                         spin_unlock_irqrestore(&device_domain_lock, flags);
2035                         free_devinfo_mem(info);
2036                         domain_exit(domain);
2037                         domain = found;
2038                 } else {
2039                         list_add(&info->link, &domain->devices);
2040                         list_add(&info->global, &device_domain_list);
2041                         spin_unlock_irqrestore(&device_domain_lock, flags);
2042                 }
2043         }
2044
2045 found_domain:
2046         info = alloc_devinfo_mem();
2047         if (!info)
2048                 goto error;
2049         info->segment = segment;
2050         info->bus = pdev->bus->number;
2051         info->devfn = pdev->devfn;
2052         info->dev = pdev;
2053         info->domain = domain;
2054         spin_lock_irqsave(&device_domain_lock, flags);
2055         /* somebody else was faster; recheck under the lock */
2056         found = find_domain(pdev);
2057         if (found != NULL) {
2058                 spin_unlock_irqrestore(&device_domain_lock, flags);
2059                 if (found != domain) {
2060                         domain_exit(domain);
2061                         domain = found;
2062                 }
2063                 free_devinfo_mem(info);
2064                 return domain;
2065         }
2066         list_add(&info->link, &domain->devices);
2067         list_add(&info->global, &device_domain_list);
2068         pdev->dev.archdata.iommu = info;
2069         spin_unlock_irqrestore(&device_domain_lock, flags);
2070         return domain;
2071 error:
2072         /* recheck it here, maybe others set it */
2073         return find_domain(pdev);
2074 }
2075
2076 static int iommu_identity_mapping;
2077 #define IDENTMAP_ALL            1
2078 #define IDENTMAP_GFX            2
2079 #define IDENTMAP_AZALIA         4
2080
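/*
 * Reserve the [start, end] range in the domain's IOVA allocator and install
 * a 1:1 (identity) mapping for it, clearing any PTEs left over from an
 * overlapping physical memory mapping first.
 */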
2081 static int iommu_domain_identity_map(struct dmar_domain *domain,
2082                                      unsigned long long start,
2083                                      unsigned long long end)
2084 {
2085         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2086         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2087
2088         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2089                           dma_to_mm_pfn(last_vpfn))) {
2090                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2091                 return -ENOMEM;
2092         }
2093
2094         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2095                  start, end, domain->id);
2096         /*
2097          * RMRR range might have overlap with physical memory range,
2098          * clear it first
2099          */
2100         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2101
2102         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2103                                   last_vpfn - first_vpfn + 1,
2104                                   DMA_PTE_READ|DMA_PTE_WRITE);
2105 }
2106
2107 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2108                                       unsigned long long start,
2109                                       unsigned long long end)
2110 {
2111         struct dmar_domain *domain;
2112         int ret;
2113
2114         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2115         if (!domain)
2116                 return -ENOMEM;
2117
2118         /* For _hardware_ passthrough, don't bother. But for software
2119            passthrough, we do it anyway -- it may indicate a memory
2120            range which is reserved in E820 and thus didn't get set
2121            up to start with in si_domain */
2122         if (domain == si_domain && hw_pass_through) {
2123                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2124                        pci_name(pdev), start, end);
2125                 return 0;
2126         }
2127
2128         printk(KERN_INFO
2129                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2130                pci_name(pdev), start, end);
2131
2132         if (end < start) {
2133                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2134                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2135                         dmi_get_system_info(DMI_BIOS_VENDOR),
2136                         dmi_get_system_info(DMI_BIOS_VERSION),
2137                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2138                 ret = -EIO;
2139                 goto error;
2140         }
2141
2142         if (end >> agaw_to_width(domain->agaw)) {
2143                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2144                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2145                      agaw_to_width(domain->agaw),
2146                      dmi_get_system_info(DMI_BIOS_VENDOR),
2147                      dmi_get_system_info(DMI_BIOS_VERSION),
2148                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2149                 ret = -EIO;
2150                 goto error;
2151         }
2152
2153         ret = iommu_domain_identity_map(domain, start, end);
2154         if (ret)
2155                 goto error;
2156
2157         /* context entry init */
2158         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2159         if (ret)
2160                 goto error;
2161
2162         return 0;
2163
2164  error:
2165         domain_exit(domain);
2166         return ret;
2167 }
2168
2169 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2170         struct pci_dev *pdev)
2171 {
2172         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2173                 return 0;
2174         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2175                 rmrr->end_address);
2176 }
2177
2178 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2179 static inline void iommu_prepare_isa(void)
2180 {
2181         struct pci_dev *pdev;
2182         int ret;
2183
2184         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2185         if (!pdev)
2186                 return;
2187
2188         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2189         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2190
2191         if (ret)
2192                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2193                        "floppy might not work\n");
2194
2195 }
2196 #else
2197 static inline void iommu_prepare_isa(void)
2198 {
2199         return;
2200 }
2201 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2202
2203 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2204
2205 static int __init si_domain_work_fn(unsigned long start_pfn,
2206                                     unsigned long end_pfn, void *datax)
2207 {
2208         int *ret = datax;
2209
2210         *ret = iommu_domain_identity_map(si_domain,
2211                                          (uint64_t)start_pfn << PAGE_SHIFT,
2212                                          (uint64_t)end_pfn << PAGE_SHIFT);
2213         return *ret;
2214
2215 }
2216
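/*
 * Create the static identity (si) domain, attach it to every active IOMMU
 * and, unless hardware pass-through is used, identity-map the active memory
 * regions of every online node into it.
 */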
2217 static int __init si_domain_init(int hw)
2218 {
2219         struct dmar_drhd_unit *drhd;
2220         struct intel_iommu *iommu;
2221         int nid, ret = 0;
2222
2223         si_domain = alloc_domain();
2224         if (!si_domain)
2225                 return -EFAULT;
2226
2227         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2228
2229         for_each_active_iommu(iommu, drhd) {
2230                 ret = iommu_attach_domain(si_domain, iommu);
2231                 if (ret) {
2232                         domain_exit(si_domain);
2233                         return -EFAULT;
2234                 }
2235         }
2236
2237         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2238                 domain_exit(si_domain);
2239                 return -EFAULT;
2240         }
2241
2242         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2243
2244         if (hw)
2245                 return 0;
2246
2247         for_each_online_node(nid) {
2248                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2249                 if (ret)
2250                         return ret;
2251         }
2252
2253         return 0;
2254 }
2255
2256 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2257                                           struct pci_dev *pdev);
2258 static int identity_mapping(struct pci_dev *pdev)
2259 {
2260         struct device_domain_info *info;
2261
2262         if (likely(!iommu_identity_mapping))
2263                 return 0;
2264
2265         info = pdev->dev.archdata.iommu;
2266         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2267                 return (info->domain == si_domain);
2268
2269         return 0;
2270 }
2271
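/*
 * Attach the device to the domain: allocate and link its
 * device_domain_info and program the context entries; everything is rolled
 * back if the context mapping fails.
 */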
2272 static int domain_add_dev_info(struct dmar_domain *domain,
2273                                struct pci_dev *pdev,
2274                                int translation)
2275 {
2276         struct device_domain_info *info;
2277         unsigned long flags;
2278         int ret;
2279
2280         info = alloc_devinfo_mem();
2281         if (!info)
2282                 return -ENOMEM;
2283
2284         info->segment = pci_domain_nr(pdev->bus);
2285         info->bus = pdev->bus->number;
2286         info->devfn = pdev->devfn;
2287         info->dev = pdev;
2288         info->domain = domain;
2289
2290         spin_lock_irqsave(&device_domain_lock, flags);
2291         list_add(&info->link, &domain->devices);
2292         list_add(&info->global, &device_domain_list);
2293         pdev->dev.archdata.iommu = info;
2294         spin_unlock_irqrestore(&device_domain_lock, flags);
2295
2296         ret = domain_context_mapping(domain, pdev, translation);
2297         if (ret) {
2298                 spin_lock_irqsave(&device_domain_lock, flags);
2299                 list_del(&info->link);
2300                 list_del(&info->global);
2301                 pdev->dev.archdata.iommu = NULL;
2302                 spin_unlock_irqrestore(&device_domain_lock, flags);
2303                 free_devinfo_mem(info);
2304                 return ret;
2305         }
2306
2307         return 0;
2308 }
2309
2310 static bool device_has_rmrr(struct pci_dev *dev)
2311 {
2312         struct dmar_rmrr_unit *rmrr;
2313         int i;
2314
2315         for_each_rmrr_units(rmrr) {
2316                 for (i = 0; i < rmrr->devices_cnt; i++) {
2317                         /*
2318                          * Return TRUE if this RMRR contains the device that
2319                          * is passed in.
2320                          */
2321                         if (rmrr->devices[i] == dev)
2322                                 return true;
2323                 }
2324         }
2325         return false;
2326 }
2327
2328 /*
2329  * There are a couple cases where we need to restrict the functionality of
2330  * devices associated with RMRRs.  The first is when evaluating a device for
2331  * identity mapping because problems exist when devices are moved in and out
2332  * of domains and their respective RMRR information is lost.  This means that
2333  * a device with associated RMRRs will never be in a "passthrough" domain.
2334  * The second is use of the device through the IOMMU API.  This interface
2335  * expects to have full control of the IOVA space for the device.  We cannot
2336  * satisfy both the requirement that RMRR access is maintained and have an
2337  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2338  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2339  * We therefore prevent devices associated with an RMRR from participating in
2340  * the IOMMU API, which eliminates them from device assignment.
2341  *
2342  * In both cases we assume that PCI USB devices with RMRRs have them largely
2343  * for historical reasons and that the RMRR space is not actively used post
2344  * boot.  This exclusion may change if vendors begin to abuse it.
2345  */
2346 static bool device_is_rmrr_locked(struct pci_dev *pdev)
2347 {
2348         return device_has_rmrr(pdev) &&
2349                 (pdev->class >> 8) != PCI_CLASS_SERIAL_USB;
2350 }
2351
2352 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2353 {
2354
2355         if (device_is_rmrr_locked(pdev))
2356                 return 0;
2357
2358         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2359                 return 1;
2360
2361         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2362                 return 1;
2363
2364         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2365                 return 0;
2366
2367         /*
2368          * We want to start off with all devices in the 1:1 domain, and
2369          * take them out later if we find they can't access all of memory.
2370          *
2371          * However, we can't do this for PCI devices behind bridges,
2372          * because all PCI devices behind the same bridge will end up
2373          * with the same source-id on their transactions.
2374          *
2375          * Practically speaking, we can't change things around for these
2376          * devices at run-time, because we can't be sure there'll be no
2377          * DMA transactions in flight for any of their siblings.
2378          * 
2379          * So PCI devices (unless they're on the root bus) as well as
2380          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2381          * the 1:1 domain, just in _case_ one of their siblings turns out
2382          * not to be able to map all of memory.
2383          */
2384         if (!pci_is_pcie(pdev)) {
2385                 if (!pci_is_root_bus(pdev->bus))
2386                         return 0;
2387                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2388                         return 0;
2389         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2390                 return 0;
2391
2392         /* 
2393          * At boot time, we don't yet know if devices will be 64-bit capable.
2394          * Assume that they will -- if they turn out not to be, then we can 
2395          * take them out of the 1:1 domain later.
2396          */
2397         if (!startup) {
2398                 /*
2399                  * If the device's dma_mask is less than the system's memory
2400                  * size then this is not a candidate for identity mapping.
2401                  */
2402                 u64 dma_mask = pdev->dma_mask;
2403
2404                 if (pdev->dev.coherent_dma_mask &&
2405                     pdev->dev.coherent_dma_mask < dma_mask)
2406                         dma_mask = pdev->dev.coherent_dma_mask;
2407
2408                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2409         }
2410
2411         return 1;
2412 }
2413
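/*
 * Put every device that qualifies (see iommu_should_identity_map()) into
 * the static identity domain, using pass-through context entries when hw is
 * set and regular multi-level translation otherwise.
 */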
2414 static int __init iommu_prepare_static_identity_mapping(int hw)
2415 {
2416         struct pci_dev *pdev = NULL;
2417         int ret;
2418
2419         ret = si_domain_init(hw);
2420         if (ret)
2421                 return -EFAULT;
2422
2423         for_each_pci_dev(pdev) {
2424                 /* Skip Host/PCI Bridge devices */
2425                 if (IS_BRIDGE_HOST_DEVICE(pdev))
2426                         continue;
2427                 if (iommu_should_identity_map(pdev, 1)) {
2428                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2429                                hw ? "hardware" : "software", pci_name(pdev));
2430
2431                         ret = domain_add_dev_info(si_domain, pdev,
2432                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2433                                                      CONTEXT_TT_MULTI_LEVEL);
2434                         if (ret)
2435                                 return ret;
2436                 }
2437         }
2438
2439         return 0;
2440 }
2441
2442 static int __init init_dmars(void)
2443 {
2444         struct dmar_drhd_unit *drhd;
2445         struct dmar_rmrr_unit *rmrr;
2446         struct pci_dev *pdev;
2447         struct intel_iommu *iommu;
2448         int i, ret;
2449
2450         /*
2451          * for each drhd
2452          *    allocate root
2453          *    initialize and program root entry to not present
2454          * endfor
2455          */
2456         for_each_drhd_unit(drhd) {
2457                 g_num_of_iommus++;
2458                 /*
2459                  * Lock not needed, as this is only incremented in the
2460                  * single-threaded kernel __init code path; all other
2461                  * accesses are read-only.
2462                  */
2463         }
2464
2465         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2466                         GFP_KERNEL);
2467         if (!g_iommus) {
2468                 printk(KERN_ERR "Allocating global iommu array failed\n");
2469                 ret = -ENOMEM;
2470                 goto error;
2471         }
2472
2473         deferred_flush = kzalloc(g_num_of_iommus *
2474                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2475         if (!deferred_flush) {
2476                 ret = -ENOMEM;
2477                 goto error;
2478         }
2479
2480         for_each_drhd_unit(drhd) {
2481                 if (drhd->ignored)
2482                         continue;
2483
2484                 iommu = drhd->iommu;
2485                 g_iommus[iommu->seq_id] = iommu;
2486
2487                 ret = iommu_init_domains(iommu);
2488                 if (ret)
2489                         goto error;
2490
2491                 /*
2492                  * TBD:
2493                  * we could share the same root & context tables
2494                  * among all IOMMUs; need to split it later.
2495                  */
2496                 ret = iommu_alloc_root_entry(iommu);
2497                 if (ret) {
2498                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2499                         goto error;
2500                 }
2501                 if (!ecap_pass_through(iommu->ecap))
2502                         hw_pass_through = 0;
2503         }
2504
2505         /*
2506          * Start from a sane iommu hardware state.
2507          */
2508         for_each_drhd_unit(drhd) {
2509                 if (drhd->ignored)
2510                         continue;
2511
2512                 iommu = drhd->iommu;
2513
2514                 /*
2515                  * If the queued invalidation is already initialized by us
2516                  * (for example, while enabling interrupt-remapping) then
2517          * things are already rolling from a sane state.
2518                  */
2519                 if (iommu->qi)
2520                         continue;
2521
2522                 /*
2523                  * Clear any previous faults.
2524                  */
2525                 dmar_fault(-1, iommu);
2526                 /*
2527                  * Disable queued invalidation if supported and already enabled
2528                  * before OS handover.
2529                  */
2530                 dmar_disable_qi(iommu);
2531         }
2532
2533         for_each_drhd_unit(drhd) {
2534                 if (drhd->ignored)
2535                         continue;
2536
2537                 iommu = drhd->iommu;
2538
2539                 if (dmar_enable_qi(iommu)) {
2540                         /*
2541                          * Queued Invalidate not enabled, use Register Based
2542                          * Invalidate
2543                          */
2544                         iommu->flush.flush_context = __iommu_flush_context;
2545                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2546                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2547                                "invalidation\n",
2548                                 iommu->seq_id,
2549                                (unsigned long long)drhd->reg_base_addr);
2550                 } else {
2551                         iommu->flush.flush_context = qi_flush_context;
2552                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2553                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2554                                "invalidation\n",
2555                                 iommu->seq_id,
2556                                (unsigned long long)drhd->reg_base_addr);
2557                 }
2558         }
2559
2560         if (iommu_pass_through)
2561                 iommu_identity_mapping |= IDENTMAP_ALL;
2562
2563 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2564         iommu_identity_mapping |= IDENTMAP_GFX;
2565 #endif
2566
2567         check_tylersburg_isoch();
2568
2569         /*
2570          * If any identity mapping (pass-through, gfx, azalia or all devices)
2571          * was requested, set up the static identity domain and its context
2572          * entries now, before the per-device RMRR and ISA mappings below.
2573          */
2574         if (iommu_identity_mapping) {
2575                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2576                 if (ret) {
2577                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2578                         goto error;
2579                 }
2580         }
2581         /*
2582          * For each rmrr
2583          *   for each dev attached to rmrr
2584          *   do
2585          *     locate drhd for dev, alloc domain for dev
2586          *     allocate free domain
2587          *     allocate page table entries for rmrr
2588          *     if context not allocated for bus
2589          *           allocate and init context
2590          *           set present in root table for this bus
2591          *     init context with domain, translation etc
2592          *    endfor
2593          * endfor
2594          */
2595         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2596         for_each_rmrr_units(rmrr) {
2597                 for (i = 0; i < rmrr->devices_cnt; i++) {
2598                         pdev = rmrr->devices[i];
2599                         /*
2600                          * some BIOSes list non-existent devices in the
2601                          * DMAR table.
2602                          */
2603                         if (!pdev)
2604                                 continue;
2605                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2606                         if (ret)
2607                                 printk(KERN_ERR
2608                                        "IOMMU: mapping reserved region failed\n");
2609                 }
2610         }
2611
2612         iommu_prepare_isa();
2613
2614         /*
2615          * for each drhd
2616          *   enable fault log
2617          *   global invalidate context cache
2618          *   global invalidate iotlb
2619          *   enable translation
2620          */
2621         for_each_drhd_unit(drhd) {
2622                 if (drhd->ignored) {
2623                         /*
2624                          * we always have to disable PMRs or DMA may fail on
2625                          * this device
2626                          */
2627                         if (force_on)
2628                                 iommu_disable_protect_mem_regions(drhd->iommu);
2629                         continue;
2630                 }
2631                 iommu = drhd->iommu;
2632
2633                 iommu_flush_write_buffer(iommu);
2634
2635                 ret = dmar_set_interrupt(iommu);
2636                 if (ret)
2637                         goto error;
2638
2639                 iommu_set_root_entry(iommu);
2640
2641                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2642                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2643
2644                 ret = iommu_enable_translation(iommu);
2645                 if (ret)
2646                         goto error;
2647
2648                 iommu_disable_protect_mem_regions(iommu);
2649         }
2650
2651         return 0;
2652 error:
2653         for_each_drhd_unit(drhd) {
2654                 if (drhd->ignored)
2655                         continue;
2656                 iommu = drhd->iommu;
2657                 free_iommu(iommu);
2658         }
2659         kfree(g_iommus);
2660         return ret;
2661 }
2662
2663 /* This takes a number of _MM_ pages, not VTD pages */
2664 static struct iova *intel_alloc_iova(struct device *dev,
2665                                      struct dmar_domain *domain,
2666                                      unsigned long nrpages, uint64_t dma_mask)
2667 {
2668         struct pci_dev *pdev = to_pci_dev(dev);
2669         struct iova *iova = NULL;
2670
2671         /* Restrict dma_mask to the width that the iommu can handle */
2672         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2673
2674         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2675                 /*
2676                  * First try to allocate an IO virtual address below
2677                  * DMA_BIT_MASK(32); if that fails, try allocating from
2678                  * the higher range.
2679                  */
2680                 iova = alloc_iova(&domain->iovad, nrpages,
2681                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2682                 if (iova)
2683                         return iova;
2684         }
2685         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2686         if (unlikely(!iova)) {
2687                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2688                        nrpages, pci_name(pdev));
2689                 return NULL;
2690         }
2691
2692         return iova;
2693 }
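/*
 * Callers pass the size in MM pages, e.g. as done in __intel_map_single():
 *
 *	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
 */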
2694
2695 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2696 {
2697         struct dmar_domain *domain;
2698         int ret;
2699
2700         domain = get_domain_for_dev(pdev,
2701                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2702         if (!domain) {
2703                 printk(KERN_ERR
2704                         "Allocating domain for %s failed\n", pci_name(pdev));
2705                 return NULL;
2706         }
2707
2708         /* make sure context mapping is ok */
2709         if (unlikely(!domain_context_mapped(pdev))) {
2710                 ret = domain_context_mapping(domain, pdev,
2711                                              CONTEXT_TT_MULTI_LEVEL);
2712                 if (ret) {
2713                         printk(KERN_ERR
2714                                 "Domain context map for %s failed\n",
2715                                 pci_name(pdev));
2716                         return NULL;
2717                 }
2718         }
2719
2720         return domain;
2721 }
2722
2723 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2724 {
2725         struct device_domain_info *info;
2726
2727         /* No lock here, assumes no domain exit in normal case */
2728         info = dev->dev.archdata.iommu;
2729         if (likely(info))
2730                 return info->domain;
2731
2732         return __get_valid_domain_for_dev(dev);
2733 }
2734
2735 static int iommu_dummy(struct pci_dev *pdev)
2736 {
2737         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2738 }
2739
2740 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2741 static int iommu_no_mapping(struct device *dev)
2742 {
2743         struct pci_dev *pdev;
2744         int found;
2745
2746         if (unlikely(dev->bus != &pci_bus_type))
2747                 return 1;
2748
2749         pdev = to_pci_dev(dev);
2750         if (iommu_dummy(pdev))
2751                 return 1;
2752
2753         if (!iommu_identity_mapping)
2754                 return 0;
2755
2756         found = identity_mapping(pdev);
2757         if (found) {
2758                 if (iommu_should_identity_map(pdev, 0))
2759                         return 1;
2760                 else {
2761                         /*
2762                          * A device limited to 32 bit DMA is removed from
2763                          * si_domain and falls back to non-identity mapping.
2764                          */
2765                         domain_remove_one_dev_info(si_domain, pdev);
2766                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2767                                pci_name(pdev));
2768                         return 0;
2769                 }
2770         } else {
2771                 /*
2772                  * When a 64 bit DMA capable device is detached from a VM,
2773                  * the device is put back into si_domain for identity mapping.
2774                  */
2775                 if (iommu_should_identity_map(pdev, 0)) {
2776                         int ret;
2777                         ret = domain_add_dev_info(si_domain, pdev,
2778                                                   hw_pass_through ?
2779                                                   CONTEXT_TT_PASS_THROUGH :
2780                                                   CONTEXT_TT_MULTI_LEVEL);
2781                         if (!ret) {
2782                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2783                                        pci_name(pdev));
2784                                 return 1;
2785                         }
2786                 }
2787         }
2788
2789         return 0;
2790 }
2791
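/*
 * __intel_map_single - map size bytes at paddr for DMA
 *
 * Allocates an IOVA range, installs PTEs with the protection implied by
 * dir, flushes the IOTLB (or just the write buffer) and returns the bus
 * address to hand to the device, or 0 on failure.  Devices that bypass the
 * IOMMU get the physical address back unchanged.
 */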
2792 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2793                                      size_t size, int dir, u64 dma_mask)
2794 {
2795         struct pci_dev *pdev = to_pci_dev(hwdev);
2796         struct dmar_domain *domain;
2797         phys_addr_t start_paddr;
2798         struct iova *iova;
2799         int prot = 0;
2800         int ret;
2801         struct intel_iommu *iommu;
2802         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2803
2804         BUG_ON(dir == DMA_NONE);
2805
2806         if (iommu_no_mapping(hwdev))
2807                 return paddr;
2808
2809         domain = get_valid_domain_for_dev(pdev);
2810         if (!domain)
2811                 return 0;
2812
2813         iommu = domain_get_iommu(domain);
2814         size = aligned_nrpages(paddr, size);
2815
2816         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2817         if (!iova)
2818                 goto error;
2819
2820         /*
2821          * Check if DMAR supports zero-length reads on write-only
2822          * mappings.
2823          */
2824         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2825                         !cap_zlr(iommu->cap))
2826                 prot |= DMA_PTE_READ;
2827         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2828                 prot |= DMA_PTE_WRITE;
2829         /*
2830          * paddr to (paddr + size) might span a partial page, so we map the
2831          * whole page.  Note: if two parts of one page are mapped separately,
2832          * we might end up with two guest addresses mapping to the same host
2833          * paddr, but this is not a big problem.
2834          */
2835         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2836                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2837         if (ret)
2838                 goto error;
2839
2840         /* it's a non-present to present mapping; only flush the IOTLB in caching mode */
2841         if (cap_caching_mode(iommu->cap))
2842                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2843         else
2844                 iommu_flush_write_buffer(iommu);
2845
2846         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2847         start_paddr += paddr & ~PAGE_MASK;
2848         return start_paddr;
2849
2850 error:
2851         if (iova)
2852                 __free_iova(&domain->iovad, iova);
2853         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2854                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2855         return 0;
2856 }
2857
2858 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2859                                  unsigned long offset, size_t size,
2860                                  enum dma_data_direction dir,
2861                                  struct dma_attrs *attrs)
2862 {
2863         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2864                                   dir, to_pci_dev(dev)->dma_mask);
2865 }
2866
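/*
 * Deferred unmapping: instead of flushing the IOTLB for every single unmap,
 * freed IOVAs are queued per-IOMMU in deferred_flush[] (see add_unmap()) and
 * released in one batch here, either from the unmap_timer or when the queue
 * reaches HIGH_WATER_MARK.
 */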
2867 static void flush_unmaps(void)
2868 {
2869         int i, j;
2870
2871         timer_on = 0;
2872
2873         /* just flush them all */
2874         for (i = 0; i < g_num_of_iommus; i++) {
2875                 struct intel_iommu *iommu = g_iommus[i];
2876                 if (!iommu)
2877                         continue;
2878
2879                 if (!deferred_flush[i].next)
2880                         continue;
2881
2882                 /* In caching mode, global flushes make emulation expensive */
2883                 if (!cap_caching_mode(iommu->cap))
2884                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2885                                          DMA_TLB_GLOBAL_FLUSH);
2886                 for (j = 0; j < deferred_flush[i].next; j++) {
2887                         unsigned long mask;
2888                         struct iova *iova = deferred_flush[i].iova[j];
2889                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2890
2891                         /* On real hardware multiple invalidations are expensive */
2892                         if (cap_caching_mode(iommu->cap))
2893                                 iommu_flush_iotlb_psi(iommu, domain->id,
2894                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2895                         else {
2896                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2897                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2898                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2899                         }
2900                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2901                 }
2902                 deferred_flush[i].next = 0;
2903         }
2904
2905         list_size = 0;
2906 }
2907
2908 static void flush_unmaps_timeout(unsigned long data)
2909 {
2910         unsigned long flags;
2911
2912         spin_lock_irqsave(&async_umap_flush_lock, flags);
2913         flush_unmaps();
2914         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2915 }
2916
2917 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2918 {
2919         unsigned long flags;
2920         int next, iommu_id;
2921         struct intel_iommu *iommu;
2922
2923         spin_lock_irqsave(&async_umap_flush_lock, flags);
2924         if (list_size == HIGH_WATER_MARK)
2925                 flush_unmaps();
2926
2927         iommu = domain_get_iommu(dom);
2928         iommu_id = iommu->seq_id;
2929
2930         next = deferred_flush[iommu_id].next;
2931         deferred_flush[iommu_id].domain[next] = dom;
2932         deferred_flush[iommu_id].iova[next] = iova;
2933         deferred_flush[iommu_id].next++;
2934
2935         if (!timer_on) {
2936                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2937                 timer_on = 1;
2938         }
2939         list_size++;
2940         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2941 }
2942
2943 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2944                              size_t size, enum dma_data_direction dir,
2945                              struct dma_attrs *attrs)
2946 {
2947         struct pci_dev *pdev = to_pci_dev(dev);
2948         struct dmar_domain *domain;
2949         unsigned long start_pfn, last_pfn;
2950         struct iova *iova;
2951         struct intel_iommu *iommu;
2952
2953         if (iommu_no_mapping(dev))
2954                 return;
2955
2956         domain = find_domain(pdev);
2957         BUG_ON(!domain);
2958
2959         iommu = domain_get_iommu(domain);
2960
2961         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2962         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2963                       (unsigned long long)dev_addr))
2964                 return;
2965
2966         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2967         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2968
2969         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2970                  pci_name(pdev), start_pfn, last_pfn);
2971
2972         /*  clear the whole page */
2973         dma_pte_clear_range(domain, start_pfn, last_pfn);
2974
2975         /* free page tables */
2976         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2977
2978         if (intel_iommu_strict) {
2979                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2980                                       last_pfn - start_pfn + 1, 0);
2981                 /* free iova */
2982                 __free_iova(&domain->iovad, iova);
2983         } else {
2984                 add_unmap(domain, iova);
2985                 /*
2986                  * queue up the release of the unmap to save the roughly 1/6th of
2987                  * the CPU time that the iotlb flush operation would otherwise use...
2988                  */
2989         }
2990 }
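
/*
 * Illustrative driver-side view (a minimal sketch; identifiers such as
 * 'pdev' and 'page' are hypothetical): once intel_dma_ops is installed as
 * dma_ops, a PCI driver reaches intel_map_page()/intel_unmap_page()
 * through the generic DMA API:
 *
 *	dma_addr_t handle = dma_map_page(&pdev->dev, page, 0, PAGE_SIZE,
 *					 DMA_TO_DEVICE);
 *	if (dma_mapping_error(&pdev->dev, handle))
 *		return -ENOMEM;
 *	... let the device DMA from the buffer ...
 *	dma_unmap_page(&pdev->dev, handle, PAGE_SIZE, DMA_TO_DEVICE);
 *
 * Whether the unmap flushes the IOTLB immediately or goes through the
 * deferred path depends on intel_iommu_strict.
 */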
2991
2992 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2993                                   dma_addr_t *dma_handle, gfp_t flags)
2994 {
2995         void *vaddr;
2996         int order;
2997
2998         size = PAGE_ALIGN(size);
2999         order = get_order(size);
3000
3001         if (!iommu_no_mapping(hwdev))
3002                 flags &= ~(GFP_DMA | GFP_DMA32);
3003         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3004                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3005                         flags |= GFP_DMA;
3006                 else
3007                         flags |= GFP_DMA32;
3008         }
3009
3010         vaddr = (void *)__get_free_pages(flags, order);
3011         if (!vaddr)
3012                 return NULL;
3013         memset(vaddr, 0, size);
3014
3015         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3016                                          DMA_BIDIRECTIONAL,
3017                                          hwdev->coherent_dma_mask);
3018         if (*dma_handle)
3019                 return vaddr;
3020         free_pages((unsigned long)vaddr, order);
3021         return NULL;
3022 }
3023
3024 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3025                                 dma_addr_t dma_handle)
3026 {
3027         int order;
3028
3029         size = PAGE_ALIGN(size);
3030         order = get_order(size);
3031
3032         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3033         free_pages((unsigned long)vaddr, order);
3034 }
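
/*
 * Illustrative usage (a minimal sketch, 'pdev' hypothetical): coherent
 * buffers reach intel_alloc_coherent()/intel_free_coherent() via the
 * generic helpers:
 *
 *	dma_addr_t dma;
 *	void *buf = dma_alloc_coherent(&pdev->dev, 4096, &dma, GFP_KERNEL);
 *
 *	if (!buf)
 *		return -ENOMEM;
 *	... CPU uses 'buf', the device is programmed with 'dma' ...
 *	dma_free_coherent(&pdev->dev, 4096, buf, dma);
 */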
3035
3036 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3037                            int nelems, enum dma_data_direction dir,
3038                            struct dma_attrs *attrs)
3039 {
3040         struct pci_dev *pdev = to_pci_dev(hwdev);
3041         struct dmar_domain *domain;
3042         unsigned long start_pfn, last_pfn;
3043         struct iova *iova;
3044         struct intel_iommu *iommu;
3045
3046         if (iommu_no_mapping(hwdev))
3047                 return;
3048
3049         domain = find_domain(pdev);
3050         BUG_ON(!domain);
3051
3052         iommu = domain_get_iommu(domain);
3053
3054         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3055         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3056                       (unsigned long long)sglist[0].dma_address))
3057                 return;
3058
3059         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3060         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3061
3062         /*  clear the whole page */
3063         dma_pte_clear_range(domain, start_pfn, last_pfn);
3064
3065         /* free page tables */
3066         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3067
3068         if (intel_iommu_strict) {
3069                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3070                                       last_pfn - start_pfn + 1, 0);
3071                 /* free iova */
3072                 __free_iova(&domain->iovad, iova);
3073         } else {
3074                 add_unmap(domain, iova);
3075                 /*
3076                  * queue up the release of the unmap to save the roughly 1/6th of
3077                  * the CPU time that the iotlb flush operation would otherwise use...
3078                  */
3079         }
3080 }
3081
3082 static int intel_nontranslate_map_sg(struct device *hwdev,
3083         struct scatterlist *sglist, int nelems, int dir)
3084 {
3085         int i;
3086         struct scatterlist *sg;
3087
3088         for_each_sg(sglist, sg, nelems, i) {
3089                 BUG_ON(!sg_page(sg));
3090                 sg->dma_address = sg_phys(sg);
3091                 sg->dma_length = sg->length;
3092         }
3093         return nelems;
3094 }
3095
3096 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3097                         enum dma_data_direction dir, struct dma_attrs *attrs)
3098 {
3099         int i;
3100         struct pci_dev *pdev = to_pci_dev(hwdev);
3101         struct dmar_domain *domain;
3102         size_t size = 0;
3103         int prot = 0;
3104         struct iova *iova = NULL;
3105         int ret;
3106         struct scatterlist *sg;
3107         unsigned long start_vpfn;
3108         struct intel_iommu *iommu;
3109
3110         BUG_ON(dir == DMA_NONE);
3111         if (iommu_no_mapping(hwdev))
3112                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3113
3114         domain = get_valid_domain_for_dev(pdev);
3115         if (!domain)
3116                 return 0;
3117
3118         iommu = domain_get_iommu(domain);
3119
3120         for_each_sg(sglist, sg, nelems, i)
3121                 size += aligned_nrpages(sg->offset, sg->length);
3122
3123         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3124                                 pdev->dma_mask);
3125         if (!iova) {
3126                 sglist->dma_length = 0;
3127                 return 0;
3128         }
3129
3130         /*
3131          * Check if DMAR supports zero-length reads on write only
3132          * mappings.
3133          */
3134         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3135                         !cap_zlr(iommu->cap))
3136                 prot |= DMA_PTE_READ;
3137         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3138                 prot |= DMA_PTE_WRITE;
3139
3140         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3141
3142         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3143         if (unlikely(ret)) {
3144                 /*  clear the page */
3145                 dma_pte_clear_range(domain, start_vpfn,
3146                                     start_vpfn + size - 1);
3147                 /* free page tables */
3148                 dma_pte_free_pagetable(domain, start_vpfn,
3149                                        start_vpfn + size - 1);
3150                 /* free iova */
3151                 __free_iova(&domain->iovad, iova);
3152                 return 0;
3153         }
3154
3155         /* it's a non-present to present mapping. Only flush if caching mode */
3156         if (cap_caching_mode(iommu->cap))
3157                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3158         else
3159                 iommu_flush_write_buffer(iommu);
3160
3161         return nelems;
3162 }
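
/*
 * Illustrative usage (a minimal sketch; 'pdev', 'sglist', 'nents' and
 * program_hw_descriptor() are hypothetical): scatter-gather mappings
 * funnel through intel_map_sg()/intel_unmap_sg() via the DMA API:
 *
 *	struct scatterlist *sg;
 *	int i, count;
 *
 *	count = dma_map_sg(&pdev->dev, sglist, nents, DMA_FROM_DEVICE);
 *	if (!count)
 *		return -ENOMEM;
 *	for_each_sg(sglist, sg, count, i)
 *		program_hw_descriptor(sg_dma_address(sg), sg_dma_len(sg));
 *	...
 *	dma_unmap_sg(&pdev->dev, sglist, nents, DMA_FROM_DEVICE);
 *
 * Note that the whole list shares a single IOVA allocation here, so a
 * mapping failure unwinds everything rather than returning a partial list.
 */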
3163
3164 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3165 {
3166         return !dma_addr;
3167 }
3168
3169 struct dma_map_ops intel_dma_ops = {
3170         .alloc_coherent = intel_alloc_coherent,
3171         .free_coherent = intel_free_coherent,
3172         .map_sg = intel_map_sg,
3173         .unmap_sg = intel_unmap_sg,
3174         .map_page = intel_map_page,
3175         .unmap_page = intel_unmap_page,
3176         .mapping_error = intel_mapping_error,
3177 };
3178
3179 static inline int iommu_domain_cache_init(void)
3180 {
3181         int ret = 0;
3182
3183         iommu_domain_cache = kmem_cache_create("iommu_domain",
3184                                          sizeof(struct dmar_domain),
3185                                          0,
3186                                          SLAB_HWCACHE_ALIGN,
3187                                          NULL);
3189         if (!iommu_domain_cache) {
3190                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3191                 ret = -ENOMEM;
3192         }
3193
3194         return ret;
3195 }
3196
3197 static inline int iommu_devinfo_cache_init(void)
3198 {
3199         int ret = 0;
3200
3201         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3202                                          sizeof(struct device_domain_info),
3203                                          0,
3204                                          SLAB_HWCACHE_ALIGN,
3205                                          NULL);
3206         if (!iommu_devinfo_cache) {
3207                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3208                 ret = -ENOMEM;
3209         }
3210
3211         return ret;
3212 }
3213
3214 static inline int iommu_iova_cache_init(void)
3215 {
3216         int ret = 0;
3217
3218         iommu_iova_cache = kmem_cache_create("iommu_iova",
3219                                          sizeof(struct iova),
3220                                          0,
3221                                          SLAB_HWCACHE_ALIGN,
3222                                          NULL);
3223         if (!iommu_iova_cache) {
3224                 printk(KERN_ERR "Couldn't create iova cache\n");
3225                 ret = -ENOMEM;
3226         }
3227
3228         return ret;
3229 }
3230
3231 static int __init iommu_init_mempool(void)
3232 {
3233         int ret;
3234         ret = iommu_iova_cache_init();
3235         if (ret)
3236                 return ret;
3237
3238         ret = iommu_domain_cache_init();
3239         if (ret)
3240                 goto domain_error;
3241
3242         ret = iommu_devinfo_cache_init();
3243         if (!ret)
3244                 return ret;
3245
3246         kmem_cache_destroy(iommu_domain_cache);
3247 domain_error:
3248         kmem_cache_destroy(iommu_iova_cache);
3249
3250         return -ENOMEM;
3251 }
3252
3253 static void __init iommu_exit_mempool(void)
3254 {
3255         kmem_cache_destroy(iommu_devinfo_cache);
3256         kmem_cache_destroy(iommu_domain_cache);
3257         kmem_cache_destroy(iommu_iova_cache);
3258
3259 }
3260
3261 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3262 {
3263         struct dmar_drhd_unit *drhd;
3264         u32 vtbar;
3265         int rc;
3266
3267         /* We know that this device on this chipset has its own IOMMU.
3268          * If we find it under a different IOMMU, then the BIOS is lying
3269          * to us. Hope that the IOMMU for this device is actually
3270          * disabled, and it needs no translation...
3271          */
3272         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3273         if (rc) {
3274                 /* "can't" happen */
3275                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3276                 return;
3277         }
3278         vtbar &= 0xffff0000;
3279
3280         /* we know that this iommu should be at offset 0xa000 from vtbar */
3281         drhd = dmar_find_matched_drhd_unit(pdev);
3282         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3283                             TAINT_FIRMWARE_WORKAROUND,
3284                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3285                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3286 }
3287 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3288
3289 static void __init init_no_remapping_devices(void)
3290 {
3291         struct dmar_drhd_unit *drhd;
3292
3293         for_each_drhd_unit(drhd) {
3294                 if (!drhd->include_all) {
3295                         int i;
3296                         for (i = 0; i < drhd->devices_cnt; i++)
3297                                 if (drhd->devices[i] != NULL)
3298                                         break;
3299                         /* ignore DMAR unit if no pci devices exist */
3300                         if (i == drhd->devices_cnt)
3301                                 drhd->ignored = 1;
3302                 }
3303         }
3304
3305         for_each_drhd_unit(drhd) {
3306                 int i;
3307                 if (drhd->ignored || drhd->include_all)
3308                         continue;
3309
3310                 for (i = 0; i < drhd->devices_cnt; i++)
3311                         if (drhd->devices[i] &&
3312                             !IS_GFX_DEVICE(drhd->devices[i]))
3313                                 break;
3314
3315                 if (i < drhd->devices_cnt)
3316                         continue;
3317
3318                 /* This IOMMU has *only* gfx devices. Either bypass it or
3319                    set the gfx_mapped flag, as appropriate */
3320                 if (dmar_map_gfx) {
3321                         intel_iommu_gfx_mapped = 1;
3322                 } else {
3323                         drhd->ignored = 1;
3324                         for (i = 0; i < drhd->devices_cnt; i++) {
3325                                 if (!drhd->devices[i])
3326                                         continue;
3327                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3328                         }
3329                 }
3330         }
3331 }
3332
3333 #ifdef CONFIG_SUSPEND
3334 static int init_iommu_hw(void)
3335 {
3336         struct dmar_drhd_unit *drhd;
3337         struct intel_iommu *iommu = NULL;
3338
3339         for_each_active_iommu(iommu, drhd)
3340                 if (iommu->qi)
3341                         dmar_reenable_qi(iommu);
3342
3343         for_each_iommu(iommu, drhd) {
3344                 if (drhd->ignored) {
3345                         /*
3346                          * we always have to disable PMRs or DMA may fail on
3347                          * this device
3348                          */
3349                         if (force_on)
3350                                 iommu_disable_protect_mem_regions(iommu);
3351                         continue;
3352                 }
3353
3354                 iommu_flush_write_buffer(iommu);
3355
3356                 iommu_set_root_entry(iommu);
3357
3358                 iommu->flush.flush_context(iommu, 0, 0, 0,
3359                                            DMA_CCMD_GLOBAL_INVL);
3360                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3361                                          DMA_TLB_GLOBAL_FLUSH);
3362                 if (iommu_enable_translation(iommu))
3363                         return 1;
3364                 iommu_disable_protect_mem_regions(iommu);
3365         }
3366
3367         return 0;
3368 }
3369
3370 static void iommu_flush_all(void)
3371 {
3372         struct dmar_drhd_unit *drhd;
3373         struct intel_iommu *iommu;
3374
3375         for_each_active_iommu(iommu, drhd) {
3376                 iommu->flush.flush_context(iommu, 0, 0, 0,
3377                                            DMA_CCMD_GLOBAL_INVL);
3378                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3379                                          DMA_TLB_GLOBAL_FLUSH);
3380         }
3381 }
3382
3383 static int iommu_suspend(void)
3384 {
3385         struct dmar_drhd_unit *drhd;
3386         struct intel_iommu *iommu = NULL;
3387         unsigned long flag;
3388
3389         for_each_active_iommu(iommu, drhd) {
3390                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3391                                                  GFP_ATOMIC);
3392                 if (!iommu->iommu_state)
3393                         goto nomem;
3394         }
3395
3396         iommu_flush_all();
3397
3398         for_each_active_iommu(iommu, drhd) {
3399                 iommu_disable_translation(iommu);
3400
3401                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3402
3403                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3404                         readl(iommu->reg + DMAR_FECTL_REG);
3405                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3406                         readl(iommu->reg + DMAR_FEDATA_REG);
3407                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3408                         readl(iommu->reg + DMAR_FEADDR_REG);
3409                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3410                         readl(iommu->reg + DMAR_FEUADDR_REG);
3411
3412                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3413         }
3414         return 0;
3415
3416 nomem:
3417         for_each_active_iommu(iommu, drhd)
3418                 kfree(iommu->iommu_state);
3419
3420         return -ENOMEM;
3421 }
3422
3423 static void iommu_resume(void)
3424 {
3425         struct dmar_drhd_unit *drhd;
3426         struct intel_iommu *iommu = NULL;
3427         unsigned long flag;
3428
3429         if (init_iommu_hw()) {
3430                 if (force_on)
3431                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3432                 else
3433                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3434                 return;
3435         }
3436
3437         for_each_active_iommu(iommu, drhd) {
3438
3439                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3440
3441                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3442                         iommu->reg + DMAR_FECTL_REG);
3443                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3444                         iommu->reg + DMAR_FEDATA_REG);
3445                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3446                         iommu->reg + DMAR_FEADDR_REG);
3447                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3448                         iommu->reg + DMAR_FEUADDR_REG);
3449
3450                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3451         }
3452
3453         for_each_active_iommu(iommu, drhd)
3454                 kfree(iommu->iommu_state);
3455 }
3456
3457 static struct syscore_ops iommu_syscore_ops = {
3458         .resume         = iommu_resume,
3459         .suspend        = iommu_suspend,
3460 };
3461
3462 static void __init init_iommu_pm_ops(void)
3463 {
3464         register_syscore_ops(&iommu_syscore_ops);
3465 }
3466
3467 #else
3468 static inline void init_iommu_pm_ops(void) {}
3469 #endif  /* CONFIG_SUSPEND */
3470
3471 LIST_HEAD(dmar_rmrr_units);
3472
3473 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3474 {
3475         list_add(&rmrr->list, &dmar_rmrr_units);
3476 }
3477
3478
3479 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3480 {
3481         struct acpi_dmar_reserved_memory *rmrr;
3482         struct dmar_rmrr_unit *rmrru;
3483
3484         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3485         if (!rmrru)
3486                 return -ENOMEM;
3487
3488         rmrru->hdr = header;
3489         rmrr = (struct acpi_dmar_reserved_memory *)header;
3490         rmrru->base_address = rmrr->base_address;
3491         rmrru->end_address = rmrr->end_address;
3492
3493         dmar_register_rmrr_unit(rmrru);
3494         return 0;
3495 }
3496
3497 static int __init
3498 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3499 {
3500         struct acpi_dmar_reserved_memory *rmrr;
3501         int ret;
3502
3503         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3504         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3505                 ((void *)rmrr) + rmrr->header.length,
3506                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3507
3508         if (ret || (rmrru->devices_cnt == 0)) {
3509                 list_del(&rmrru->list);
3510                 kfree(rmrru);
3511         }
3512         return ret;
3513 }
3514
3515 static LIST_HEAD(dmar_atsr_units);
3516
3517 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3518 {
3519         struct acpi_dmar_atsr *atsr;
3520         struct dmar_atsr_unit *atsru;
3521
3522         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3523         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3524         if (!atsru)
3525                 return -ENOMEM;
3526
3527         atsru->hdr = hdr;
3528         atsru->include_all = atsr->flags & 0x1;
3529
3530         list_add(&atsru->list, &dmar_atsr_units);
3531
3532         return 0;
3533 }
3534
3535 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3536 {
3537         int rc;
3538         struct acpi_dmar_atsr *atsr;
3539
3540         if (atsru->include_all)
3541                 return 0;
3542
3543         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3544         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3545                                 (void *)atsr + atsr->header.length,
3546                                 &atsru->devices_cnt, &atsru->devices,
3547                                 atsr->segment);
3548         if (rc || !atsru->devices_cnt) {
3549                 list_del(&atsru->list);
3550                 kfree(atsru);
3551         }
3552
3553         return rc;
3554 }
3555
3556 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3557 {
3558         int i;
3559         struct pci_bus *bus;
3560         struct acpi_dmar_atsr *atsr;
3561         struct dmar_atsr_unit *atsru;
3562
3563         dev = pci_physfn(dev);
3564
3565         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3566                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3567                 if (atsr->segment == pci_domain_nr(dev->bus))
3568                         goto found;
3569         }
3570
3571         return 0;
3572
3573 found:
3574         for (bus = dev->bus; bus; bus = bus->parent) {
3575                 struct pci_dev *bridge = bus->self;
3576
3577                 /* If it's an integrated device, allow ATS */
3578                 if (!bridge)
3579                         return 1;
3580                 /* Connected via non-PCIe: no ATS */
3581                 if (!pci_is_pcie(bridge) ||
3582                     bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3583                         return 0;
3584
3585                 /* If we found the root port, look it up in the ATSR */
3586                 if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3587                         for (i = 0; i < atsru->devices_cnt; i++)
3588                                 if (atsru->devices[i] == bridge)
3589                                         return 1;
3590                         break;
3591                 }
3592         }
3593
3594         if (atsru->include_all)
3595                 return 1;
3596
3597         return 0;
3598 }
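
/*
 * Example walk (illustrative bus numbers): for an endpoint 0000:40:00.0
 * behind root port 0000:00:1c.0, the loop above finds 00:1c.0 as the root
 * port and allows ATS only if that port is listed in an ATSR for segment
 * 0000 (or the ATSR is include_all).  A device sitting directly on the
 * root bus has no upstream bridge, is treated as integrated, and gets ATS
 * unconditionally.
 */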
3599
3600 int __init dmar_parse_rmrr_atsr_dev(void)
3601 {
3602         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3603         struct dmar_atsr_unit *atsr, *atsr_n;
3604         int ret = 0;
3605
3606         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3607                 ret = rmrr_parse_dev(rmrr);
3608                 if (ret)
3609                         return ret;
3610         }
3611
3612         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3613                 ret = atsr_parse_dev(atsr);
3614                 if (ret)
3615                         return ret;
3616         }
3617
3618         return ret;
3619 }
3620
3621 /*
3622  * Here we only respond to a device being unbound from its driver.
3623  *
3624  * A newly added device is not attached to its DMAR domain here yet; that
3625  * happens when the device is first mapped to an iova.
3626  */
3627 static int device_notifier(struct notifier_block *nb,
3628                                   unsigned long action, void *data)
3629 {
3630         struct device *dev = data;
3631         struct pci_dev *pdev = to_pci_dev(dev);
3632         struct dmar_domain *domain;
3633
3634         if (iommu_no_mapping(dev))
3635                 return 0;
3636
3637         domain = find_domain(pdev);
3638         if (!domain)
3639                 return 0;
3640
3641         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3642                 domain_remove_one_dev_info(domain, pdev);
3643
3644                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3645                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3646                     list_empty(&domain->devices))
3647                         domain_exit(domain);
3648         }
3649
3650         return 0;
3651 }
3652
3653 static struct notifier_block device_nb = {
3654         .notifier_call = device_notifier,
3655 };
3656
3657 int __init intel_iommu_init(void)
3658 {
3659         int ret = 0;
3660         struct dmar_drhd_unit *drhd;
3661
3662         /* VT-d is required for a TXT/tboot launch, so enforce that */
3663         force_on = tboot_force_iommu();
3664
3665         if (dmar_table_init()) {
3666                 if (force_on)
3667                         panic("tboot: Failed to initialize DMAR table\n");
3668                 return  -ENODEV;
3669         }
3670
3671         /*
3672          * Disable translation if already enabled prior to OS handover.
3673          */
3674         for_each_drhd_unit(drhd) {
3675                 struct intel_iommu *iommu;
3676
3677                 if (drhd->ignored)
3678                         continue;
3679
3680                 iommu = drhd->iommu;
3681                 if (iommu->gcmd & DMA_GCMD_TE)
3682                         iommu_disable_translation(iommu);
3683         }
3684
3685         if (dmar_dev_scope_init() < 0) {
3686                 if (force_on)
3687                         panic("tboot: Failed to initialize DMAR device scope\n");
3688                 return  -ENODEV;
3689         }
3690
3691         if (no_iommu || dmar_disabled)
3692                 return -ENODEV;
3693
3694         if (iommu_init_mempool()) {
3695                 if (force_on)
3696                         panic("tboot: Failed to initialize iommu memory\n");
3697                 return  -ENODEV;
3698         }
3699
3700         if (list_empty(&dmar_rmrr_units))
3701                 printk(KERN_INFO "DMAR: No RMRR found\n");
3702
3703         if (list_empty(&dmar_atsr_units))
3704                 printk(KERN_INFO "DMAR: No ATSR found\n");
3705
3706         if (dmar_init_reserved_ranges()) {
3707                 if (force_on)
3708                         panic("tboot: Failed to reserve iommu ranges\n");
3709                 return  -ENODEV;
3710         }
3711
3712         init_no_remapping_devices();
3713
3714         ret = init_dmars();
3715         if (ret) {
3716                 if (force_on)
3717                         panic("tboot: Failed to initialize DMARs\n");
3718                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3719                 put_iova_domain(&reserved_iova_list);
3720                 iommu_exit_mempool();
3721                 return ret;
3722         }
3723         printk(KERN_INFO
3724         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3725
3726         init_timer(&unmap_timer);
3727 #ifdef CONFIG_SWIOTLB
3728         swiotlb = 0;
3729 #endif
3730         dma_ops = &intel_dma_ops;
3731
3732         init_iommu_pm_ops();
3733
3734         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3735
3736         bus_register_notifier(&pci_bus_type, &device_nb);
3737
3738         intel_iommu_enabled = 1;
3739
3740         return 0;
3741 }
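
/*
 * The behaviour of this init path can be steered from the kernel command
 * line (see Documentation/kernel-parameters.txt): "intel_iommu=on"/"off"
 * control whether DMAR is used at all, "intel_iommu=strict" forces
 * synchronous IOTLB flushing instead of the batched unmap path, and
 * "intel_iommu=igfx_off" leaves graphics devices untranslated much like
 * the gfx quirks near the end of this file.
 */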
3742
3743 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3744                                            struct pci_dev *pdev)
3745 {
3746         struct pci_dev *tmp, *parent;
3747
3748         if (!iommu || !pdev)
3749                 return;
3750
3751         /* dependent device detach */
3752         tmp = pci_find_upstream_pcie_bridge(pdev);
3753         /* Secondary interface's bus number and devfn 0 */
3754         if (tmp) {
3755                 parent = pdev->bus->self;
3756                 while (parent != tmp) {
3757                         iommu_detach_dev(iommu, parent->bus->number,
3758                                          parent->devfn);
3759                         parent = parent->bus->self;
3760                 }
3761                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3762                         iommu_detach_dev(iommu,
3763                                 tmp->subordinate->number, 0);
3764                 else /* this is a legacy PCI bridge */
3765                         iommu_detach_dev(iommu, tmp->bus->number,
3766                                          tmp->devfn);
3767         }
3768 }
3769
3770 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3771                                           struct pci_dev *pdev)
3772 {
3773         struct device_domain_info *info;
3774         struct intel_iommu *iommu;
3775         unsigned long flags;
3776         int found = 0;
3777         struct list_head *entry, *tmp;
3778
3779         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3780                                 pdev->devfn);
3781         if (!iommu)
3782                 return;
3783
3784         spin_lock_irqsave(&device_domain_lock, flags);
3785         list_for_each_safe(entry, tmp, &domain->devices) {
3786                 info = list_entry(entry, struct device_domain_info, link);
3787                 if (info->segment == pci_domain_nr(pdev->bus) &&
3788                     info->bus == pdev->bus->number &&
3789                     info->devfn == pdev->devfn) {
3790                         list_del(&info->link);
3791                         list_del(&info->global);
3792                         if (info->dev)
3793                                 info->dev->dev.archdata.iommu = NULL;
3794                         spin_unlock_irqrestore(&device_domain_lock, flags);
3795
3796                         iommu_disable_dev_iotlb(info);
3797                         iommu_detach_dev(iommu, info->bus, info->devfn);
3798                         iommu_detach_dependent_devices(iommu, pdev);
3799                         free_devinfo_mem(info);
3800
3801                         spin_lock_irqsave(&device_domain_lock, flags);
3802
3803                         if (found)
3804                                 break;
3805                         else
3806                                 continue;
3807                 }
3808
3809                 /* if there are no other devices under the same iommu
3810                  * owned by this domain, clear this iommu in iommu_bmp and
3811                  * update the iommu count and coherency
3812                  */
3813                 if (iommu == device_to_iommu(info->segment, info->bus,
3814                                             info->devfn))
3815                         found = 1;
3816         }
3817
3818         spin_unlock_irqrestore(&device_domain_lock, flags);
3819
3820         if (found == 0) {
3821                 unsigned long tmp_flags;
3822                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3823                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3824                 domain->iommu_count--;
3825                 domain_update_iommu_cap(domain);
3826                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3827
3828                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3829                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3830                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3831                         clear_bit(domain->id, iommu->domain_ids);
3832                         iommu->domains[domain->id] = NULL;
3833                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3834                 }
3835         }
3836 }
3837
3838 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3839 {
3840         struct device_domain_info *info;
3841         struct intel_iommu *iommu;
3842         unsigned long flags1, flags2;
3843
3844         spin_lock_irqsave(&device_domain_lock, flags1);
3845         while (!list_empty(&domain->devices)) {
3846                 info = list_entry(domain->devices.next,
3847                         struct device_domain_info, link);
3848                 list_del(&info->link);
3849                 list_del(&info->global);
3850                 if (info->dev)
3851                         info->dev->dev.archdata.iommu = NULL;
3852
3853                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3854
3855                 iommu_disable_dev_iotlb(info);
3856                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3857                 iommu_detach_dev(iommu, info->bus, info->devfn);
3858                 iommu_detach_dependent_devices(iommu, info->dev);
3859
3860                 /* clear this iommu in iommu_bmp, update iommu count
3861                  * and capabilities
3862                  */
3863                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3864                 if (test_and_clear_bit(iommu->seq_id,
3865                                        &domain->iommu_bmp)) {
3866                         domain->iommu_count--;
3867                         domain_update_iommu_cap(domain);
3868                 }
3869                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3870
3871                 free_devinfo_mem(info);
3872                 spin_lock_irqsave(&device_domain_lock, flags1);
3873         }
3874         spin_unlock_irqrestore(&device_domain_lock, flags1);
3875 }
3876
3877 /* domain id for a virtual machine domain; it won't be set in a context entry */
3878 static unsigned long vm_domid;
3879
3880 static struct dmar_domain *iommu_alloc_vm_domain(void)
3881 {
3882         struct dmar_domain *domain;
3883
3884         domain = alloc_domain_mem();
3885         if (!domain)
3886                 return NULL;
3887
3888         domain->id = vm_domid++;
3889         domain->nid = -1;
3890         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3891         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3892
3893         return domain;
3894 }
3895
3896 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3897 {
3898         int adjust_width;
3899
3900         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3901         spin_lock_init(&domain->iommu_lock);
3902
3903         domain_reserve_special_ranges(domain);
3904
3905         /* calculate AGAW */
3906         domain->gaw = guest_width;
3907         adjust_width = guestwidth_to_adjustwidth(guest_width);
3908         domain->agaw = width_to_agaw(adjust_width);
3909
3910         INIT_LIST_HEAD(&domain->devices);
3911
3912         domain->iommu_count = 0;
3913         domain->iommu_coherency = 0;
3914         domain->iommu_snooping = 0;
3915         domain->iommu_superpage = 0;
3916         domain->max_addr = 0;
3917         domain->nid = -1;
3918
3919         /* always allocate the top pgd */
3920         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3921         if (!domain->pgd)
3922                 return -ENOMEM;
3923         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3924         return 0;
3925 }
3926
3927 static void iommu_free_vm_domain(struct dmar_domain *domain)
3928 {
3929         unsigned long flags;
3930         struct dmar_drhd_unit *drhd;
3931         struct intel_iommu *iommu;
3932         unsigned long i;
3933         unsigned long ndomains;
3934
3935         for_each_drhd_unit(drhd) {
3936                 if (drhd->ignored)
3937                         continue;
3938                 iommu = drhd->iommu;
3939
3940                 ndomains = cap_ndoms(iommu->cap);
3941                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3942                         if (iommu->domains[i] == domain) {
3943                                 spin_lock_irqsave(&iommu->lock, flags);
3944                                 clear_bit(i, iommu->domain_ids);
3945                                 iommu->domains[i] = NULL;
3946                                 spin_unlock_irqrestore(&iommu->lock, flags);
3947                                 break;
3948                         }
3949                 }
3950         }
3951 }
3952
3953 static void vm_domain_exit(struct dmar_domain *domain)
3954 {
3955         /* Domain 0 is reserved, so don't process it */
3956         if (!domain)
3957                 return;
3958
3959         vm_domain_remove_all_dev_info(domain);
3960         /* destroy iovas */
3961         put_iova_domain(&domain->iovad);
3962
3963         /* clear ptes */
3964         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3965
3966         /* free page tables */
3967         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3968
3969         iommu_free_vm_domain(domain);
3970         free_domain_mem(domain);
3971 }
3972
3973 static int intel_iommu_domain_init(struct iommu_domain *domain)
3974 {
3975         struct dmar_domain *dmar_domain;
3976
3977         dmar_domain = iommu_alloc_vm_domain();
3978         if (!dmar_domain) {
3979                 printk(KERN_ERR
3980                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3981                 return -ENOMEM;
3982         }
3983         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3984                 printk(KERN_ERR
3985                         "intel_iommu_domain_init() failed\n");
3986                 vm_domain_exit(dmar_domain);
3987                 return -ENOMEM;
3988         }
3989         domain_update_iommu_cap(dmar_domain);
3990         domain->priv = dmar_domain;
3991
3992         return 0;
3993 }
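
/*
 * Domains created through this entry point (typically by KVM device
 * assignment or other iommu-api users) are "virtual machine" domains:
 * they take their id from the separate vm_domid space, carry
 * DOMAIN_FLAG_VIRTUAL_MACHINE, and are sized to
 * DEFAULT_DOMAIN_ADDRESS_WIDTH, independently of the DMA-API domains the
 * driver builds for ordinary PCI devices.
 */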
3994
3995 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3996 {
3997         struct dmar_domain *dmar_domain = domain->priv;
3998
3999         domain->priv = NULL;
4000         vm_domain_exit(dmar_domain);
4001 }
4002
4003 static int intel_iommu_attach_device(struct iommu_domain *domain,
4004                                      struct device *dev)
4005 {
4006         struct dmar_domain *dmar_domain = domain->priv;
4007         struct pci_dev *pdev = to_pci_dev(dev);
4008         struct intel_iommu *iommu;
4009         int addr_width;
4010
4011         if (device_is_rmrr_locked(pdev)) {
4012                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4013                 return -EPERM;
4014         }
4015
4016         /* normally pdev is not mapped */
4017         if (unlikely(domain_context_mapped(pdev))) {
4018                 struct dmar_domain *old_domain;
4019
4020                 old_domain = find_domain(pdev);
4021                 if (old_domain) {
4022                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4023                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4024                                 domain_remove_one_dev_info(old_domain, pdev);
4025                         else
4026                                 domain_remove_dev_info(old_domain);
4027                 }
4028         }
4029
4030         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4031                                 pdev->devfn);
4032         if (!iommu)
4033                 return -ENODEV;
4034
4035         /* check if this iommu agaw is sufficient for max mapped address */
4036         addr_width = agaw_to_width(iommu->agaw);
4037         if (addr_width > cap_mgaw(iommu->cap))
4038                 addr_width = cap_mgaw(iommu->cap);
4039
4040         if (dmar_domain->max_addr > (1LL << addr_width)) {
4041                 printk(KERN_ERR "%s: iommu width (%d) is not "
4042                        "sufficient for the mapped address (%llx)\n",
4043                        __func__, addr_width, dmar_domain->max_addr);
4044                 return -EFAULT;
4045         }
4046         dmar_domain->gaw = addr_width;
4047
4048         /*
4049          * Knock out extra levels of page tables if necessary
4050          */
4051         while (iommu->agaw < dmar_domain->agaw) {
4052                 struct dma_pte *pte;
4053
4054                 pte = dmar_domain->pgd;
4055                 if (dma_pte_present(pte)) {
4056                         dmar_domain->pgd = (struct dma_pte *)
4057                                 phys_to_virt(dma_pte_addr(pte));
4058                         free_pgtable_page(pte);
4059                 }
4060                 dmar_domain->agaw--;
4061         }
4062
4063         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4064 }
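
/*
 * Worked example (illustrative): a domain initialised with
 * DEFAULT_DOMAIN_ADDRESS_WIDTH uses a 48-bit, 4-level page table.  If the
 * IOMMU behind the device only supports a 39-bit AGAW, the attach first
 * checks that dmar_domain->max_addr still fits in 39 bits, then the loop
 * above frees the unused top-level page directory so the domain drops to
 * 3 levels before the device is added with domain_add_dev_info().
 */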
4065
4066 static void intel_iommu_detach_device(struct iommu_domain *domain,
4067                                       struct device *dev)
4068 {
4069         struct dmar_domain *dmar_domain = domain->priv;
4070         struct pci_dev *pdev = to_pci_dev(dev);
4071
4072         domain_remove_one_dev_info(dmar_domain, pdev);
4073 }
4074
4075 static int intel_iommu_map(struct iommu_domain *domain,
4076                            unsigned long iova, phys_addr_t hpa,
4077                            int gfp_order, int iommu_prot)
4078 {
4079         struct dmar_domain *dmar_domain = domain->priv;
4080         u64 max_addr;
4081         int prot = 0;
4082         size_t size;
4083         int ret;
4084
4085         if (iommu_prot & IOMMU_READ)
4086                 prot |= DMA_PTE_READ;
4087         if (iommu_prot & IOMMU_WRITE)
4088                 prot |= DMA_PTE_WRITE;
4089         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4090                 prot |= DMA_PTE_SNP;
4091
4092         size     = PAGE_SIZE << gfp_order;
4093         max_addr = iova + size;
4094         if (dmar_domain->max_addr < max_addr) {
4095                 u64 end;
4096
4097                 /* check if minimum agaw is sufficient for mapped address */
4098                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4099                 if (end < max_addr) {
4100                         printk(KERN_ERR "%s: iommu width (%d) is not "
4101                                "sufficient for the mapped address (%llx)\n",
4102                                __func__, dmar_domain->gaw, max_addr);
4103                         return -EFAULT;
4104                 }
4105                 dmar_domain->max_addr = max_addr;
4106         }
4107         /* Round up size to next multiple of PAGE_SIZE, if it and
4108            the low bits of hpa would take us onto the next page */
4109         size = aligned_nrpages(hpa, size);
4110         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4111                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4112         return ret;
4113 }
4114
4115 static int intel_iommu_unmap(struct iommu_domain *domain,
4116                              unsigned long iova, int gfp_order)
4117 {
4118         struct dmar_domain *dmar_domain = domain->priv;
4119         size_t size = PAGE_SIZE << gfp_order;
4120         int order, iommu_id;
4121
4122         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4123                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4124
4125         if (dmar_domain->max_addr == iova + size)
4126                 dmar_domain->max_addr = iova;
4127
4128         for_each_set_bit(iommu_id, &dmar_domain->iommu_bmp, g_num_of_iommus) {
4129                 struct intel_iommu *iommu = g_iommus[iommu_id];
4130                 int num, ndomains;
4131
4132                 /*
4133                  * find bit position of dmar_domain
4134                  */
4135                 ndomains = cap_ndoms(iommu->cap);
4136                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4137                         if (iommu->domains[num] == dmar_domain)
4138                                 iommu_flush_iotlb_psi(iommu, num,
4139                                                       iova >> VTD_PAGE_SHIFT,
4140                                                       1 << order, 0);
4141                 }
4142         }
4143
4144         return order;
4145 }
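
/*
 * Note on the order-based API (illustrative arithmetic): gfp_order encodes
 * the mapping size as PAGE_SIZE << gfp_order, so with 4KiB pages order 0
 * covers one page and order 9 covers 2MiB.  intel_iommu_unmap() returns
 * the order it actually cleared and issues a PSI flush on every IOMMU
 * that currently holds this domain.
 */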
4146
4147 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4148                                             unsigned long iova)
4149 {
4150         struct dmar_domain *dmar_domain = domain->priv;
4151         struct dma_pte *pte;
4152         u64 phys = 0;
4153
4154         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4155         if (pte)
4156                 phys = dma_pte_addr(pte);
4157
4158         return phys;
4159 }
4160
4161 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4162                                       unsigned long cap)
4163 {
4164         struct dmar_domain *dmar_domain = domain->priv;
4165
4166         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4167                 return dmar_domain->iommu_snooping;
4168         if (cap == IOMMU_CAP_INTR_REMAP)
4169                 return intr_remapping_enabled;
4170
4171         return 0;
4172 }
4173
4174 static struct iommu_ops intel_iommu_ops = {
4175         .domain_init    = intel_iommu_domain_init,
4176         .domain_destroy = intel_iommu_domain_destroy,
4177         .attach_dev     = intel_iommu_attach_device,
4178         .detach_dev     = intel_iommu_detach_device,
4179         .map            = intel_iommu_map,
4180         .unmap          = intel_iommu_unmap,
4181         .iova_to_phys   = intel_iommu_iova_to_phys,
4182         .domain_has_cap = intel_iommu_domain_has_cap,
4183 };
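
/*
 * Illustrative consumer flow (a minimal sketch using the generic iommu api
 * of this kernel; error handling trimmed, 'pdev', 'iova' and 'phys' are
 * hypothetical):
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *	int prot = IOMMU_READ | IOMMU_WRITE;
 *
 *	iommu_attach_device(dom, &pdev->dev);
 *	if (iommu_domain_has_cap(dom, IOMMU_CAP_CACHE_COHERENCY))
 *		prot |= IOMMU_CACHE;
 *	iommu_map(dom, iova, phys, 0, prot);
 *	...
 *	iommu_unmap(dom, iova, 0);
 *	iommu_detach_device(dom, &pdev->dev);
 *	iommu_domain_free(dom);
 *
 * Each call dispatches to the corresponding intel_iommu_* handler above
 * once bus_set_iommu() has registered these ops for the PCI bus.
 */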
4184
4185 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4186 {
4187         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4188         printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4189         dmar_map_gfx = 0;
4190 }
4191
4192 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4193 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4194 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4195 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4196 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4197 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4198 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4199
4200 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4201 {
4202         /*
4203          * Mobile 4 Series Chipset neglects to set RWBF capability,
4204          * but needs it. Same seems to hold for the desktop versions.
4205          */
4206         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4207         rwbf_quirk = 1;
4208 }
4209
4210 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4211 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4212 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4213 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4214 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4215 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4216 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4217
4218 #define GGC 0x52
4219 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4220 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4221 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4222 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4223 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4224 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4225 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4226 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4227
4228 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4229 {
4230         unsigned short ggc;
4231
4232         if (pci_read_config_word(dev, GGC, &ggc))
4233                 return;
4234
4235         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4236                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4237                 dmar_map_gfx = 0;
4238         } else if (dmar_map_gfx) {
4239                 /* we have to ensure the gfx device is idle before we flush */
4240                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4241                 intel_iommu_strict = 1;
4242         }
4243 }
4244 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4245 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4248
4249 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4250    ISOCH DMAR unit for the Azalia sound device, but not give it any
4251    TLB entries, which causes it to deadlock. Check for that.  We do
4252    this in a function called from init_dmars(), instead of in a PCI
4253    quirk, because we don't want to print the obnoxious "BIOS broken"
4254    message if VT-d is actually disabled.
4255 */
4256 static void __init check_tylersburg_isoch(void)
4257 {
4258         struct pci_dev *pdev;
4259         uint32_t vtisochctrl;
4260
4261         /* If there's no Azalia in the system anyway, forget it. */
4262         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4263         if (!pdev)
4264                 return;
4265         pci_dev_put(pdev);
4266
4267         /* System Management Registers. Might be hidden, in which case
4268            we can't do the sanity check. But that's OK, because the
4269            known-broken BIOSes _don't_ actually hide it, so far. */
4270         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4271         if (!pdev)
4272                 return;
4273
4274         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4275                 pci_dev_put(pdev);
4276                 return;
4277         }
4278
4279         pci_dev_put(pdev);
4280
4281         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4282         if (vtisochctrl & 1)
4283                 return;
4284
4285         /* Drop all bits other than the number of TLB entries */
4286         vtisochctrl &= 0x1c;
4287
4288         /* If we have the recommended number of TLB entries (16), fine. */
4289         if (vtisochctrl == 0x10)
4290                 return;
4291
4292         /* Zero TLB entries? You get to ride the short bus to school. */
4293         if (!vtisochctrl) {
4294                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4295                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4296                      dmi_get_system_info(DMI_BIOS_VENDOR),
4297                      dmi_get_system_info(DMI_BIOS_VERSION),
4298                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4299                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4300                 return;
4301         }
4302
4303         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4304                vtisochctrl);
4305 }