bb1e579267460b7536c5b76efcfb4a7fedb88d81
[pandora-kernel.git] / drivers / iommu / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
46
47 #define ROOT_SIZE               VTD_PAGE_SIZE
48 #define CONTEXT_SIZE            VTD_PAGE_SIZE
49
50 #define IS_BRIDGE_HOST_DEVICE(pdev) \
51                             ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63
64 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
65 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66
67 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
68    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
69 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
70                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
71 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
72
73 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
74 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
75 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
76
77 /* page table handling */
78 #define LEVEL_STRIDE            (9)
79 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
80
81 static inline int agaw_to_level(int agaw)
82 {
83         return agaw + 2;
84 }
85
86 static inline int agaw_to_width(int agaw)
87 {
88         return 30 + agaw * LEVEL_STRIDE;
89 }
90
91 static inline int width_to_agaw(int width)
92 {
93         return (width - 30) / LEVEL_STRIDE;
94 }
95
96 static inline unsigned int level_to_offset_bits(int level)
97 {
98         return (level - 1) * LEVEL_STRIDE;
99 }
100
101 static inline int pfn_level_offset(unsigned long pfn, int level)
102 {
103         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
104 }
105
106 static inline unsigned long level_mask(int level)
107 {
108         return -1UL << level_to_offset_bits(level);
109 }
110
111 static inline unsigned long level_size(int level)
112 {
113         return 1UL << level_to_offset_bits(level);
114 }
115
116 static inline unsigned long align_to_level(unsigned long pfn, int level)
117 {
118         return (pfn + level_size(level) - 1) & level_mask(level);
119 }
120
121 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
122 {
123         return  1 << ((lvl - 1) * LEVEL_STRIDE);
124 }
125
126 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
127    are never going to work. */
128 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
129 {
130         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
131 }
132
133 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
134 {
135         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
136 }
137 static inline unsigned long page_to_dma_pfn(struct page *pg)
138 {
139         return mm_to_dma_pfn(page_to_pfn(pg));
140 }
141 static inline unsigned long virt_to_dma_pfn(void *p)
142 {
143         return page_to_dma_pfn(virt_to_page(p));
144 }
145
146 /* global iommu list, set NULL for ignored DMAR units */
147 static struct intel_iommu **g_iommus;
148
149 static void __init check_tylersburg_isoch(void);
150 static int rwbf_quirk;
151
152 /*
153  * set to 1 to panic kernel if can't successfully enable VT-d
154  * (used when kernel is launched w/ TXT)
155  */
156 static int force_on = 0;
157
158 /*
159  * 0: Present
160  * 1-11: Reserved
161  * 12-63: Context Ptr (12 - (haw-1))
162  * 64-127: Reserved
163  */
164 struct root_entry {
165         u64     val;
166         u64     rsvd1;
167 };
168 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
169 static inline bool root_present(struct root_entry *root)
170 {
171         return (root->val & 1);
172 }
173 static inline void set_root_present(struct root_entry *root)
174 {
175         root->val |= 1;
176 }
177 static inline void set_root_value(struct root_entry *root, unsigned long value)
178 {
179         root->val |= value & VTD_PAGE_MASK;
180 }
181
182 static inline struct context_entry *
183 get_context_addr_from_root(struct root_entry *root)
184 {
185         return (struct context_entry *)
186                 (root_present(root)?phys_to_virt(
187                 root->val & VTD_PAGE_MASK) :
188                 NULL);
189 }
190
191 /*
192  * low 64 bits:
193  * 0: present
194  * 1: fault processing disable
195  * 2-3: translation type
196  * 12-63: address space root
197  * high 64 bits:
198  * 0-2: address width
199  * 3-6: aval
200  * 8-23: domain id
201  */
202 struct context_entry {
203         u64 lo;
204         u64 hi;
205 };
206
207 static inline bool context_present(struct context_entry *context)
208 {
209         return (context->lo & 1);
210 }
211 static inline void context_set_present(struct context_entry *context)
212 {
213         context->lo |= 1;
214 }
215
216 static inline void context_set_fault_enable(struct context_entry *context)
217 {
218         context->lo &= (((u64)-1) << 2) | 1;
219 }
220
221 static inline void context_set_translation_type(struct context_entry *context,
222                                                 unsigned long value)
223 {
224         context->lo &= (((u64)-1) << 4) | 3;
225         context->lo |= (value & 3) << 2;
226 }
227
228 static inline void context_set_address_root(struct context_entry *context,
229                                             unsigned long value)
230 {
231         context->lo |= value & VTD_PAGE_MASK;
232 }
233
234 static inline void context_set_address_width(struct context_entry *context,
235                                              unsigned long value)
236 {
237         context->hi |= value & 7;
238 }
239
240 static inline void context_set_domain_id(struct context_entry *context,
241                                          unsigned long value)
242 {
243         context->hi |= (value & ((1 << 16) - 1)) << 8;
244 }
245
246 static inline void context_clear_entry(struct context_entry *context)
247 {
248         context->lo = 0;
249         context->hi = 0;
250 }
251
252 /*
253  * 0: readable
254  * 1: writable
255  * 2-6: reserved
256  * 7: super page
257  * 8-10: available
258  * 11: snoop behavior
259  * 12-63: Host physcial address
260  */
261 struct dma_pte {
262         u64 val;
263 };
264
265 static inline void dma_clear_pte(struct dma_pte *pte)
266 {
267         pte->val = 0;
268 }
269
270 static inline void dma_set_pte_readable(struct dma_pte *pte)
271 {
272         pte->val |= DMA_PTE_READ;
273 }
274
275 static inline void dma_set_pte_writable(struct dma_pte *pte)
276 {
277         pte->val |= DMA_PTE_WRITE;
278 }
279
280 static inline void dma_set_pte_snp(struct dma_pte *pte)
281 {
282         pte->val |= DMA_PTE_SNP;
283 }
284
285 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
286 {
287         pte->val = (pte->val & ~3) | (prot & 3);
288 }
289
290 static inline u64 dma_pte_addr(struct dma_pte *pte)
291 {
292 #ifdef CONFIG_64BIT
293         return pte->val & VTD_PAGE_MASK;
294 #else
295         /* Must have a full atomic 64-bit read */
296         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
297 #endif
298 }
299
300 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
301 {
302         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
303 }
304
305 static inline bool dma_pte_present(struct dma_pte *pte)
306 {
307         return (pte->val & 3) != 0;
308 }
309
310 static inline bool dma_pte_superpage(struct dma_pte *pte)
311 {
312         return (pte->val & (1 << 7));
313 }
314
315 static inline int first_pte_in_page(struct dma_pte *pte)
316 {
317         return !((unsigned long)pte & ~VTD_PAGE_MASK);
318 }
319
320 /*
321  * This domain is a statically identity mapping domain.
322  *      1. This domain creats a static 1:1 mapping to all usable memory.
323  *      2. It maps to each iommu if successful.
324  *      3. Each iommu mapps to this domain if successful.
325  */
326 static struct dmar_domain *si_domain;
327 static int hw_pass_through = 1;
328
329 /* devices under the same p2p bridge are owned in one domain */
330 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
331
332 /* domain represents a virtual machine, more than one devices
333  * across iommus may be owned in one domain, e.g. kvm guest.
334  */
335 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
336
337 /* si_domain contains mulitple devices */
338 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
339
340 struct dmar_domain {
341         int     id;                     /* domain id */
342         int     nid;                    /* node id */
343         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
344
345         struct list_head devices;       /* all devices' list */
346         struct iova_domain iovad;       /* iova's that belong to this domain */
347
348         struct dma_pte  *pgd;           /* virtual address */
349         int             gaw;            /* max guest address width */
350
351         /* adjusted guest address width, 0 is level 2 30-bit */
352         int             agaw;
353
354         int             flags;          /* flags to find out type of domain */
355
356         int             iommu_coherency;/* indicate coherency of iommu access */
357         int             iommu_snooping; /* indicate snooping control feature*/
358         int             iommu_count;    /* reference count of iommu */
359         int             iommu_superpage;/* Level of superpages supported:
360                                            0 == 4KiB (no superpages), 1 == 2MiB,
361                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
362         spinlock_t      iommu_lock;     /* protect iommu set in domain */
363         u64             max_addr;       /* maximum mapped address */
364 };
365
366 /* PCI domain-device relationship */
367 struct device_domain_info {
368         struct list_head link;  /* link to domain siblings */
369         struct list_head global; /* link to global list */
370         int segment;            /* PCI domain */
371         u8 bus;                 /* PCI bus number */
372         u8 devfn;               /* PCI devfn number */
373         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
374         struct intel_iommu *iommu; /* IOMMU used by this device */
375         struct dmar_domain *domain; /* pointer to domain */
376 };
377
378 static void flush_unmaps_timeout(unsigned long data);
379
380 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
381
382 #define HIGH_WATER_MARK 250
383 struct deferred_flush_tables {
384         int next;
385         struct iova *iova[HIGH_WATER_MARK];
386         struct dmar_domain *domain[HIGH_WATER_MARK];
387 };
388
389 static struct deferred_flush_tables *deferred_flush;
390
391 /* bitmap for indexing intel_iommus */
392 static int g_num_of_iommus;
393
394 static DEFINE_SPINLOCK(async_umap_flush_lock);
395 static LIST_HEAD(unmaps_to_do);
396
397 static int timer_on;
398 static long list_size;
399
400 static void domain_remove_dev_info(struct dmar_domain *domain);
401
402 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
403 int dmar_disabled = 0;
404 #else
405 int dmar_disabled = 1;
406 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
407
408 int intel_iommu_enabled = 0;
409 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
410
411 static int dmar_map_gfx = 1;
412 static int dmar_forcedac;
413 static int intel_iommu_strict;
414 static int intel_iommu_superpage = 1;
415
416 int intel_iommu_gfx_mapped;
417 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
418
419 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
420 static DEFINE_SPINLOCK(device_domain_lock);
421 static LIST_HEAD(device_domain_list);
422
423 static struct iommu_ops intel_iommu_ops;
424
425 static int __init intel_iommu_setup(char *str)
426 {
427         if (!str)
428                 return -EINVAL;
429         while (*str) {
430                 if (!strncmp(str, "on", 2)) {
431                         dmar_disabled = 0;
432                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
433                 } else if (!strncmp(str, "off", 3)) {
434                         dmar_disabled = 1;
435                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
436                 } else if (!strncmp(str, "igfx_off", 8)) {
437                         dmar_map_gfx = 0;
438                         printk(KERN_INFO
439                                 "Intel-IOMMU: disable GFX device mapping\n");
440                 } else if (!strncmp(str, "forcedac", 8)) {
441                         printk(KERN_INFO
442                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
443                         dmar_forcedac = 1;
444                 } else if (!strncmp(str, "strict", 6)) {
445                         printk(KERN_INFO
446                                 "Intel-IOMMU: disable batched IOTLB flush\n");
447                         intel_iommu_strict = 1;
448                 } else if (!strncmp(str, "sp_off", 6)) {
449                         printk(KERN_INFO
450                                 "Intel-IOMMU: disable supported super page\n");
451                         intel_iommu_superpage = 0;
452                 }
453
454                 str += strcspn(str, ",");
455                 while (*str == ',')
456                         str++;
457         }
458         return 0;
459 }
460 __setup("intel_iommu=", intel_iommu_setup);
461
462 static struct kmem_cache *iommu_domain_cache;
463 static struct kmem_cache *iommu_devinfo_cache;
464 static struct kmem_cache *iommu_iova_cache;
465
466 static inline void *alloc_pgtable_page(int node)
467 {
468         struct page *page;
469         void *vaddr = NULL;
470
471         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
472         if (page)
473                 vaddr = page_address(page);
474         return vaddr;
475 }
476
477 static inline void free_pgtable_page(void *vaddr)
478 {
479         free_page((unsigned long)vaddr);
480 }
481
482 static inline void *alloc_domain_mem(void)
483 {
484         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
485 }
486
487 static void free_domain_mem(void *vaddr)
488 {
489         kmem_cache_free(iommu_domain_cache, vaddr);
490 }
491
492 static inline void * alloc_devinfo_mem(void)
493 {
494         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
495 }
496
497 static inline void free_devinfo_mem(void *vaddr)
498 {
499         kmem_cache_free(iommu_devinfo_cache, vaddr);
500 }
501
502 struct iova *alloc_iova_mem(void)
503 {
504         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
505 }
506
507 void free_iova_mem(struct iova *iova)
508 {
509         kmem_cache_free(iommu_iova_cache, iova);
510 }
511
512
513 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
514 {
515         unsigned long sagaw;
516         int agaw = -1;
517
518         sagaw = cap_sagaw(iommu->cap);
519         for (agaw = width_to_agaw(max_gaw);
520              agaw >= 0; agaw--) {
521                 if (test_bit(agaw, &sagaw))
522                         break;
523         }
524
525         return agaw;
526 }
527
528 /*
529  * Calculate max SAGAW for each iommu.
530  */
531 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
532 {
533         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
534 }
535
536 /*
537  * calculate agaw for each iommu.
538  * "SAGAW" may be different across iommus, use a default agaw, and
539  * get a supported less agaw for iommus that don't support the default agaw.
540  */
541 int iommu_calculate_agaw(struct intel_iommu *iommu)
542 {
543         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
544 }
545
546 /* This functionin only returns single iommu in a domain */
547 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
548 {
549         int iommu_id;
550
551         /* si_domain and vm domain should not get here. */
552         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
553         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
554
555         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
556         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
557                 return NULL;
558
559         return g_iommus[iommu_id];
560 }
561
562 static void domain_update_iommu_coherency(struct dmar_domain *domain)
563 {
564         int i;
565
566         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
567
568         domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
569
570         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
571                 if (!ecap_coherent(g_iommus[i]->ecap)) {
572                         domain->iommu_coherency = 0;
573                         break;
574                 }
575         }
576 }
577
578 static void domain_update_iommu_snooping(struct dmar_domain *domain)
579 {
580         int i;
581
582         domain->iommu_snooping = 1;
583
584         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
585                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
586                         domain->iommu_snooping = 0;
587                         break;
588                 }
589         }
590 }
591
592 static void domain_update_iommu_superpage(struct dmar_domain *domain)
593 {
594         struct dmar_drhd_unit *drhd;
595         struct intel_iommu *iommu = NULL;
596         int mask = 0xf;
597
598         if (!intel_iommu_superpage) {
599                 domain->iommu_superpage = 0;
600                 return;
601         }
602
603         /* set iommu_superpage to the smallest common denominator */
604         for_each_active_iommu(iommu, drhd) {
605                 mask &= cap_super_page_val(iommu->cap);
606                 if (!mask) {
607                         break;
608                 }
609         }
610         domain->iommu_superpage = fls(mask);
611 }
612
613 /* Some capabilities may be different across iommus */
614 static void domain_update_iommu_cap(struct dmar_domain *domain)
615 {
616         domain_update_iommu_coherency(domain);
617         domain_update_iommu_snooping(domain);
618         domain_update_iommu_superpage(domain);
619 }
620
621 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
622 {
623         struct dmar_drhd_unit *drhd = NULL;
624         int i;
625
626         for_each_drhd_unit(drhd) {
627                 if (drhd->ignored)
628                         continue;
629                 if (segment != drhd->segment)
630                         continue;
631
632                 for (i = 0; i < drhd->devices_cnt; i++) {
633                         if (drhd->devices[i] &&
634                             drhd->devices[i]->bus->number == bus &&
635                             drhd->devices[i]->devfn == devfn)
636                                 return drhd->iommu;
637                         if (drhd->devices[i] &&
638                             drhd->devices[i]->subordinate &&
639                             drhd->devices[i]->subordinate->number <= bus &&
640                             drhd->devices[i]->subordinate->subordinate >= bus)
641                                 return drhd->iommu;
642                 }
643
644                 if (drhd->include_all)
645                         return drhd->iommu;
646         }
647
648         return NULL;
649 }
650
651 static void domain_flush_cache(struct dmar_domain *domain,
652                                void *addr, int size)
653 {
654         if (!domain->iommu_coherency)
655                 clflush_cache_range(addr, size);
656 }
657
658 /* Gets context entry for a given bus and devfn */
659 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
660                 u8 bus, u8 devfn)
661 {
662         struct root_entry *root;
663         struct context_entry *context;
664         unsigned long phy_addr;
665         unsigned long flags;
666
667         spin_lock_irqsave(&iommu->lock, flags);
668         root = &iommu->root_entry[bus];
669         context = get_context_addr_from_root(root);
670         if (!context) {
671                 context = (struct context_entry *)
672                                 alloc_pgtable_page(iommu->node);
673                 if (!context) {
674                         spin_unlock_irqrestore(&iommu->lock, flags);
675                         return NULL;
676                 }
677                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
678                 phy_addr = virt_to_phys((void *)context);
679                 set_root_value(root, phy_addr);
680                 set_root_present(root);
681                 __iommu_flush_cache(iommu, root, sizeof(*root));
682         }
683         spin_unlock_irqrestore(&iommu->lock, flags);
684         return &context[devfn];
685 }
686
687 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
688 {
689         struct root_entry *root;
690         struct context_entry *context;
691         int ret;
692         unsigned long flags;
693
694         spin_lock_irqsave(&iommu->lock, flags);
695         root = &iommu->root_entry[bus];
696         context = get_context_addr_from_root(root);
697         if (!context) {
698                 ret = 0;
699                 goto out;
700         }
701         ret = context_present(&context[devfn]);
702 out:
703         spin_unlock_irqrestore(&iommu->lock, flags);
704         return ret;
705 }
706
707 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
708 {
709         struct root_entry *root;
710         struct context_entry *context;
711         unsigned long flags;
712
713         spin_lock_irqsave(&iommu->lock, flags);
714         root = &iommu->root_entry[bus];
715         context = get_context_addr_from_root(root);
716         if (context) {
717                 context_clear_entry(&context[devfn]);
718                 __iommu_flush_cache(iommu, &context[devfn], \
719                         sizeof(*context));
720         }
721         spin_unlock_irqrestore(&iommu->lock, flags);
722 }
723
724 static void free_context_table(struct intel_iommu *iommu)
725 {
726         struct root_entry *root;
727         int i;
728         unsigned long flags;
729         struct context_entry *context;
730
731         spin_lock_irqsave(&iommu->lock, flags);
732         if (!iommu->root_entry) {
733                 goto out;
734         }
735         for (i = 0; i < ROOT_ENTRY_NR; i++) {
736                 root = &iommu->root_entry[i];
737                 context = get_context_addr_from_root(root);
738                 if (context)
739                         free_pgtable_page(context);
740         }
741         free_pgtable_page(iommu->root_entry);
742         iommu->root_entry = NULL;
743 out:
744         spin_unlock_irqrestore(&iommu->lock, flags);
745 }
746
747 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
748                                       unsigned long pfn, int target_level)
749 {
750         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
751         struct dma_pte *parent, *pte = NULL;
752         int level = agaw_to_level(domain->agaw);
753         int offset;
754
755         BUG_ON(!domain->pgd);
756         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
757         parent = domain->pgd;
758
759         while (level > 0) {
760                 void *tmp_page;
761
762                 offset = pfn_level_offset(pfn, level);
763                 pte = &parent[offset];
764                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
765                         break;
766                 if (level == target_level)
767                         break;
768
769                 if (!dma_pte_present(pte)) {
770                         uint64_t pteval;
771
772                         tmp_page = alloc_pgtable_page(domain->nid);
773
774                         if (!tmp_page)
775                                 return NULL;
776
777                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
778                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
779                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
780                                 /* Someone else set it while we were thinking; use theirs. */
781                                 free_pgtable_page(tmp_page);
782                         } else {
783                                 dma_pte_addr(pte);
784                                 domain_flush_cache(domain, pte, sizeof(*pte));
785                         }
786                 }
787                 parent = phys_to_virt(dma_pte_addr(pte));
788                 level--;
789         }
790
791         return pte;
792 }
793
794
795 /* return address's pte at specific level */
796 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
797                                          unsigned long pfn,
798                                          int level, int *large_page)
799 {
800         struct dma_pte *parent, *pte = NULL;
801         int total = agaw_to_level(domain->agaw);
802         int offset;
803
804         parent = domain->pgd;
805         while (level <= total) {
806                 offset = pfn_level_offset(pfn, total);
807                 pte = &parent[offset];
808                 if (level == total)
809                         return pte;
810
811                 if (!dma_pte_present(pte)) {
812                         *large_page = total;
813                         break;
814                 }
815
816                 if (pte->val & DMA_PTE_LARGE_PAGE) {
817                         *large_page = total;
818                         return pte;
819                 }
820
821                 parent = phys_to_virt(dma_pte_addr(pte));
822                 total--;
823         }
824         return NULL;
825 }
826
827 /* clear last level pte, a tlb flush should be followed */
828 static int dma_pte_clear_range(struct dmar_domain *domain,
829                                 unsigned long start_pfn,
830                                 unsigned long last_pfn)
831 {
832         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
833         unsigned int large_page = 1;
834         struct dma_pte *first_pte, *pte;
835         int order;
836
837         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
838         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
839         BUG_ON(start_pfn > last_pfn);
840
841         /* we don't need lock here; nobody else touches the iova range */
842         do {
843                 large_page = 1;
844                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
845                 if (!pte) {
846                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
847                         continue;
848                 }
849                 do {
850                         dma_clear_pte(pte);
851                         start_pfn += lvl_to_nr_pages(large_page);
852                         pte++;
853                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
854
855                 domain_flush_cache(domain, first_pte,
856                                    (void *)pte - (void *)first_pte);
857
858         } while (start_pfn && start_pfn <= last_pfn);
859
860         order = (large_page - 1) * 9;
861         return order;
862 }
863
864 static void dma_pte_free_level(struct dmar_domain *domain, int level,
865                                struct dma_pte *pte, unsigned long pfn,
866                                unsigned long start_pfn, unsigned long last_pfn)
867 {
868         pfn = max(start_pfn, pfn);
869         pte = &pte[pfn_level_offset(pfn, level)];
870
871         do {
872                 unsigned long level_pfn;
873                 struct dma_pte *level_pte;
874
875                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
876                         goto next;
877
878                 level_pfn = pfn & level_mask(level - 1);
879                 level_pte = phys_to_virt(dma_pte_addr(pte));
880
881                 if (level > 2)
882                         dma_pte_free_level(domain, level - 1, level_pte,
883                                            level_pfn, start_pfn, last_pfn);
884
885                 /* If range covers entire pagetable, free it */
886                 if (!(start_pfn > level_pfn ||
887                       last_pfn < level_pfn + level_size(level) - 1)) {
888                         dma_clear_pte(pte);
889                         domain_flush_cache(domain, pte, sizeof(*pte));
890                         free_pgtable_page(level_pte);
891                 }
892 next:
893                 pfn += level_size(level);
894         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
895 }
896
897 /* free page table pages. last level pte should already be cleared */
898 static void dma_pte_free_pagetable(struct dmar_domain *domain,
899                                    unsigned long start_pfn,
900                                    unsigned long last_pfn)
901 {
902         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
903
904         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
905         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
906         BUG_ON(start_pfn > last_pfn);
907
908         /* We don't need lock here; nobody else touches the iova range */
909         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
910                            domain->pgd, 0, start_pfn, last_pfn);
911
912         /* free pgd */
913         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
914                 free_pgtable_page(domain->pgd);
915                 domain->pgd = NULL;
916         }
917 }
918
919 /* iommu handling */
920 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
921 {
922         struct root_entry *root;
923         unsigned long flags;
924
925         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
926         if (!root)
927                 return -ENOMEM;
928
929         __iommu_flush_cache(iommu, root, ROOT_SIZE);
930
931         spin_lock_irqsave(&iommu->lock, flags);
932         iommu->root_entry = root;
933         spin_unlock_irqrestore(&iommu->lock, flags);
934
935         return 0;
936 }
937
938 static void iommu_set_root_entry(struct intel_iommu *iommu)
939 {
940         void *addr;
941         u32 sts;
942         unsigned long flag;
943
944         addr = iommu->root_entry;
945
946         raw_spin_lock_irqsave(&iommu->register_lock, flag);
947         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
948
949         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
950
951         /* Make sure hardware complete it */
952         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
953                       readl, (sts & DMA_GSTS_RTPS), sts);
954
955         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
956 }
957
958 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
959 {
960         u32 val;
961         unsigned long flag;
962
963         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
964                 return;
965
966         raw_spin_lock_irqsave(&iommu->register_lock, flag);
967         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
968
969         /* Make sure hardware complete it */
970         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
971                       readl, (!(val & DMA_GSTS_WBFS)), val);
972
973         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
974 }
975
976 /* return value determine if we need a write buffer flush */
977 static void __iommu_flush_context(struct intel_iommu *iommu,
978                                   u16 did, u16 source_id, u8 function_mask,
979                                   u64 type)
980 {
981         u64 val = 0;
982         unsigned long flag;
983
984         switch (type) {
985         case DMA_CCMD_GLOBAL_INVL:
986                 val = DMA_CCMD_GLOBAL_INVL;
987                 break;
988         case DMA_CCMD_DOMAIN_INVL:
989                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
990                 break;
991         case DMA_CCMD_DEVICE_INVL:
992                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
993                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
994                 break;
995         default:
996                 BUG();
997         }
998         val |= DMA_CCMD_ICC;
999
1000         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1001         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1002
1003         /* Make sure hardware complete it */
1004         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1005                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1006
1007         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1008 }
1009
1010 /* return value determine if we need a write buffer flush */
1011 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1012                                 u64 addr, unsigned int size_order, u64 type)
1013 {
1014         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1015         u64 val = 0, val_iva = 0;
1016         unsigned long flag;
1017
1018         switch (type) {
1019         case DMA_TLB_GLOBAL_FLUSH:
1020                 /* global flush doesn't need set IVA_REG */
1021                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1022                 break;
1023         case DMA_TLB_DSI_FLUSH:
1024                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1025                 break;
1026         case DMA_TLB_PSI_FLUSH:
1027                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1028                 /* Note: always flush non-leaf currently */
1029                 val_iva = size_order | addr;
1030                 break;
1031         default:
1032                 BUG();
1033         }
1034         /* Note: set drain read/write */
1035 #if 0
1036         /*
1037          * This is probably to be super secure.. Looks like we can
1038          * ignore it without any impact.
1039          */
1040         if (cap_read_drain(iommu->cap))
1041                 val |= DMA_TLB_READ_DRAIN;
1042 #endif
1043         if (cap_write_drain(iommu->cap))
1044                 val |= DMA_TLB_WRITE_DRAIN;
1045
1046         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1047         /* Note: Only uses first TLB reg currently */
1048         if (val_iva)
1049                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1050         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1051
1052         /* Make sure hardware complete it */
1053         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1054                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1055
1056         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1057
1058         /* check IOTLB invalidation granularity */
1059         if (DMA_TLB_IAIG(val) == 0)
1060                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1061         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1062                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1063                         (unsigned long long)DMA_TLB_IIRG(type),
1064                         (unsigned long long)DMA_TLB_IAIG(val));
1065 }
1066
1067 static struct device_domain_info *iommu_support_dev_iotlb(
1068         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1069 {
1070         int found = 0;
1071         unsigned long flags;
1072         struct device_domain_info *info;
1073         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1074
1075         if (!ecap_dev_iotlb_support(iommu->ecap))
1076                 return NULL;
1077
1078         if (!iommu->qi)
1079                 return NULL;
1080
1081         spin_lock_irqsave(&device_domain_lock, flags);
1082         list_for_each_entry(info, &domain->devices, link)
1083                 if (info->bus == bus && info->devfn == devfn) {
1084                         found = 1;
1085                         break;
1086                 }
1087         spin_unlock_irqrestore(&device_domain_lock, flags);
1088
1089         if (!found || !info->dev)
1090                 return NULL;
1091
1092         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1093                 return NULL;
1094
1095         if (!dmar_find_matched_atsr_unit(info->dev))
1096                 return NULL;
1097
1098         info->iommu = iommu;
1099
1100         return info;
1101 }
1102
1103 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1104 {
1105         if (!info)
1106                 return;
1107
1108         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1109 }
1110
1111 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1112 {
1113         if (!info->dev || !pci_ats_enabled(info->dev))
1114                 return;
1115
1116         pci_disable_ats(info->dev);
1117 }
1118
1119 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1120                                   u64 addr, unsigned mask)
1121 {
1122         u16 sid, qdep;
1123         unsigned long flags;
1124         struct device_domain_info *info;
1125
1126         spin_lock_irqsave(&device_domain_lock, flags);
1127         list_for_each_entry(info, &domain->devices, link) {
1128                 if (!info->dev || !pci_ats_enabled(info->dev))
1129                         continue;
1130
1131                 sid = info->bus << 8 | info->devfn;
1132                 qdep = pci_ats_queue_depth(info->dev);
1133                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1134         }
1135         spin_unlock_irqrestore(&device_domain_lock, flags);
1136 }
1137
1138 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1139                                   unsigned long pfn, unsigned int pages, int map)
1140 {
1141         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1142         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1143
1144         BUG_ON(pages == 0);
1145
1146         /*
1147          * Fallback to domain selective flush if no PSI support or the size is
1148          * too big.
1149          * PSI requires page size to be 2 ^ x, and the base address is naturally
1150          * aligned to the size
1151          */
1152         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1153                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1154                                                 DMA_TLB_DSI_FLUSH);
1155         else
1156                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1157                                                 DMA_TLB_PSI_FLUSH);
1158
1159         /*
1160          * In caching mode, changes of pages from non-present to present require
1161          * flush. However, device IOTLB doesn't need to be flushed in this case.
1162          */
1163         if (!cap_caching_mode(iommu->cap) || !map)
1164                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1165 }
1166
1167 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1168 {
1169         u32 pmen;
1170         unsigned long flags;
1171
1172         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1173         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1174         pmen &= ~DMA_PMEN_EPM;
1175         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1176
1177         /* wait for the protected region status bit to clear */
1178         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1179                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1180
1181         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1182 }
1183
1184 static int iommu_enable_translation(struct intel_iommu *iommu)
1185 {
1186         u32 sts;
1187         unsigned long flags;
1188
1189         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1190         iommu->gcmd |= DMA_GCMD_TE;
1191         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1192
1193         /* Make sure hardware complete it */
1194         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1195                       readl, (sts & DMA_GSTS_TES), sts);
1196
1197         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1198         return 0;
1199 }
1200
1201 static int iommu_disable_translation(struct intel_iommu *iommu)
1202 {
1203         u32 sts;
1204         unsigned long flag;
1205
1206         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1207         iommu->gcmd &= ~DMA_GCMD_TE;
1208         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1209
1210         /* Make sure hardware complete it */
1211         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1212                       readl, (!(sts & DMA_GSTS_TES)), sts);
1213
1214         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1215         return 0;
1216 }
1217
1218
1219 static int iommu_init_domains(struct intel_iommu *iommu)
1220 {
1221         unsigned long ndomains;
1222         unsigned long nlongs;
1223
1224         ndomains = cap_ndoms(iommu->cap);
1225         pr_debug("IOMMU %d: Number of Domains supportd <%ld>\n", iommu->seq_id,
1226                         ndomains);
1227         nlongs = BITS_TO_LONGS(ndomains);
1228
1229         spin_lock_init(&iommu->lock);
1230
1231         /* TBD: there might be 64K domains,
1232          * consider other allocation for future chip
1233          */
1234         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1235         if (!iommu->domain_ids) {
1236                 printk(KERN_ERR "Allocating domain id array failed\n");
1237                 return -ENOMEM;
1238         }
1239         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1240                         GFP_KERNEL);
1241         if (!iommu->domains) {
1242                 printk(KERN_ERR "Allocating domain array failed\n");
1243                 return -ENOMEM;
1244         }
1245
1246         /*
1247          * if Caching mode is set, then invalid translations are tagged
1248          * with domainid 0. Hence we need to pre-allocate it.
1249          */
1250         if (cap_caching_mode(iommu->cap))
1251                 set_bit(0, iommu->domain_ids);
1252         return 0;
1253 }
1254
1255
1256 static void domain_exit(struct dmar_domain *domain);
1257 static void vm_domain_exit(struct dmar_domain *domain);
1258
1259 void free_dmar_iommu(struct intel_iommu *iommu)
1260 {
1261         struct dmar_domain *domain;
1262         int i;
1263         unsigned long flags;
1264
1265         if ((iommu->domains) && (iommu->domain_ids)) {
1266                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1267                         domain = iommu->domains[i];
1268                         clear_bit(i, iommu->domain_ids);
1269
1270                         spin_lock_irqsave(&domain->iommu_lock, flags);
1271                         if (--domain->iommu_count == 0) {
1272                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1273                                         vm_domain_exit(domain);
1274                                 else
1275                                         domain_exit(domain);
1276                         }
1277                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1278                 }
1279         }
1280
1281         if (iommu->gcmd & DMA_GCMD_TE)
1282                 iommu_disable_translation(iommu);
1283
1284         if (iommu->irq) {
1285                 irq_set_handler_data(iommu->irq, NULL);
1286                 /* This will mask the irq */
1287                 free_irq(iommu->irq, iommu);
1288                 destroy_irq(iommu->irq);
1289         }
1290
1291         kfree(iommu->domains);
1292         kfree(iommu->domain_ids);
1293
1294         g_iommus[iommu->seq_id] = NULL;
1295
1296         /* if all iommus are freed, free g_iommus */
1297         for (i = 0; i < g_num_of_iommus; i++) {
1298                 if (g_iommus[i])
1299                         break;
1300         }
1301
1302         if (i == g_num_of_iommus)
1303                 kfree(g_iommus);
1304
1305         /* free context mapping */
1306         free_context_table(iommu);
1307 }
1308
1309 static struct dmar_domain *alloc_domain(void)
1310 {
1311         struct dmar_domain *domain;
1312
1313         domain = alloc_domain_mem();
1314         if (!domain)
1315                 return NULL;
1316
1317         domain->nid = -1;
1318         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1319         domain->flags = 0;
1320
1321         return domain;
1322 }
1323
1324 static int iommu_attach_domain(struct dmar_domain *domain,
1325                                struct intel_iommu *iommu)
1326 {
1327         int num;
1328         unsigned long ndomains;
1329         unsigned long flags;
1330
1331         ndomains = cap_ndoms(iommu->cap);
1332
1333         spin_lock_irqsave(&iommu->lock, flags);
1334
1335         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1336         if (num >= ndomains) {
1337                 spin_unlock_irqrestore(&iommu->lock, flags);
1338                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1339                 return -ENOMEM;
1340         }
1341
1342         domain->id = num;
1343         set_bit(num, iommu->domain_ids);
1344         set_bit(iommu->seq_id, &domain->iommu_bmp);
1345         iommu->domains[num] = domain;
1346         spin_unlock_irqrestore(&iommu->lock, flags);
1347
1348         return 0;
1349 }
1350
1351 static void iommu_detach_domain(struct dmar_domain *domain,
1352                                 struct intel_iommu *iommu)
1353 {
1354         unsigned long flags;
1355         int num, ndomains;
1356         int found = 0;
1357
1358         spin_lock_irqsave(&iommu->lock, flags);
1359         ndomains = cap_ndoms(iommu->cap);
1360         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1361                 if (iommu->domains[num] == domain) {
1362                         found = 1;
1363                         break;
1364                 }
1365         }
1366
1367         if (found) {
1368                 clear_bit(num, iommu->domain_ids);
1369                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1370                 iommu->domains[num] = NULL;
1371         }
1372         spin_unlock_irqrestore(&iommu->lock, flags);
1373 }
1374
1375 static struct iova_domain reserved_iova_list;
1376 static struct lock_class_key reserved_rbtree_key;
1377
1378 static int dmar_init_reserved_ranges(void)
1379 {
1380         struct pci_dev *pdev = NULL;
1381         struct iova *iova;
1382         int i;
1383
1384         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1385
1386         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1387                 &reserved_rbtree_key);
1388
1389         /* IOAPIC ranges shouldn't be accessed by DMA */
1390         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1391                 IOVA_PFN(IOAPIC_RANGE_END));
1392         if (!iova) {
1393                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1394                 return -ENODEV;
1395         }
1396
1397         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1398         for_each_pci_dev(pdev) {
1399                 struct resource *r;
1400
1401                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1402                         r = &pdev->resource[i];
1403                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1404                                 continue;
1405                         iova = reserve_iova(&reserved_iova_list,
1406                                             IOVA_PFN(r->start),
1407                                             IOVA_PFN(r->end));
1408                         if (!iova) {
1409                                 printk(KERN_ERR "Reserve iova failed\n");
1410                                 return -ENODEV;
1411                         }
1412                 }
1413         }
1414         return 0;
1415 }
1416
1417 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1418 {
1419         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1420 }
1421
1422 static inline int guestwidth_to_adjustwidth(int gaw)
1423 {
1424         int agaw;
1425         int r = (gaw - 12) % 9;
1426
1427         if (r == 0)
1428                 agaw = gaw;
1429         else
1430                 agaw = gaw + 9 - r;
1431         if (agaw > 64)
1432                 agaw = 64;
1433         return agaw;
1434 }
1435
1436 static int domain_init(struct dmar_domain *domain, int guest_width)
1437 {
1438         struct intel_iommu *iommu;
1439         int adjust_width, agaw;
1440         unsigned long sagaw;
1441
1442         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1443         spin_lock_init(&domain->iommu_lock);
1444
1445         domain_reserve_special_ranges(domain);
1446
1447         /* calculate AGAW */
1448         iommu = domain_get_iommu(domain);
1449         if (guest_width > cap_mgaw(iommu->cap))
1450                 guest_width = cap_mgaw(iommu->cap);
1451         domain->gaw = guest_width;
1452         adjust_width = guestwidth_to_adjustwidth(guest_width);
1453         agaw = width_to_agaw(adjust_width);
1454         sagaw = cap_sagaw(iommu->cap);
1455         if (!test_bit(agaw, &sagaw)) {
1456                 /* hardware doesn't support it, choose a bigger one */
1457                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1458                 agaw = find_next_bit(&sagaw, 5, agaw);
1459                 if (agaw >= 5)
1460                         return -ENODEV;
1461         }
1462         domain->agaw = agaw;
1463         INIT_LIST_HEAD(&domain->devices);
1464
1465         if (ecap_coherent(iommu->ecap))
1466                 domain->iommu_coherency = 1;
1467         else
1468                 domain->iommu_coherency = 0;
1469
1470         if (ecap_sc_support(iommu->ecap))
1471                 domain->iommu_snooping = 1;
1472         else
1473                 domain->iommu_snooping = 0;
1474
1475         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1476         domain->iommu_count = 1;
1477         domain->nid = iommu->node;
1478
1479         /* always allocate the top pgd */
1480         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1481         if (!domain->pgd)
1482                 return -ENOMEM;
1483         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1484         return 0;
1485 }
1486
1487 static void domain_exit(struct dmar_domain *domain)
1488 {
1489         struct dmar_drhd_unit *drhd;
1490         struct intel_iommu *iommu;
1491
1492         /* Domain 0 is reserved, so dont process it */
1493         if (!domain)
1494                 return;
1495
1496         /* Flush any lazy unmaps that may reference this domain */
1497         if (!intel_iommu_strict)
1498                 flush_unmaps_timeout(0);
1499
1500         domain_remove_dev_info(domain);
1501         /* destroy iovas */
1502         put_iova_domain(&domain->iovad);
1503
1504         /* clear ptes */
1505         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1506
1507         /* free page tables */
1508         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1509
1510         for_each_active_iommu(iommu, drhd)
1511                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1512                         iommu_detach_domain(domain, iommu);
1513
1514         free_domain_mem(domain);
1515 }
1516
1517 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1518                                  u8 bus, u8 devfn, int translation)
1519 {
1520         struct context_entry *context;
1521         unsigned long flags;
1522         struct intel_iommu *iommu;
1523         struct dma_pte *pgd;
1524         unsigned long num;
1525         unsigned long ndomains;
1526         int id;
1527         int agaw;
1528         struct device_domain_info *info = NULL;
1529
1530         pr_debug("Set context mapping for %02x:%02x.%d\n",
1531                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1532
1533         BUG_ON(!domain->pgd);
1534         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1535                translation != CONTEXT_TT_MULTI_LEVEL);
1536
1537         iommu = device_to_iommu(segment, bus, devfn);
1538         if (!iommu)
1539                 return -ENODEV;
1540
1541         context = device_to_context_entry(iommu, bus, devfn);
1542         if (!context)
1543                 return -ENOMEM;
1544         spin_lock_irqsave(&iommu->lock, flags);
1545         if (context_present(context)) {
1546                 spin_unlock_irqrestore(&iommu->lock, flags);
1547                 return 0;
1548         }
1549
1550         id = domain->id;
1551         pgd = domain->pgd;
1552
1553         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1554             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1555                 int found = 0;
1556
1557                 /* find an available domain id for this device in iommu */
1558                 ndomains = cap_ndoms(iommu->cap);
1559                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1560                         if (iommu->domains[num] == domain) {
1561                                 id = num;
1562                                 found = 1;
1563                                 break;
1564                         }
1565                 }
1566
1567                 if (found == 0) {
1568                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1569                         if (num >= ndomains) {
1570                                 spin_unlock_irqrestore(&iommu->lock, flags);
1571                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1572                                 return -EFAULT;
1573                         }
1574
1575                         set_bit(num, iommu->domain_ids);
1576                         iommu->domains[num] = domain;
1577                         id = num;
1578                 }
1579
1580                 /* Skip top levels of page tables for
1581                  * iommu which has less agaw than default.
1582                  * Unnecessary for PT mode.
1583                  */
1584                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1585                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1586                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1587                                 if (!dma_pte_present(pgd)) {
1588                                         spin_unlock_irqrestore(&iommu->lock, flags);
1589                                         return -ENOMEM;
1590                                 }
1591                         }
1592                 }
1593         }
1594
1595         context_set_domain_id(context, id);
1596
1597         if (translation != CONTEXT_TT_PASS_THROUGH) {
1598                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1599                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1600                                      CONTEXT_TT_MULTI_LEVEL;
1601         }
1602         /*
1603          * In pass through mode, AW must be programmed to indicate the largest
1604          * AGAW value supported by hardware. And ASR is ignored by hardware.
1605          */
1606         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1607                 context_set_address_width(context, iommu->msagaw);
1608         else {
1609                 context_set_address_root(context, virt_to_phys(pgd));
1610                 context_set_address_width(context, iommu->agaw);
1611         }
1612
1613         context_set_translation_type(context, translation);
1614         context_set_fault_enable(context);
1615         context_set_present(context);
1616         domain_flush_cache(domain, context, sizeof(*context));
1617
1618         /*
1619          * It's a non-present to present mapping. If hardware doesn't cache
1620          * non-present entry we only need to flush the write-buffer. If the
1621          * _does_ cache non-present entries, then it does so in the special
1622          * domain #0, which we have to flush:
1623          */
1624         if (cap_caching_mode(iommu->cap)) {
1625                 iommu->flush.flush_context(iommu, 0,
1626                                            (((u16)bus) << 8) | devfn,
1627                                            DMA_CCMD_MASK_NOBIT,
1628                                            DMA_CCMD_DEVICE_INVL);
1629                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1630         } else {
1631                 iommu_flush_write_buffer(iommu);
1632         }
1633         iommu_enable_dev_iotlb(info);
1634         spin_unlock_irqrestore(&iommu->lock, flags);
1635
1636         spin_lock_irqsave(&domain->iommu_lock, flags);
1637         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1638                 domain->iommu_count++;
1639                 if (domain->iommu_count == 1)
1640                         domain->nid = iommu->node;
1641                 domain_update_iommu_cap(domain);
1642         }
1643         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1644         return 0;
1645 }
1646
1647 static int
1648 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1649                         int translation)
1650 {
1651         int ret;
1652         struct pci_dev *tmp, *parent;
1653
1654         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1655                                          pdev->bus->number, pdev->devfn,
1656                                          translation);
1657         if (ret)
1658                 return ret;
1659
1660         /* dependent device mapping */
1661         tmp = pci_find_upstream_pcie_bridge(pdev);
1662         if (!tmp)
1663                 return 0;
1664         /* Secondary interface's bus number and devfn 0 */
1665         parent = pdev->bus->self;
1666         while (parent != tmp) {
1667                 ret = domain_context_mapping_one(domain,
1668                                                  pci_domain_nr(parent->bus),
1669                                                  parent->bus->number,
1670                                                  parent->devfn, translation);
1671                 if (ret)
1672                         return ret;
1673                 parent = parent->bus->self;
1674         }
1675         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1676                 return domain_context_mapping_one(domain,
1677                                         pci_domain_nr(tmp->subordinate),
1678                                         tmp->subordinate->number, 0,
1679                                         translation);
1680         else /* this is a legacy PCI bridge */
1681                 return domain_context_mapping_one(domain,
1682                                                   pci_domain_nr(tmp->bus),
1683                                                   tmp->bus->number,
1684                                                   tmp->devfn,
1685                                                   translation);
1686 }
1687
1688 static int domain_context_mapped(struct pci_dev *pdev)
1689 {
1690         int ret;
1691         struct pci_dev *tmp, *parent;
1692         struct intel_iommu *iommu;
1693
1694         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1695                                 pdev->devfn);
1696         if (!iommu)
1697                 return -ENODEV;
1698
1699         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1700         if (!ret)
1701                 return ret;
1702         /* dependent device mapping */
1703         tmp = pci_find_upstream_pcie_bridge(pdev);
1704         if (!tmp)
1705                 return ret;
1706         /* Secondary interface's bus number and devfn 0 */
1707         parent = pdev->bus->self;
1708         while (parent != tmp) {
1709                 ret = device_context_mapped(iommu, parent->bus->number,
1710                                             parent->devfn);
1711                 if (!ret)
1712                         return ret;
1713                 parent = parent->bus->self;
1714         }
1715         if (pci_is_pcie(tmp))
1716                 return device_context_mapped(iommu, tmp->subordinate->number,
1717                                              0);
1718         else
1719                 return device_context_mapped(iommu, tmp->bus->number,
1720                                              tmp->devfn);
1721 }
1722
1723 /* Returns a number of VTD pages, but aligned to MM page size */
1724 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1725                                             size_t size)
1726 {
1727         host_addr &= ~PAGE_MASK;
1728         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1729 }
1730
1731 /* Return largest possible superpage level for a given mapping */
1732 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1733                                           unsigned long iov_pfn,
1734                                           unsigned long phy_pfn,
1735                                           unsigned long pages)
1736 {
1737         int support, level = 1;
1738         unsigned long pfnmerge;
1739
1740         support = domain->iommu_superpage;
1741
1742         /* To use a large page, the virtual *and* physical addresses
1743            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1744            of them will mean we have to use smaller pages. So just
1745            merge them and check both at once. */
1746         pfnmerge = iov_pfn | phy_pfn;
1747
1748         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1749                 pages >>= VTD_STRIDE_SHIFT;
1750                 if (!pages)
1751                         break;
1752                 pfnmerge >>= VTD_STRIDE_SHIFT;
1753                 level++;
1754                 support--;
1755         }
1756         return level;
1757 }
1758
1759 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1760                             struct scatterlist *sg, unsigned long phys_pfn,
1761                             unsigned long nr_pages, int prot)
1762 {
1763         struct dma_pte *first_pte = NULL, *pte = NULL;
1764         phys_addr_t uninitialized_var(pteval);
1765         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1766         unsigned long sg_res;
1767         unsigned int largepage_lvl = 0;
1768         unsigned long lvl_pages = 0;
1769
1770         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1771
1772         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1773                 return -EINVAL;
1774
1775         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1776
1777         if (sg)
1778                 sg_res = 0;
1779         else {
1780                 sg_res = nr_pages + 1;
1781                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1782         }
1783
1784         while (nr_pages > 0) {
1785                 uint64_t tmp;
1786
1787                 if (!sg_res) {
1788                         sg_res = aligned_nrpages(sg->offset, sg->length);
1789                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1790                         sg->dma_length = sg->length;
1791                         pteval = page_to_phys(sg_page(sg)) | prot;
1792                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1793                 }
1794
1795                 if (!pte) {
1796                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1797
1798                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1799                         if (!pte)
1800                                 return -ENOMEM;
1801                         /* It is large page*/
1802                         if (largepage_lvl > 1) {
1803                                 pteval |= DMA_PTE_LARGE_PAGE;
1804                                 /* Ensure that old small page tables are removed to make room
1805                                    for superpage, if they exist. */
1806                                 dma_pte_clear_range(domain, iov_pfn,
1807                                                     iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1808                                 dma_pte_free_pagetable(domain, iov_pfn,
1809                                                        iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1810                         } else {
1811                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1812                         }
1813
1814                 }
1815                 /* We don't need lock here, nobody else
1816                  * touches the iova range
1817                  */
1818                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1819                 if (tmp) {
1820                         static int dumps = 5;
1821                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1822                                iov_pfn, tmp, (unsigned long long)pteval);
1823                         if (dumps) {
1824                                 dumps--;
1825                                 debug_dma_dump_mappings(NULL);
1826                         }
1827                         WARN_ON(1);
1828                 }
1829
1830                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1831
1832                 BUG_ON(nr_pages < lvl_pages);
1833                 BUG_ON(sg_res < lvl_pages);
1834
1835                 nr_pages -= lvl_pages;
1836                 iov_pfn += lvl_pages;
1837                 phys_pfn += lvl_pages;
1838                 pteval += lvl_pages * VTD_PAGE_SIZE;
1839                 sg_res -= lvl_pages;
1840
1841                 /* If the next PTE would be the first in a new page, then we
1842                    need to flush the cache on the entries we've just written.
1843                    And then we'll need to recalculate 'pte', so clear it and
1844                    let it get set again in the if (!pte) block above.
1845
1846                    If we're done (!nr_pages) we need to flush the cache too.
1847
1848                    Also if we've been setting superpages, we may need to
1849                    recalculate 'pte' and switch back to smaller pages for the
1850                    end of the mapping, if the trailing size is not enough to
1851                    use another superpage (i.e. sg_res < lvl_pages). */
1852                 pte++;
1853                 if (!nr_pages || first_pte_in_page(pte) ||
1854                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1855                         domain_flush_cache(domain, first_pte,
1856                                            (void *)pte - (void *)first_pte);
1857                         pte = NULL;
1858                 }
1859
1860                 if (!sg_res && nr_pages)
1861                         sg = sg_next(sg);
1862         }
1863         return 0;
1864 }
1865
1866 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1867                                     struct scatterlist *sg, unsigned long nr_pages,
1868                                     int prot)
1869 {
1870         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1871 }
1872
1873 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1874                                      unsigned long phys_pfn, unsigned long nr_pages,
1875                                      int prot)
1876 {
1877         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1878 }
1879
1880 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1881 {
1882         if (!iommu)
1883                 return;
1884
1885         clear_context_table(iommu, bus, devfn);
1886         iommu->flush.flush_context(iommu, 0, 0, 0,
1887                                            DMA_CCMD_GLOBAL_INVL);
1888         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1889 }
1890
1891 static void domain_remove_dev_info(struct dmar_domain *domain)
1892 {
1893         struct device_domain_info *info;
1894         unsigned long flags;
1895         struct intel_iommu *iommu;
1896
1897         spin_lock_irqsave(&device_domain_lock, flags);
1898         while (!list_empty(&domain->devices)) {
1899                 info = list_entry(domain->devices.next,
1900                         struct device_domain_info, link);
1901                 list_del(&info->link);
1902                 list_del(&info->global);
1903                 if (info->dev)
1904                         info->dev->dev.archdata.iommu = NULL;
1905                 spin_unlock_irqrestore(&device_domain_lock, flags);
1906
1907                 iommu_disable_dev_iotlb(info);
1908                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1909                 iommu_detach_dev(iommu, info->bus, info->devfn);
1910                 free_devinfo_mem(info);
1911
1912                 spin_lock_irqsave(&device_domain_lock, flags);
1913         }
1914         spin_unlock_irqrestore(&device_domain_lock, flags);
1915 }
1916
1917 /*
1918  * find_domain
1919  * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1920  */
1921 static struct dmar_domain *
1922 find_domain(struct pci_dev *pdev)
1923 {
1924         struct device_domain_info *info;
1925
1926         /* No lock here, assumes no domain exit in normal case */
1927         info = pdev->dev.archdata.iommu;
1928         if (info)
1929                 return info->domain;
1930         return NULL;
1931 }
1932
1933 /* domain is initialized */
1934 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1935 {
1936         struct dmar_domain *domain, *found = NULL;
1937         struct intel_iommu *iommu;
1938         struct dmar_drhd_unit *drhd;
1939         struct device_domain_info *info, *tmp;
1940         struct pci_dev *dev_tmp;
1941         unsigned long flags;
1942         int bus = 0, devfn = 0;
1943         int segment;
1944         int ret;
1945
1946         domain = find_domain(pdev);
1947         if (domain)
1948                 return domain;
1949
1950         segment = pci_domain_nr(pdev->bus);
1951
1952         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1953         if (dev_tmp) {
1954                 if (pci_is_pcie(dev_tmp)) {
1955                         bus = dev_tmp->subordinate->number;
1956                         devfn = 0;
1957                 } else {
1958                         bus = dev_tmp->bus->number;
1959                         devfn = dev_tmp->devfn;
1960                 }
1961                 spin_lock_irqsave(&device_domain_lock, flags);
1962                 list_for_each_entry(info, &device_domain_list, global) {
1963                         if (info->segment == segment &&
1964                             info->bus == bus && info->devfn == devfn) {
1965                                 found = info->domain;
1966                                 break;
1967                         }
1968                 }
1969                 spin_unlock_irqrestore(&device_domain_lock, flags);
1970                 /* pcie-pci bridge already has a domain, uses it */
1971                 if (found) {
1972                         domain = found;
1973                         goto found_domain;
1974                 }
1975         }
1976
1977         domain = alloc_domain();
1978         if (!domain)
1979                 goto error;
1980
1981         /* Allocate new domain for the device */
1982         drhd = dmar_find_matched_drhd_unit(pdev);
1983         if (!drhd) {
1984                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1985                         pci_name(pdev));
1986                 return NULL;
1987         }
1988         iommu = drhd->iommu;
1989
1990         ret = iommu_attach_domain(domain, iommu);
1991         if (ret) {
1992                 free_domain_mem(domain);
1993                 goto error;
1994         }
1995
1996         if (domain_init(domain, gaw)) {
1997                 domain_exit(domain);
1998                 goto error;
1999         }
2000
2001         /* register pcie-to-pci device */
2002         if (dev_tmp) {
2003                 info = alloc_devinfo_mem();
2004                 if (!info) {
2005                         domain_exit(domain);
2006                         goto error;
2007                 }
2008                 info->segment = segment;
2009                 info->bus = bus;
2010                 info->devfn = devfn;
2011                 info->dev = NULL;
2012                 info->domain = domain;
2013                 /* This domain is shared by devices under p2p bridge */
2014                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2015
2016                 /* pcie-to-pci bridge already has a domain, uses it */
2017                 found = NULL;
2018                 spin_lock_irqsave(&device_domain_lock, flags);
2019                 list_for_each_entry(tmp, &device_domain_list, global) {
2020                         if (tmp->segment == segment &&
2021                             tmp->bus == bus && tmp->devfn == devfn) {
2022                                 found = tmp->domain;
2023                                 break;
2024                         }
2025                 }
2026                 if (found) {
2027                         spin_unlock_irqrestore(&device_domain_lock, flags);
2028                         free_devinfo_mem(info);
2029                         domain_exit(domain);
2030                         domain = found;
2031                 } else {
2032                         list_add(&info->link, &domain->devices);
2033                         list_add(&info->global, &device_domain_list);
2034                         spin_unlock_irqrestore(&device_domain_lock, flags);
2035                 }
2036         }
2037
2038 found_domain:
2039         info = alloc_devinfo_mem();
2040         if (!info)
2041                 goto error;
2042         info->segment = segment;
2043         info->bus = pdev->bus->number;
2044         info->devfn = pdev->devfn;
2045         info->dev = pdev;
2046         info->domain = domain;
2047         spin_lock_irqsave(&device_domain_lock, flags);
2048         /* somebody is fast */
2049         found = find_domain(pdev);
2050         if (found != NULL) {
2051                 spin_unlock_irqrestore(&device_domain_lock, flags);
2052                 if (found != domain) {
2053                         domain_exit(domain);
2054                         domain = found;
2055                 }
2056                 free_devinfo_mem(info);
2057                 return domain;
2058         }
2059         list_add(&info->link, &domain->devices);
2060         list_add(&info->global, &device_domain_list);
2061         pdev->dev.archdata.iommu = info;
2062         spin_unlock_irqrestore(&device_domain_lock, flags);
2063         return domain;
2064 error:
2065         /* recheck it here, maybe others set it */
2066         return find_domain(pdev);
2067 }
2068
2069 static int iommu_identity_mapping;
2070 #define IDENTMAP_ALL            1
2071 #define IDENTMAP_GFX            2
2072 #define IDENTMAP_AZALIA         4
2073
2074 static int iommu_domain_identity_map(struct dmar_domain *domain,
2075                                      unsigned long long start,
2076                                      unsigned long long end)
2077 {
2078         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2079         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2080
2081         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2082                           dma_to_mm_pfn(last_vpfn))) {
2083                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2084                 return -ENOMEM;
2085         }
2086
2087         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2088                  start, end, domain->id);
2089         /*
2090          * RMRR range might have overlap with physical memory range,
2091          * clear it first
2092          */
2093         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2094
2095         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2096                                   last_vpfn - first_vpfn + 1,
2097                                   DMA_PTE_READ|DMA_PTE_WRITE);
2098 }
2099
2100 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2101                                       unsigned long long start,
2102                                       unsigned long long end)
2103 {
2104         struct dmar_domain *domain;
2105         int ret;
2106
2107         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2108         if (!domain)
2109                 return -ENOMEM;
2110
2111         /* For _hardware_ passthrough, don't bother. But for software
2112            passthrough, we do it anyway -- it may indicate a memory
2113            range which is reserved in E820, so which didn't get set
2114            up to start with in si_domain */
2115         if (domain == si_domain && hw_pass_through) {
2116                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2117                        pci_name(pdev), start, end);
2118                 return 0;
2119         }
2120
2121         printk(KERN_INFO
2122                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2123                pci_name(pdev), start, end);
2124         
2125         if (end < start) {
2126                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2127                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2128                         dmi_get_system_info(DMI_BIOS_VENDOR),
2129                         dmi_get_system_info(DMI_BIOS_VERSION),
2130                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2131                 ret = -EIO;
2132                 goto error;
2133         }
2134
2135         if (end >> agaw_to_width(domain->agaw)) {
2136                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2137                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2138                      agaw_to_width(domain->agaw),
2139                      dmi_get_system_info(DMI_BIOS_VENDOR),
2140                      dmi_get_system_info(DMI_BIOS_VERSION),
2141                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2142                 ret = -EIO;
2143                 goto error;
2144         }
2145
2146         ret = iommu_domain_identity_map(domain, start, end);
2147         if (ret)
2148                 goto error;
2149
2150         /* context entry init */
2151         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2152         if (ret)
2153                 goto error;
2154
2155         return 0;
2156
2157  error:
2158         domain_exit(domain);
2159         return ret;
2160 }
2161
2162 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2163         struct pci_dev *pdev)
2164 {
2165         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2166                 return 0;
2167         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2168                 rmrr->end_address);
2169 }
2170
2171 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2172 static inline void iommu_prepare_isa(void)
2173 {
2174         struct pci_dev *pdev;
2175         int ret;
2176
2177         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2178         if (!pdev)
2179                 return;
2180
2181         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2182         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2183
2184         if (ret)
2185                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2186                        "floppy might not work\n");
2187
2188 }
2189 #else
2190 static inline void iommu_prepare_isa(void)
2191 {
2192         return;
2193 }
2194 #endif /* !CONFIG_INTEL_IOMMU_FLPY_WA */
2195
2196 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2197
2198 static int __init si_domain_work_fn(unsigned long start_pfn,
2199                                     unsigned long end_pfn, void *datax)
2200 {
2201         int *ret = datax;
2202
2203         *ret = iommu_domain_identity_map(si_domain,
2204                                          (uint64_t)start_pfn << PAGE_SHIFT,
2205                                          (uint64_t)end_pfn << PAGE_SHIFT);
2206         return *ret;
2207
2208 }
2209
2210 static int __init si_domain_init(int hw)
2211 {
2212         struct dmar_drhd_unit *drhd;
2213         struct intel_iommu *iommu;
2214         int nid, ret = 0;
2215
2216         si_domain = alloc_domain();
2217         if (!si_domain)
2218                 return -EFAULT;
2219
2220         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2221
2222         for_each_active_iommu(iommu, drhd) {
2223                 ret = iommu_attach_domain(si_domain, iommu);
2224                 if (ret) {
2225                         domain_exit(si_domain);
2226                         return -EFAULT;
2227                 }
2228         }
2229
2230         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2231                 domain_exit(si_domain);
2232                 return -EFAULT;
2233         }
2234
2235         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2236
2237         if (hw)
2238                 return 0;
2239
2240         for_each_online_node(nid) {
2241                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2242                 if (ret)
2243                         return ret;
2244         }
2245
2246         return 0;
2247 }
2248
2249 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2250                                           struct pci_dev *pdev);
2251 static int identity_mapping(struct pci_dev *pdev)
2252 {
2253         struct device_domain_info *info;
2254
2255         if (likely(!iommu_identity_mapping))
2256                 return 0;
2257
2258         info = pdev->dev.archdata.iommu;
2259         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2260                 return (info->domain == si_domain);
2261
2262         return 0;
2263 }
2264
2265 static int domain_add_dev_info(struct dmar_domain *domain,
2266                                struct pci_dev *pdev,
2267                                int translation)
2268 {
2269         struct device_domain_info *info;
2270         unsigned long flags;
2271         int ret;
2272
2273         info = alloc_devinfo_mem();
2274         if (!info)
2275                 return -ENOMEM;
2276
2277         info->segment = pci_domain_nr(pdev->bus);
2278         info->bus = pdev->bus->number;
2279         info->devfn = pdev->devfn;
2280         info->dev = pdev;
2281         info->domain = domain;
2282
2283         spin_lock_irqsave(&device_domain_lock, flags);
2284         list_add(&info->link, &domain->devices);
2285         list_add(&info->global, &device_domain_list);
2286         pdev->dev.archdata.iommu = info;
2287         spin_unlock_irqrestore(&device_domain_lock, flags);
2288
2289         ret = domain_context_mapping(domain, pdev, translation);
2290         if (ret) {
2291                 spin_lock_irqsave(&device_domain_lock, flags);
2292                 list_del(&info->link);
2293                 list_del(&info->global);
2294                 pdev->dev.archdata.iommu = NULL;
2295                 spin_unlock_irqrestore(&device_domain_lock, flags);
2296                 free_devinfo_mem(info);
2297                 return ret;
2298         }
2299
2300         return 0;
2301 }
2302
2303 static bool device_has_rmrr(struct pci_dev *dev)
2304 {
2305         struct dmar_rmrr_unit *rmrr;
2306         int i;
2307
2308         for_each_rmrr_units(rmrr) {
2309                 for (i = 0; i < rmrr->devices_cnt; i++) {
2310                         /*
2311                          * Return TRUE if this RMRR contains the device that
2312                          * is passed in.
2313                          */
2314                         if (rmrr->devices[i] == dev)
2315                                 return true;
2316                 }
2317         }
2318         return false;
2319 }
2320
2321 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2322 {
2323
2324         /*
2325          * We want to prevent any device associated with an RMRR from
2326          * getting placed into the SI Domain. This is done because
2327          * problems exist when devices are moved in and out of domains
2328          * and their respective RMRR info is lost. We exempt USB devices
2329          * from this process due to their usage of RMRRs that are known
2330          * to not be needed after BIOS hand-off to OS.
2331          */
2332         if (device_has_rmrr(pdev) &&
2333             (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2334                 return 0;
2335
2336         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2337                 return 1;
2338
2339         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2340                 return 1;
2341
2342         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2343                 return 0;
2344
2345         /*
2346          * We want to start off with all devices in the 1:1 domain, and
2347          * take them out later if we find they can't access all of memory.
2348          *
2349          * However, we can't do this for PCI devices behind bridges,
2350          * because all PCI devices behind the same bridge will end up
2351          * with the same source-id on their transactions.
2352          *
2353          * Practically speaking, we can't change things around for these
2354          * devices at run-time, because we can't be sure there'll be no
2355          * DMA transactions in flight for any of their siblings.
2356          * 
2357          * So PCI devices (unless they're on the root bus) as well as
2358          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2359          * the 1:1 domain, just in _case_ one of their siblings turns out
2360          * not to be able to map all of memory.
2361          */
2362         if (!pci_is_pcie(pdev)) {
2363                 if (!pci_is_root_bus(pdev->bus))
2364                         return 0;
2365                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2366                         return 0;
2367         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2368                 return 0;
2369
2370         /* 
2371          * At boot time, we don't yet know if devices will be 64-bit capable.
2372          * Assume that they will -- if they turn out not to be, then we can 
2373          * take them out of the 1:1 domain later.
2374          */
2375         if (!startup) {
2376                 /*
2377                  * If the device's dma_mask is less than the system's memory
2378                  * size then this is not a candidate for identity mapping.
2379                  */
2380                 u64 dma_mask = pdev->dma_mask;
2381
2382                 if (pdev->dev.coherent_dma_mask &&
2383                     pdev->dev.coherent_dma_mask < dma_mask)
2384                         dma_mask = pdev->dev.coherent_dma_mask;
2385
2386                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2387         }
2388
2389         return 1;
2390 }
2391
2392 static int __init iommu_prepare_static_identity_mapping(int hw)
2393 {
2394         struct pci_dev *pdev = NULL;
2395         int ret;
2396
2397         ret = si_domain_init(hw);
2398         if (ret)
2399                 return -EFAULT;
2400
2401         for_each_pci_dev(pdev) {
2402                 /* Skip Host/PCI Bridge devices */
2403                 if (IS_BRIDGE_HOST_DEVICE(pdev))
2404                         continue;
2405                 if (iommu_should_identity_map(pdev, 1)) {
2406                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2407                                hw ? "hardware" : "software", pci_name(pdev));
2408
2409                         ret = domain_add_dev_info(si_domain, pdev,
2410                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2411                                                      CONTEXT_TT_MULTI_LEVEL);
2412                         if (ret)
2413                                 return ret;
2414                 }
2415         }
2416
2417         return 0;
2418 }
2419
2420 static int __init init_dmars(void)
2421 {
2422         struct dmar_drhd_unit *drhd;
2423         struct dmar_rmrr_unit *rmrr;
2424         struct pci_dev *pdev;
2425         struct intel_iommu *iommu;
2426         int i, ret;
2427
2428         /*
2429          * for each drhd
2430          *    allocate root
2431          *    initialize and program root entry to not present
2432          * endfor
2433          */
2434         for_each_drhd_unit(drhd) {
2435                 g_num_of_iommus++;
2436                 /*
2437                  * lock not needed as this is only incremented in the single
2438                  * threaded kernel __init code path all other access are read
2439                  * only
2440                  */
2441         }
2442
2443         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2444                         GFP_KERNEL);
2445         if (!g_iommus) {
2446                 printk(KERN_ERR "Allocating global iommu array failed\n");
2447                 ret = -ENOMEM;
2448                 goto error;
2449         }
2450
2451         deferred_flush = kzalloc(g_num_of_iommus *
2452                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2453         if (!deferred_flush) {
2454                 ret = -ENOMEM;
2455                 goto error;
2456         }
2457
2458         for_each_drhd_unit(drhd) {
2459                 if (drhd->ignored)
2460                         continue;
2461
2462                 iommu = drhd->iommu;
2463                 g_iommus[iommu->seq_id] = iommu;
2464
2465                 ret = iommu_init_domains(iommu);
2466                 if (ret)
2467                         goto error;
2468
2469                 /*
2470                  * TBD:
2471                  * we could share the same root & context tables
2472                  * among all IOMMU's. Need to Split it later.
2473                  */
2474                 ret = iommu_alloc_root_entry(iommu);
2475                 if (ret) {
2476                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2477                         goto error;
2478                 }
2479                 if (!ecap_pass_through(iommu->ecap))
2480                         hw_pass_through = 0;
2481         }
2482
2483         /*
2484          * Start from the sane iommu hardware state.
2485          */
2486         for_each_drhd_unit(drhd) {
2487                 if (drhd->ignored)
2488                         continue;
2489
2490                 iommu = drhd->iommu;
2491
2492                 /*
2493                  * If the queued invalidation is already initialized by us
2494                  * (for example, while enabling interrupt-remapping) then
2495                  * we got the things already rolling from a sane state.
2496                  */
2497                 if (iommu->qi)
2498                         continue;
2499
2500                 /*
2501                  * Clear any previous faults.
2502                  */
2503                 dmar_fault(-1, iommu);
2504                 /*
2505                  * Disable queued invalidation if supported and already enabled
2506                  * before OS handover.
2507                  */
2508                 dmar_disable_qi(iommu);
2509         }
2510
2511         for_each_drhd_unit(drhd) {
2512                 if (drhd->ignored)
2513                         continue;
2514
2515                 iommu = drhd->iommu;
2516
2517                 if (dmar_enable_qi(iommu)) {
2518                         /*
2519                          * Queued Invalidate not enabled, use Register Based
2520                          * Invalidate
2521                          */
2522                         iommu->flush.flush_context = __iommu_flush_context;
2523                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2524                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2525                                "invalidation\n",
2526                                 iommu->seq_id,
2527                                (unsigned long long)drhd->reg_base_addr);
2528                 } else {
2529                         iommu->flush.flush_context = qi_flush_context;
2530                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2531                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2532                                "invalidation\n",
2533                                 iommu->seq_id,
2534                                (unsigned long long)drhd->reg_base_addr);
2535                 }
2536         }
2537
2538         if (iommu_pass_through)
2539                 iommu_identity_mapping |= IDENTMAP_ALL;
2540
2541 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2542         iommu_identity_mapping |= IDENTMAP_GFX;
2543 #endif
2544
2545         check_tylersburg_isoch();
2546
2547         /*
2548          * If pass through is not set or not enabled, setup context entries for
2549          * identity mappings for rmrr, gfx, and isa and may fall back to static
2550          * identity mapping if iommu_identity_mapping is set.
2551          */
2552         if (iommu_identity_mapping) {
2553                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2554                 if (ret) {
2555                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2556                         goto error;
2557                 }
2558         }
2559         /*
2560          * For each rmrr
2561          *   for each dev attached to rmrr
2562          *   do
2563          *     locate drhd for dev, alloc domain for dev
2564          *     allocate free domain
2565          *     allocate page table entries for rmrr
2566          *     if context not allocated for bus
2567          *           allocate and init context
2568          *           set present in root table for this bus
2569          *     init context with domain, translation etc
2570          *    endfor
2571          * endfor
2572          */
2573         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2574         for_each_rmrr_units(rmrr) {
2575                 for (i = 0; i < rmrr->devices_cnt; i++) {
2576                         pdev = rmrr->devices[i];
2577                         /*
2578                          * some BIOS lists non-exist devices in DMAR
2579                          * table.
2580                          */
2581                         if (!pdev)
2582                                 continue;
2583                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2584                         if (ret)
2585                                 printk(KERN_ERR
2586                                        "IOMMU: mapping reserved region failed\n");
2587                 }
2588         }
2589
2590         iommu_prepare_isa();
2591
2592         /*
2593          * for each drhd
2594          *   enable fault log
2595          *   global invalidate context cache
2596          *   global invalidate iotlb
2597          *   enable translation
2598          */
2599         for_each_drhd_unit(drhd) {
2600                 if (drhd->ignored) {
2601                         /*
2602                          * we always have to disable PMRs or DMA may fail on
2603                          * this device
2604                          */
2605                         if (force_on)
2606                                 iommu_disable_protect_mem_regions(drhd->iommu);
2607                         continue;
2608                 }
2609                 iommu = drhd->iommu;
2610
2611                 iommu_flush_write_buffer(iommu);
2612
2613                 ret = dmar_set_interrupt(iommu);
2614                 if (ret)
2615                         goto error;
2616
2617                 iommu_set_root_entry(iommu);
2618
2619                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2620                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2621
2622                 ret = iommu_enable_translation(iommu);
2623                 if (ret)
2624                         goto error;
2625
2626                 iommu_disable_protect_mem_regions(iommu);
2627         }
2628
2629         return 0;
2630 error:
2631         for_each_drhd_unit(drhd) {
2632                 if (drhd->ignored)
2633                         continue;
2634                 iommu = drhd->iommu;
2635                 free_iommu(iommu);
2636         }
2637         kfree(g_iommus);
2638         return ret;
2639 }
2640
2641 /* This takes a number of _MM_ pages, not VTD pages */
2642 static struct iova *intel_alloc_iova(struct device *dev,
2643                                      struct dmar_domain *domain,
2644                                      unsigned long nrpages, uint64_t dma_mask)
2645 {
2646         struct pci_dev *pdev = to_pci_dev(dev);
2647         struct iova *iova = NULL;
2648
2649         /* Restrict dma_mask to the width that the iommu can handle */
2650         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2651
2652         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2653                 /*
2654                  * First try to allocate an io virtual address in
2655                  * DMA_BIT_MASK(32) and if that fails then try allocating
2656                  * from higher range
2657                  */
2658                 iova = alloc_iova(&domain->iovad, nrpages,
2659                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2660                 if (iova)
2661                         return iova;
2662         }
2663         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2664         if (unlikely(!iova)) {
2665                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2666                        nrpages, pci_name(pdev));
2667                 return NULL;
2668         }
2669
2670         return iova;
2671 }
2672
2673 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2674 {
2675         struct dmar_domain *domain;
2676         int ret;
2677
2678         domain = get_domain_for_dev(pdev,
2679                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2680         if (!domain) {
2681                 printk(KERN_ERR
2682                         "Allocating domain for %s failed", pci_name(pdev));
2683                 return NULL;
2684         }
2685
2686         /* make sure context mapping is ok */
2687         if (unlikely(!domain_context_mapped(pdev))) {
2688                 ret = domain_context_mapping(domain, pdev,
2689                                              CONTEXT_TT_MULTI_LEVEL);
2690                 if (ret) {
2691                         printk(KERN_ERR
2692                                 "Domain context map for %s failed",
2693                                 pci_name(pdev));
2694                         return NULL;
2695                 }
2696         }
2697
2698         return domain;
2699 }
2700
2701 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2702 {
2703         struct device_domain_info *info;
2704
2705         /* No lock here, assumes no domain exit in normal case */
2706         info = dev->dev.archdata.iommu;
2707         if (likely(info))
2708                 return info->domain;
2709
2710         return __get_valid_domain_for_dev(dev);
2711 }
2712
2713 static int iommu_dummy(struct pci_dev *pdev)
2714 {
2715         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2716 }
2717
2718 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2719 static int iommu_no_mapping(struct device *dev)
2720 {
2721         struct pci_dev *pdev;
2722         int found;
2723
2724         if (unlikely(dev->bus != &pci_bus_type))
2725                 return 1;
2726
2727         pdev = to_pci_dev(dev);
2728         if (iommu_dummy(pdev))
2729                 return 1;
2730
2731         if (!iommu_identity_mapping)
2732                 return 0;
2733
2734         found = identity_mapping(pdev);
2735         if (found) {
2736                 if (iommu_should_identity_map(pdev, 0))
2737                         return 1;
2738                 else {
2739                         /*
2740                          * 32 bit DMA is removed from si_domain and fall back
2741                          * to non-identity mapping.
2742                          */
2743                         domain_remove_one_dev_info(si_domain, pdev);
2744                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2745                                pci_name(pdev));
2746                         return 0;
2747                 }
2748         } else {
2749                 /*
2750                  * In case of a detached 64 bit DMA device from vm, the device
2751                  * is put into si_domain for identity mapping.
2752                  */
2753                 if (iommu_should_identity_map(pdev, 0)) {
2754                         int ret;
2755                         ret = domain_add_dev_info(si_domain, pdev,
2756                                                   hw_pass_through ?
2757                                                   CONTEXT_TT_PASS_THROUGH :
2758                                                   CONTEXT_TT_MULTI_LEVEL);
2759                         if (!ret) {
2760                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2761                                        pci_name(pdev));
2762                                 return 1;
2763                         }
2764                 }
2765         }
2766
2767         return 0;
2768 }
2769
2770 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2771                                      size_t size, int dir, u64 dma_mask)
2772 {
2773         struct pci_dev *pdev = to_pci_dev(hwdev);
2774         struct dmar_domain *domain;
2775         phys_addr_t start_paddr;
2776         struct iova *iova;
2777         int prot = 0;
2778         int ret;
2779         struct intel_iommu *iommu;
2780         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2781
2782         BUG_ON(dir == DMA_NONE);
2783
2784         if (iommu_no_mapping(hwdev))
2785                 return paddr;
2786
2787         domain = get_valid_domain_for_dev(pdev);
2788         if (!domain)
2789                 return 0;
2790
2791         iommu = domain_get_iommu(domain);
2792         size = aligned_nrpages(paddr, size);
2793
2794         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2795         if (!iova)
2796                 goto error;
2797
2798         /*
2799          * Check if DMAR supports zero-length reads on write only
2800          * mappings..
2801          */
2802         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2803                         !cap_zlr(iommu->cap))
2804                 prot |= DMA_PTE_READ;
2805         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2806                 prot |= DMA_PTE_WRITE;
2807         /*
2808          * paddr - (paddr + size) might be partial page, we should map the whole
2809          * page.  Note: if two part of one page are separately mapped, we
2810          * might have two guest_addr mapping to the same host paddr, but this
2811          * is not a big problem
2812          */
2813         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2814                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2815         if (ret)
2816                 goto error;
2817
2818         /* it's a non-present to present mapping. Only flush if caching mode */
2819         if (cap_caching_mode(iommu->cap))
2820                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2821         else
2822                 iommu_flush_write_buffer(iommu);
2823
2824         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2825         start_paddr += paddr & ~PAGE_MASK;
2826         return start_paddr;
2827
2828 error:
2829         if (iova)
2830                 __free_iova(&domain->iovad, iova);
2831         printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2832                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2833         return 0;
2834 }
2835
2836 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2837                                  unsigned long offset, size_t size,
2838                                  enum dma_data_direction dir,
2839                                  struct dma_attrs *attrs)
2840 {
2841         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2842                                   dir, to_pci_dev(dev)->dma_mask);
2843 }
2844
2845 static void flush_unmaps(void)
2846 {
2847         int i, j;
2848
2849         timer_on = 0;
2850
2851         /* just flush them all */
2852         for (i = 0; i < g_num_of_iommus; i++) {
2853                 struct intel_iommu *iommu = g_iommus[i];
2854                 if (!iommu)
2855                         continue;
2856
2857                 if (!deferred_flush[i].next)
2858                         continue;
2859
2860                 /* In caching mode, global flushes turn emulation expensive */
2861                 if (!cap_caching_mode(iommu->cap))
2862                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2863                                          DMA_TLB_GLOBAL_FLUSH);
2864                 for (j = 0; j < deferred_flush[i].next; j++) {
2865                         unsigned long mask;
2866                         struct iova *iova = deferred_flush[i].iova[j];
2867                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2868
2869                         /* On real hardware multiple invalidations are expensive */
2870                         if (cap_caching_mode(iommu->cap))
2871                                 iommu_flush_iotlb_psi(iommu, domain->id,
2872                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2873                         else {
2874                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2875                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2876                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2877                         }
2878                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2879                 }
2880                 deferred_flush[i].next = 0;
2881         }
2882
2883         list_size = 0;
2884 }
2885
2886 static void flush_unmaps_timeout(unsigned long data)
2887 {
2888         unsigned long flags;
2889
2890         spin_lock_irqsave(&async_umap_flush_lock, flags);
2891         flush_unmaps();
2892         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2893 }
2894
2895 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2896 {
2897         unsigned long flags;
2898         int next, iommu_id;
2899         struct intel_iommu *iommu;
2900
2901         spin_lock_irqsave(&async_umap_flush_lock, flags);
2902         if (list_size == HIGH_WATER_MARK)
2903                 flush_unmaps();
2904
2905         iommu = domain_get_iommu(dom);
2906         iommu_id = iommu->seq_id;
2907
2908         next = deferred_flush[iommu_id].next;
2909         deferred_flush[iommu_id].domain[next] = dom;
2910         deferred_flush[iommu_id].iova[next] = iova;
2911         deferred_flush[iommu_id].next++;
2912
2913         if (!timer_on) {
2914                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2915                 timer_on = 1;
2916         }
2917         list_size++;
2918         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2919 }
2920
2921 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2922                              size_t size, enum dma_data_direction dir,
2923                              struct dma_attrs *attrs)
2924 {
2925         struct pci_dev *pdev = to_pci_dev(dev);
2926         struct dmar_domain *domain;
2927         unsigned long start_pfn, last_pfn;
2928         struct iova *iova;
2929         struct intel_iommu *iommu;
2930
2931         if (iommu_no_mapping(dev))
2932                 return;
2933
2934         domain = find_domain(pdev);
2935         BUG_ON(!domain);
2936
2937         iommu = domain_get_iommu(domain);
2938
2939         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2940         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2941                       (unsigned long long)dev_addr))
2942                 return;
2943
2944         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2945         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2946
2947         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2948                  pci_name(pdev), start_pfn, last_pfn);
2949
2950         /*  clear the whole page */
2951         dma_pte_clear_range(domain, start_pfn, last_pfn);
2952
2953         /* free page tables */
2954         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2955
2956         if (intel_iommu_strict) {
2957                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2958                                       last_pfn - start_pfn + 1, 0);
2959                 /* free iova */
2960                 __free_iova(&domain->iovad, iova);
2961         } else {
2962                 add_unmap(domain, iova);
2963                 /*
2964                  * queue up the release of the unmap to save the 1/6th of the
2965                  * cpu used up by the iotlb flush operation...
2966                  */
2967         }
2968 }
2969
2970 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2971                                   dma_addr_t *dma_handle, gfp_t flags)
2972 {
2973         void *vaddr;
2974         int order;
2975
2976         size = PAGE_ALIGN(size);
2977         order = get_order(size);
2978
2979         if (!iommu_no_mapping(hwdev))
2980                 flags &= ~(GFP_DMA | GFP_DMA32);
2981         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2982                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2983                         flags |= GFP_DMA;
2984                 else
2985                         flags |= GFP_DMA32;
2986         }
2987
2988         vaddr = (void *)__get_free_pages(flags, order);
2989         if (!vaddr)
2990                 return NULL;
2991         memset(vaddr, 0, size);
2992
2993         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2994                                          DMA_BIDIRECTIONAL,
2995                                          hwdev->coherent_dma_mask);
2996         if (*dma_handle)
2997                 return vaddr;
2998         free_pages((unsigned long)vaddr, order);
2999         return NULL;
3000 }
3001
3002 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3003                                 dma_addr_t dma_handle)
3004 {
3005         int order;
3006
3007         size = PAGE_ALIGN(size);
3008         order = get_order(size);
3009
3010         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3011         free_pages((unsigned long)vaddr, order);
3012 }
3013
3014 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3015                            int nelems, enum dma_data_direction dir,
3016                            struct dma_attrs *attrs)
3017 {
3018         struct pci_dev *pdev = to_pci_dev(hwdev);
3019         struct dmar_domain *domain;
3020         unsigned long start_pfn, last_pfn;
3021         struct iova *iova;
3022         struct intel_iommu *iommu;
3023
3024         if (iommu_no_mapping(hwdev))
3025                 return;
3026
3027         domain = find_domain(pdev);
3028         BUG_ON(!domain);
3029
3030         iommu = domain_get_iommu(domain);
3031
3032         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3033         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3034                       (unsigned long long)sglist[0].dma_address))
3035                 return;
3036
3037         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3038         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3039
3040         /*  clear the whole page */
3041         dma_pte_clear_range(domain, start_pfn, last_pfn);
3042
3043         /* free page tables */
3044         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3045
3046         if (intel_iommu_strict) {
3047                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3048                                       last_pfn - start_pfn + 1, 0);
3049                 /* free iova */
3050                 __free_iova(&domain->iovad, iova);
3051         } else {
3052                 add_unmap(domain, iova);
3053                 /*
3054                  * queue up the release of the unmap to save the 1/6th of the
3055                  * cpu used up by the iotlb flush operation...
3056                  */
3057         }
3058 }
3059
3060 static int intel_nontranslate_map_sg(struct device *hddev,
3061         struct scatterlist *sglist, int nelems, int dir)
3062 {
3063         int i;
3064         struct scatterlist *sg;
3065
3066         for_each_sg(sglist, sg, nelems, i) {
3067                 BUG_ON(!sg_page(sg));
3068                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3069                 sg->dma_length = sg->length;
3070         }
3071         return nelems;
3072 }
3073
3074 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3075                         enum dma_data_direction dir, struct dma_attrs *attrs)
3076 {
3077         int i;
3078         struct pci_dev *pdev = to_pci_dev(hwdev);
3079         struct dmar_domain *domain;
3080         size_t size = 0;
3081         int prot = 0;
3082         struct iova *iova = NULL;
3083         int ret;
3084         struct scatterlist *sg;
3085         unsigned long start_vpfn;
3086         struct intel_iommu *iommu;
3087
3088         BUG_ON(dir == DMA_NONE);
3089         if (iommu_no_mapping(hwdev))
3090                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3091
3092         domain = get_valid_domain_for_dev(pdev);
3093         if (!domain)
3094                 return 0;
3095
3096         iommu = domain_get_iommu(domain);
3097
3098         for_each_sg(sglist, sg, nelems, i)
3099                 size += aligned_nrpages(sg->offset, sg->length);
3100
3101         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3102                                 pdev->dma_mask);
3103         if (!iova) {
3104                 sglist->dma_length = 0;
3105                 return 0;
3106         }
3107
3108         /*
3109          * Check if DMAR supports zero-length reads on write only
3110          * mappings..
3111          */
3112         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3113                         !cap_zlr(iommu->cap))
3114                 prot |= DMA_PTE_READ;
3115         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3116                 prot |= DMA_PTE_WRITE;
3117
3118         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3119
3120         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3121         if (unlikely(ret)) {
3122                 /*  clear the page */
3123                 dma_pte_clear_range(domain, start_vpfn,
3124                                     start_vpfn + size - 1);
3125                 /* free page tables */
3126                 dma_pte_free_pagetable(domain, start_vpfn,
3127                                        start_vpfn + size - 1);
3128                 /* free iova */
3129                 __free_iova(&domain->iovad, iova);
3130                 return 0;
3131         }
3132
3133         /* it's a non-present to present mapping. Only flush if caching mode */
3134         if (cap_caching_mode(iommu->cap))
3135                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3136         else
3137                 iommu_flush_write_buffer(iommu);
3138
3139         return nelems;
3140 }
3141
3142 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3143 {
3144         return !dma_addr;
3145 }
3146
3147 struct dma_map_ops intel_dma_ops = {
3148         .alloc_coherent = intel_alloc_coherent,
3149         .free_coherent = intel_free_coherent,
3150         .map_sg = intel_map_sg,
3151         .unmap_sg = intel_unmap_sg,
3152         .map_page = intel_map_page,
3153         .unmap_page = intel_unmap_page,
3154         .mapping_error = intel_mapping_error,
3155 };
3156
3157 static inline int iommu_domain_cache_init(void)
3158 {
3159         int ret = 0;
3160
3161         iommu_domain_cache = kmem_cache_create("iommu_domain",
3162                                          sizeof(struct dmar_domain),
3163                                          0,
3164                                          SLAB_HWCACHE_ALIGN,
3165
3166                                          NULL);
3167         if (!iommu_domain_cache) {
3168                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3169                 ret = -ENOMEM;
3170         }
3171
3172         return ret;
3173 }
3174
3175 static inline int iommu_devinfo_cache_init(void)
3176 {
3177         int ret = 0;
3178
3179         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3180                                          sizeof(struct device_domain_info),
3181                                          0,
3182                                          SLAB_HWCACHE_ALIGN,
3183                                          NULL);
3184         if (!iommu_devinfo_cache) {
3185                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3186                 ret = -ENOMEM;
3187         }
3188
3189         return ret;
3190 }
3191
3192 static inline int iommu_iova_cache_init(void)
3193 {
3194         int ret = 0;
3195
3196         iommu_iova_cache = kmem_cache_create("iommu_iova",
3197                                          sizeof(struct iova),
3198                                          0,
3199                                          SLAB_HWCACHE_ALIGN,
3200                                          NULL);
3201         if (!iommu_iova_cache) {
3202                 printk(KERN_ERR "Couldn't create iova cache\n");
3203                 ret = -ENOMEM;
3204         }
3205
3206         return ret;
3207 }
3208
3209 static int __init iommu_init_mempool(void)
3210 {
3211         int ret;
3212         ret = iommu_iova_cache_init();
3213         if (ret)
3214                 return ret;
3215
3216         ret = iommu_domain_cache_init();
3217         if (ret)
3218                 goto domain_error;
3219
3220         ret = iommu_devinfo_cache_init();
3221         if (!ret)
3222                 return ret;
3223
3224         kmem_cache_destroy(iommu_domain_cache);
3225 domain_error:
3226         kmem_cache_destroy(iommu_iova_cache);
3227
3228         return -ENOMEM;
3229 }
3230
3231 static void __init iommu_exit_mempool(void)
3232 {
3233         kmem_cache_destroy(iommu_devinfo_cache);
3234         kmem_cache_destroy(iommu_domain_cache);
3235         kmem_cache_destroy(iommu_iova_cache);
3236
3237 }
3238
3239 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3240 {
3241         struct dmar_drhd_unit *drhd;
3242         u32 vtbar;
3243         int rc;
3244
3245         /* We know that this device on this chipset has its own IOMMU.
3246          * If we find it under a different IOMMU, then the BIOS is lying
3247          * to us. Hope that the IOMMU for this device is actually
3248          * disabled, and it needs no translation...
3249          */
3250         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3251         if (rc) {
3252                 /* "can't" happen */
3253                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3254                 return;
3255         }
3256         vtbar &= 0xffff0000;
3257
3258         /* we know that the this iommu should be at offset 0xa000 from vtbar */
3259         drhd = dmar_find_matched_drhd_unit(pdev);
3260         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3261                             TAINT_FIRMWARE_WORKAROUND,
3262                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3263                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3264 }
3265 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3266
3267 static void __init init_no_remapping_devices(void)
3268 {
3269         struct dmar_drhd_unit *drhd;
3270
3271         for_each_drhd_unit(drhd) {
3272                 if (!drhd->include_all) {
3273                         int i;
3274                         for (i = 0; i < drhd->devices_cnt; i++)
3275                                 if (drhd->devices[i] != NULL)
3276                                         break;
3277                         /* ignore DMAR unit if no pci devices exist */
3278                         if (i == drhd->devices_cnt)
3279                                 drhd->ignored = 1;
3280                 }
3281         }
3282
3283         for_each_drhd_unit(drhd) {
3284                 int i;
3285                 if (drhd->ignored || drhd->include_all)
3286                         continue;
3287
3288                 for (i = 0; i < drhd->devices_cnt; i++)
3289                         if (drhd->devices[i] &&
3290                             !IS_GFX_DEVICE(drhd->devices[i]))
3291                                 break;
3292
3293                 if (i < drhd->devices_cnt)
3294                         continue;
3295
3296                 /* This IOMMU has *only* gfx devices. Either bypass it or
3297                    set the gfx_mapped flag, as appropriate */
3298                 if (dmar_map_gfx) {
3299                         intel_iommu_gfx_mapped = 1;
3300                 } else {
3301                         drhd->ignored = 1;
3302                         for (i = 0; i < drhd->devices_cnt; i++) {
3303                                 if (!drhd->devices[i])
3304                                         continue;
3305                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3306                         }
3307                 }
3308         }
3309 }
3310
3311 #ifdef CONFIG_SUSPEND
3312 static int init_iommu_hw(void)
3313 {
3314         struct dmar_drhd_unit *drhd;
3315         struct intel_iommu *iommu = NULL;
3316
3317         for_each_active_iommu(iommu, drhd)
3318                 if (iommu->qi)
3319                         dmar_reenable_qi(iommu);
3320
3321         for_each_iommu(iommu, drhd) {
3322                 if (drhd->ignored) {
3323                         /*
3324                          * we always have to disable PMRs or DMA may fail on
3325                          * this device
3326                          */
3327                         if (force_on)
3328                                 iommu_disable_protect_mem_regions(iommu);
3329                         continue;
3330                 }
3331         
3332                 iommu_flush_write_buffer(iommu);
3333
3334                 iommu_set_root_entry(iommu);
3335
3336                 iommu->flush.flush_context(iommu, 0, 0, 0,
3337                                            DMA_CCMD_GLOBAL_INVL);
3338                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3339                                          DMA_TLB_GLOBAL_FLUSH);
3340                 if (iommu_enable_translation(iommu))
3341                         return 1;
3342                 iommu_disable_protect_mem_regions(iommu);
3343         }
3344
3345         return 0;
3346 }
3347
3348 static void iommu_flush_all(void)
3349 {
3350         struct dmar_drhd_unit *drhd;
3351         struct intel_iommu *iommu;
3352
3353         for_each_active_iommu(iommu, drhd) {
3354                 iommu->flush.flush_context(iommu, 0, 0, 0,
3355                                            DMA_CCMD_GLOBAL_INVL);
3356                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3357                                          DMA_TLB_GLOBAL_FLUSH);
3358         }
3359 }
3360
3361 static int iommu_suspend(void)
3362 {
3363         struct dmar_drhd_unit *drhd;
3364         struct intel_iommu *iommu = NULL;
3365         unsigned long flag;
3366
3367         for_each_active_iommu(iommu, drhd) {
3368                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3369                                                  GFP_ATOMIC);
3370                 if (!iommu->iommu_state)
3371                         goto nomem;
3372         }
3373
3374         iommu_flush_all();
3375
3376         for_each_active_iommu(iommu, drhd) {
3377                 iommu_disable_translation(iommu);
3378
3379                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3380
3381                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3382                         readl(iommu->reg + DMAR_FECTL_REG);
3383                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3384                         readl(iommu->reg + DMAR_FEDATA_REG);
3385                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3386                         readl(iommu->reg + DMAR_FEADDR_REG);
3387                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3388                         readl(iommu->reg + DMAR_FEUADDR_REG);
3389
3390                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3391         }
3392         return 0;
3393
3394 nomem:
3395         for_each_active_iommu(iommu, drhd)
3396                 kfree(iommu->iommu_state);
3397
3398         return -ENOMEM;
3399 }
3400
3401 static void iommu_resume(void)
3402 {
3403         struct dmar_drhd_unit *drhd;
3404         struct intel_iommu *iommu = NULL;
3405         unsigned long flag;
3406
3407         if (init_iommu_hw()) {
3408                 if (force_on)
3409                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3410                 else
3411                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3412                 return;
3413         }
3414
3415         for_each_active_iommu(iommu, drhd) {
3416
3417                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3418
3419                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3420                         iommu->reg + DMAR_FECTL_REG);
3421                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3422                         iommu->reg + DMAR_FEDATA_REG);
3423                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3424                         iommu->reg + DMAR_FEADDR_REG);
3425                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3426                         iommu->reg + DMAR_FEUADDR_REG);
3427
3428                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3429         }
3430
3431         for_each_active_iommu(iommu, drhd)
3432                 kfree(iommu->iommu_state);
3433 }
3434
3435 static struct syscore_ops iommu_syscore_ops = {
3436         .resume         = iommu_resume,
3437         .suspend        = iommu_suspend,
3438 };
3439
3440 static void __init init_iommu_pm_ops(void)
3441 {
3442         register_syscore_ops(&iommu_syscore_ops);
3443 }
3444
3445 #else
3446 static inline void init_iommu_pm_ops(void) {}
3447 #endif  /* CONFIG_PM */
3448
3449 LIST_HEAD(dmar_rmrr_units);
3450
3451 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3452 {
3453         list_add(&rmrr->list, &dmar_rmrr_units);
3454 }
3455
3456
3457 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3458 {
3459         struct acpi_dmar_reserved_memory *rmrr;
3460         struct dmar_rmrr_unit *rmrru;
3461
3462         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3463         if (!rmrru)
3464                 return -ENOMEM;
3465
3466         rmrru->hdr = header;
3467         rmrr = (struct acpi_dmar_reserved_memory *)header;
3468         rmrru->base_address = rmrr->base_address;
3469         rmrru->end_address = rmrr->end_address;
3470
3471         dmar_register_rmrr_unit(rmrru);
3472         return 0;
3473 }
3474
3475 static int __init
3476 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3477 {
3478         struct acpi_dmar_reserved_memory *rmrr;
3479         int ret;
3480
3481         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3482         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3483                 ((void *)rmrr) + rmrr->header.length,
3484                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3485
3486         if (ret || (rmrru->devices_cnt == 0)) {
3487                 list_del(&rmrru->list);
3488                 kfree(rmrru);
3489         }
3490         return ret;
3491 }
3492
3493 static LIST_HEAD(dmar_atsr_units);
3494
3495 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3496 {
3497         struct acpi_dmar_atsr *atsr;
3498         struct dmar_atsr_unit *atsru;
3499
3500         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3501         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3502         if (!atsru)
3503                 return -ENOMEM;
3504
3505         atsru->hdr = hdr;
3506         atsru->include_all = atsr->flags & 0x1;
3507
3508         list_add(&atsru->list, &dmar_atsr_units);
3509
3510         return 0;
3511 }
3512
3513 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3514 {
3515         int rc;
3516         struct acpi_dmar_atsr *atsr;
3517
3518         if (atsru->include_all)
3519                 return 0;
3520
3521         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3522         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3523                                 (void *)atsr + atsr->header.length,
3524                                 &atsru->devices_cnt, &atsru->devices,
3525                                 atsr->segment);
3526         if (rc || !atsru->devices_cnt) {
3527                 list_del(&atsru->list);
3528                 kfree(atsru);
3529         }
3530
3531         return rc;
3532 }
3533
3534 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3535 {
3536         int i;
3537         struct pci_bus *bus;
3538         struct acpi_dmar_atsr *atsr;
3539         struct dmar_atsr_unit *atsru;
3540
3541         dev = pci_physfn(dev);
3542
3543         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3544                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3545                 if (atsr->segment == pci_domain_nr(dev->bus))
3546                         goto found;
3547         }
3548
3549         return 0;
3550
3551 found:
3552         for (bus = dev->bus; bus; bus = bus->parent) {
3553                 struct pci_dev *bridge = bus->self;
3554
3555                 if (!bridge || !pci_is_pcie(bridge) ||
3556                     bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3557                         return 0;
3558
3559                 if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3560                         for (i = 0; i < atsru->devices_cnt; i++)
3561                                 if (atsru->devices[i] == bridge)
3562                                         return 1;
3563                         break;
3564                 }
3565         }
3566
3567         if (atsru->include_all)
3568                 return 1;
3569
3570         return 0;
3571 }
3572
3573 int __init dmar_parse_rmrr_atsr_dev(void)
3574 {
3575         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3576         struct dmar_atsr_unit *atsr, *atsr_n;
3577         int ret = 0;
3578
3579         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3580                 ret = rmrr_parse_dev(rmrr);
3581                 if (ret)
3582                         return ret;
3583         }
3584
3585         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3586                 ret = atsr_parse_dev(atsr);
3587                 if (ret)
3588                         return ret;
3589         }
3590
3591         return ret;
3592 }
3593
3594 /*
3595  * Here we only respond to action of unbound device from driver.
3596  *
3597  * Added device is not attached to its DMAR domain here yet. That will happen
3598  * when mapping the device to iova.
3599  */
3600 static int device_notifier(struct notifier_block *nb,
3601                                   unsigned long action, void *data)
3602 {
3603         struct device *dev = data;
3604         struct pci_dev *pdev = to_pci_dev(dev);
3605         struct dmar_domain *domain;
3606
3607         if (iommu_no_mapping(dev))
3608                 return 0;
3609
3610         domain = find_domain(pdev);
3611         if (!domain)
3612                 return 0;
3613
3614         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3615                 domain_remove_one_dev_info(domain, pdev);
3616
3617                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3618                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3619                     list_empty(&domain->devices))
3620                         domain_exit(domain);
3621         }
3622
3623         return 0;
3624 }
3625
3626 static struct notifier_block device_nb = {
3627         .notifier_call = device_notifier,
3628 };
3629
3630 int __init intel_iommu_init(void)
3631 {
3632         int ret = 0;
3633
3634         /* VT-d is required for a TXT/tboot launch, so enforce that */
3635         force_on = tboot_force_iommu();
3636
3637         if (dmar_table_init()) {
3638                 if (force_on)
3639                         panic("tboot: Failed to initialize DMAR table\n");
3640                 return  -ENODEV;
3641         }
3642
3643         if (dmar_dev_scope_init() < 0) {
3644                 if (force_on)
3645                         panic("tboot: Failed to initialize DMAR device scope\n");
3646                 return  -ENODEV;
3647         }
3648
3649         if (no_iommu || dmar_disabled)
3650                 return -ENODEV;
3651
3652         if (iommu_init_mempool()) {
3653                 if (force_on)
3654                         panic("tboot: Failed to initialize iommu memory\n");
3655                 return  -ENODEV;
3656         }
3657
3658         if (list_empty(&dmar_rmrr_units))
3659                 printk(KERN_INFO "DMAR: No RMRR found\n");
3660
3661         if (list_empty(&dmar_atsr_units))
3662                 printk(KERN_INFO "DMAR: No ATSR found\n");
3663
3664         if (dmar_init_reserved_ranges()) {
3665                 if (force_on)
3666                         panic("tboot: Failed to reserve iommu ranges\n");
3667                 return  -ENODEV;
3668         }
3669
3670         init_no_remapping_devices();
3671
3672         ret = init_dmars();
3673         if (ret) {
3674                 if (force_on)
3675                         panic("tboot: Failed to initialize DMARs\n");
3676                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3677                 put_iova_domain(&reserved_iova_list);
3678                 iommu_exit_mempool();
3679                 return ret;
3680         }
3681         printk(KERN_INFO
3682         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3683
3684         init_timer(&unmap_timer);
3685 #ifdef CONFIG_SWIOTLB
3686         swiotlb = 0;
3687 #endif
3688         dma_ops = &intel_dma_ops;
3689
3690         init_iommu_pm_ops();
3691
3692         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3693
3694         bus_register_notifier(&pci_bus_type, &device_nb);
3695
3696         intel_iommu_enabled = 1;
3697
3698         return 0;
3699 }
3700
3701 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3702                                            struct pci_dev *pdev)
3703 {
3704         struct pci_dev *tmp, *parent;
3705
3706         if (!iommu || !pdev)
3707                 return;
3708
3709         /* dependent device detach */
3710         tmp = pci_find_upstream_pcie_bridge(pdev);
3711         /* Secondary interface's bus number and devfn 0 */
3712         if (tmp) {
3713                 parent = pdev->bus->self;
3714                 while (parent != tmp) {
3715                         iommu_detach_dev(iommu, parent->bus->number,
3716                                          parent->devfn);
3717                         parent = parent->bus->self;
3718                 }
3719                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3720                         iommu_detach_dev(iommu,
3721                                 tmp->subordinate->number, 0);
3722                 else /* this is a legacy PCI bridge */
3723                         iommu_detach_dev(iommu, tmp->bus->number,
3724                                          tmp->devfn);
3725         }
3726 }
3727
3728 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3729                                           struct pci_dev *pdev)
3730 {
3731         struct device_domain_info *info;
3732         struct intel_iommu *iommu;
3733         unsigned long flags;
3734         int found = 0;
3735         struct list_head *entry, *tmp;
3736
3737         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3738                                 pdev->devfn);
3739         if (!iommu)
3740                 return;
3741
3742         spin_lock_irqsave(&device_domain_lock, flags);
3743         list_for_each_safe(entry, tmp, &domain->devices) {
3744                 info = list_entry(entry, struct device_domain_info, link);
3745                 if (info->segment == pci_domain_nr(pdev->bus) &&
3746                     info->bus == pdev->bus->number &&
3747                     info->devfn == pdev->devfn) {
3748                         list_del(&info->link);
3749                         list_del(&info->global);
3750                         if (info->dev)
3751                                 info->dev->dev.archdata.iommu = NULL;
3752                         spin_unlock_irqrestore(&device_domain_lock, flags);
3753
3754                         iommu_disable_dev_iotlb(info);
3755                         iommu_detach_dev(iommu, info->bus, info->devfn);
3756                         iommu_detach_dependent_devices(iommu, pdev);
3757                         free_devinfo_mem(info);
3758
3759                         spin_lock_irqsave(&device_domain_lock, flags);
3760
3761                         if (found)
3762                                 break;
3763                         else
3764                                 continue;
3765                 }
3766
3767                 /* if there is no other devices under the same iommu
3768                  * owned by this domain, clear this iommu in iommu_bmp
3769                  * update iommu count and coherency
3770                  */
3771                 if (iommu == device_to_iommu(info->segment, info->bus,
3772                                             info->devfn))
3773                         found = 1;
3774         }
3775
3776         spin_unlock_irqrestore(&device_domain_lock, flags);
3777
3778         if (found == 0) {
3779                 unsigned long tmp_flags;
3780                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3781                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3782                 domain->iommu_count--;
3783                 domain_update_iommu_cap(domain);
3784                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3785
3786                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3787                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3788                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3789                         clear_bit(domain->id, iommu->domain_ids);
3790                         iommu->domains[domain->id] = NULL;
3791                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3792                 }
3793         }
3794 }
3795
3796 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3797 {
3798         struct device_domain_info *info;
3799         struct intel_iommu *iommu;
3800         unsigned long flags1, flags2;
3801
3802         spin_lock_irqsave(&device_domain_lock, flags1);
3803         while (!list_empty(&domain->devices)) {
3804                 info = list_entry(domain->devices.next,
3805                         struct device_domain_info, link);
3806                 list_del(&info->link);
3807                 list_del(&info->global);
3808                 if (info->dev)
3809                         info->dev->dev.archdata.iommu = NULL;
3810
3811                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3812
3813                 iommu_disable_dev_iotlb(info);
3814                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3815                 iommu_detach_dev(iommu, info->bus, info->devfn);
3816                 iommu_detach_dependent_devices(iommu, info->dev);
3817
3818                 /* clear this iommu in iommu_bmp, update iommu count
3819                  * and capabilities
3820                  */
3821                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3822                 if (test_and_clear_bit(iommu->seq_id,
3823                                        &domain->iommu_bmp)) {
3824                         domain->iommu_count--;
3825                         domain_update_iommu_cap(domain);
3826                 }
3827                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3828
3829                 free_devinfo_mem(info);
3830                 spin_lock_irqsave(&device_domain_lock, flags1);
3831         }
3832         spin_unlock_irqrestore(&device_domain_lock, flags1);
3833 }
3834
3835 /* domain id for virtual machine, it won't be set in context */
3836 static unsigned long vm_domid;
3837
3838 static struct dmar_domain *iommu_alloc_vm_domain(void)
3839 {
3840         struct dmar_domain *domain;
3841
3842         domain = alloc_domain_mem();
3843         if (!domain)
3844                 return NULL;
3845
3846         domain->id = vm_domid++;
3847         domain->nid = -1;
3848         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3849         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3850
3851         return domain;
3852 }
3853
3854 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3855 {
3856         int adjust_width;
3857
3858         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3859         spin_lock_init(&domain->iommu_lock);
3860
3861         domain_reserve_special_ranges(domain);
3862
3863         /* calculate AGAW */
3864         domain->gaw = guest_width;
3865         adjust_width = guestwidth_to_adjustwidth(guest_width);
3866         domain->agaw = width_to_agaw(adjust_width);
3867
3868         INIT_LIST_HEAD(&domain->devices);
3869
3870         domain->iommu_count = 0;
3871         domain->iommu_coherency = 0;
3872         domain->iommu_snooping = 0;
3873         domain->iommu_superpage = 0;
3874         domain->max_addr = 0;
3875         domain->nid = -1;
3876
3877         /* always allocate the top pgd */
3878         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3879         if (!domain->pgd)
3880                 return -ENOMEM;
3881         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3882         return 0;
3883 }
3884
3885 static void iommu_free_vm_domain(struct dmar_domain *domain)
3886 {
3887         unsigned long flags;
3888         struct dmar_drhd_unit *drhd;
3889         struct intel_iommu *iommu;
3890         unsigned long i;
3891         unsigned long ndomains;
3892
3893         for_each_drhd_unit(drhd) {
3894                 if (drhd->ignored)
3895                         continue;
3896                 iommu = drhd->iommu;
3897
3898                 ndomains = cap_ndoms(iommu->cap);
3899                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3900                         if (iommu->domains[i] == domain) {
3901                                 spin_lock_irqsave(&iommu->lock, flags);
3902                                 clear_bit(i, iommu->domain_ids);
3903                                 iommu->domains[i] = NULL;
3904                                 spin_unlock_irqrestore(&iommu->lock, flags);
3905                                 break;
3906                         }
3907                 }
3908         }
3909 }
3910
3911 static void vm_domain_exit(struct dmar_domain *domain)
3912 {
3913         /* Domain 0 is reserved, so dont process it */
3914         if (!domain)
3915                 return;
3916
3917         vm_domain_remove_all_dev_info(domain);
3918         /* destroy iovas */
3919         put_iova_domain(&domain->iovad);
3920
3921         /* clear ptes */
3922         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3923
3924         /* free page tables */
3925         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3926
3927         iommu_free_vm_domain(domain);
3928         free_domain_mem(domain);
3929 }
3930
3931 static int intel_iommu_domain_init(struct iommu_domain *domain)
3932 {
3933         struct dmar_domain *dmar_domain;
3934
3935         dmar_domain = iommu_alloc_vm_domain();
3936         if (!dmar_domain) {
3937                 printk(KERN_ERR
3938                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3939                 return -ENOMEM;
3940         }
3941         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3942                 printk(KERN_ERR
3943                         "intel_iommu_domain_init() failed\n");
3944                 vm_domain_exit(dmar_domain);
3945                 return -ENOMEM;
3946         }
3947         domain_update_iommu_cap(dmar_domain);
3948         domain->priv = dmar_domain;
3949
3950         return 0;
3951 }
3952
3953 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3954 {
3955         struct dmar_domain *dmar_domain = domain->priv;
3956
3957         domain->priv = NULL;
3958         vm_domain_exit(dmar_domain);
3959 }
3960
3961 static int intel_iommu_attach_device(struct iommu_domain *domain,
3962                                      struct device *dev)
3963 {
3964         struct dmar_domain *dmar_domain = domain->priv;
3965         struct pci_dev *pdev = to_pci_dev(dev);
3966         struct intel_iommu *iommu;
3967         int addr_width;
3968
3969         /* normally pdev is not mapped */
3970         if (unlikely(domain_context_mapped(pdev))) {
3971                 struct dmar_domain *old_domain;
3972
3973                 old_domain = find_domain(pdev);
3974                 if (old_domain) {
3975                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3976                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3977                                 domain_remove_one_dev_info(old_domain, pdev);
3978                         else
3979                                 domain_remove_dev_info(old_domain);
3980                 }
3981         }
3982
3983         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3984                                 pdev->devfn);
3985         if (!iommu)
3986                 return -ENODEV;
3987
3988         /* check if this iommu agaw is sufficient for max mapped address */
3989         addr_width = agaw_to_width(iommu->agaw);
3990         if (addr_width > cap_mgaw(iommu->cap))
3991                 addr_width = cap_mgaw(iommu->cap);
3992
3993         if (dmar_domain->max_addr > (1LL << addr_width)) {
3994                 printk(KERN_ERR "%s: iommu width (%d) is not "
3995                        "sufficient for the mapped address (%llx)\n",
3996                        __func__, addr_width, dmar_domain->max_addr);
3997                 return -EFAULT;
3998         }
3999         dmar_domain->gaw = addr_width;
4000
4001         /*
4002          * Knock out extra levels of page tables if necessary
4003          */
4004         while (iommu->agaw < dmar_domain->agaw) {
4005                 struct dma_pte *pte;
4006
4007                 pte = dmar_domain->pgd;
4008                 if (dma_pte_present(pte)) {
4009                         dmar_domain->pgd = (struct dma_pte *)
4010                                 phys_to_virt(dma_pte_addr(pte));
4011                         free_pgtable_page(pte);
4012                 }
4013                 dmar_domain->agaw--;
4014         }
4015
4016         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4017 }
4018
4019 static void intel_iommu_detach_device(struct iommu_domain *domain,
4020                                       struct device *dev)
4021 {
4022         struct dmar_domain *dmar_domain = domain->priv;
4023         struct pci_dev *pdev = to_pci_dev(dev);
4024
4025         domain_remove_one_dev_info(dmar_domain, pdev);
4026 }
4027
4028 static int intel_iommu_map(struct iommu_domain *domain,
4029                            unsigned long iova, phys_addr_t hpa,
4030                            int gfp_order, int iommu_prot)
4031 {
4032         struct dmar_domain *dmar_domain = domain->priv;
4033         u64 max_addr;
4034         int prot = 0;
4035         size_t size;
4036         int ret;
4037
4038         if (iommu_prot & IOMMU_READ)
4039                 prot |= DMA_PTE_READ;
4040         if (iommu_prot & IOMMU_WRITE)
4041                 prot |= DMA_PTE_WRITE;
4042         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4043                 prot |= DMA_PTE_SNP;
4044
4045         size     = PAGE_SIZE << gfp_order;
4046         max_addr = iova + size;
4047         if (dmar_domain->max_addr < max_addr) {
4048                 u64 end;
4049
4050                 /* check if minimum agaw is sufficient for mapped address */
4051                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4052                 if (end < max_addr) {
4053                         printk(KERN_ERR "%s: iommu width (%d) is not "
4054                                "sufficient for the mapped address (%llx)\n",
4055                                __func__, dmar_domain->gaw, max_addr);
4056                         return -EFAULT;
4057                 }
4058                 dmar_domain->max_addr = max_addr;
4059         }
4060         /* Round up size to next multiple of PAGE_SIZE, if it and
4061            the low bits of hpa would take us onto the next page */
4062         size = aligned_nrpages(hpa, size);
4063         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4064                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4065         return ret;
4066 }
4067
4068 static int intel_iommu_unmap(struct iommu_domain *domain,
4069                              unsigned long iova, int gfp_order)
4070 {
4071         struct dmar_domain *dmar_domain = domain->priv;
4072         size_t size = PAGE_SIZE << gfp_order;
4073         int order, iommu_id;
4074
4075         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4076                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4077
4078         if (dmar_domain->max_addr == iova + size)
4079                 dmar_domain->max_addr = iova;
4080
4081         for_each_set_bit(iommu_id, &dmar_domain->iommu_bmp, g_num_of_iommus) {
4082                 struct intel_iommu *iommu = g_iommus[iommu_id];
4083                 int num, ndomains;
4084
4085                 /*
4086                  * find bit position of dmar_domain
4087                  */
4088                 ndomains = cap_ndoms(iommu->cap);
4089                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4090                         if (iommu->domains[num] == dmar_domain)
4091                                 iommu_flush_iotlb_psi(iommu, num,
4092                                                       iova >> VTD_PAGE_SHIFT,
4093                                                       1 << order, 0);
4094                 }
4095         }
4096
4097         return order;
4098 }
4099
4100 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4101                                             unsigned long iova)
4102 {
4103         struct dmar_domain *dmar_domain = domain->priv;
4104         struct dma_pte *pte;
4105         u64 phys = 0;
4106
4107         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4108         if (pte)
4109                 phys = dma_pte_addr(pte);
4110
4111         return phys;
4112 }
4113
4114 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4115                                       unsigned long cap)
4116 {
4117         struct dmar_domain *dmar_domain = domain->priv;
4118
4119         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4120                 return dmar_domain->iommu_snooping;
4121         if (cap == IOMMU_CAP_INTR_REMAP)
4122                 return intr_remapping_enabled;
4123
4124         return 0;
4125 }
4126
4127 static struct iommu_ops intel_iommu_ops = {
4128         .domain_init    = intel_iommu_domain_init,
4129         .domain_destroy = intel_iommu_domain_destroy,
4130         .attach_dev     = intel_iommu_attach_device,
4131         .detach_dev     = intel_iommu_detach_device,
4132         .map            = intel_iommu_map,
4133         .unmap          = intel_iommu_unmap,
4134         .iova_to_phys   = intel_iommu_iova_to_phys,
4135         .domain_has_cap = intel_iommu_domain_has_cap,
4136 };
4137
4138 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4139 {
4140         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4141         printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4142         dmar_map_gfx = 0;
4143 }
4144
4145 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4146 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4147 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4148 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4149 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4150 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4151 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4152
4153 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4154 {
4155         /*
4156          * Mobile 4 Series Chipset neglects to set RWBF capability,
4157          * but needs it. Same seems to hold for the desktop versions.
4158          */
4159         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4160         rwbf_quirk = 1;
4161 }
4162
4163 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4164 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4165 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4166 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4167 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4168 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4169 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4170
4171 #define GGC 0x52
4172 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4173 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4174 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4175 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4176 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4177 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4178 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4179 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4180
4181 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4182 {
4183         unsigned short ggc;
4184
4185         if (pci_read_config_word(dev, GGC, &ggc))
4186                 return;
4187
4188         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4189                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4190                 dmar_map_gfx = 0;
4191         } else if (dmar_map_gfx) {
4192                 /* we have to ensure the gfx device is idle before we flush */
4193                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4194                 intel_iommu_strict = 1;
4195        }
4196 }
4197 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4198 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4199 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4200 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4201
4202 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4203    ISOCH DMAR unit for the Azalia sound device, but not give it any
4204    TLB entries, which causes it to deadlock. Check for that.  We do
4205    this in a function called from init_dmars(), instead of in a PCI
4206    quirk, because we don't want to print the obnoxious "BIOS broken"
4207    message if VT-d is actually disabled.
4208 */
4209 static void __init check_tylersburg_isoch(void)
4210 {
4211         struct pci_dev *pdev;
4212         uint32_t vtisochctrl;
4213
4214         /* If there's no Azalia in the system anyway, forget it. */
4215         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4216         if (!pdev)
4217                 return;
4218         pci_dev_put(pdev);
4219
4220         /* System Management Registers. Might be hidden, in which case
4221            we can't do the sanity check. But that's OK, because the
4222            known-broken BIOSes _don't_ actually hide it, so far. */
4223         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4224         if (!pdev)
4225                 return;
4226
4227         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4228                 pci_dev_put(pdev);
4229                 return;
4230         }
4231
4232         pci_dev_put(pdev);
4233
4234         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4235         if (vtisochctrl & 1)
4236                 return;
4237
4238         /* Drop all bits other than the number of TLB entries */
4239         vtisochctrl &= 0x1c;
4240
4241         /* If we have the recommended number of TLB entries (16), fine. */
4242         if (vtisochctrl == 0x10)
4243                 return;
4244
4245         /* Zero TLB entries? You get to ride the short bus to school. */
4246         if (!vtisochctrl) {
4247                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4248                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4249                      dmi_get_system_info(DMI_BIOS_VENDOR),
4250                      dmi_get_system_info(DMI_BIOS_VERSION),
4251                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4252                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4253                 return;
4254         }
4255         
4256         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4257                vtisochctrl);
4258 }