1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/syscore_ops.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <linux/pci-ats.h>
43 #include <asm/cacheflush.h>
44 #include <asm/iommu.h>
45
46 #define ROOT_SIZE               VTD_PAGE_SIZE
47 #define CONTEXT_SIZE            VTD_PAGE_SIZE
48
49 #define IS_BRIDGE_HOST_DEVICE(pdev) \
50                             ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
51 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
52 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
53 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
54
55 #define IOAPIC_RANGE_START      (0xfee00000)
56 #define IOAPIC_RANGE_END        (0xfeefffff)
57 #define IOVA_START_ADDR         (0x1000)
58
59 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
60
61 #define MAX_AGAW_WIDTH 64
62
63 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
64 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
65
66 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
67    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
68 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
69                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
70 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
71
72 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
73 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
74 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
75
76 /* page table handling */
77 #define LEVEL_STRIDE            (9)
78 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
79
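/*
 * Page-table geometry helpers.  Each page-table level decodes LEVEL_STRIDE
 * (9) bits of the DMA pfn, so an adjusted guest address width (agaw) of 0
 * corresponds to a 2-level, 30-bit table, agaw 1 to a 3-level, 39-bit
 * table, agaw 2 to a 4-level, 48-bit table, and so on.
 */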
80 static inline int agaw_to_level(int agaw)
81 {
82         return agaw + 2;
83 }
84
85 static inline int agaw_to_width(int agaw)
86 {
87         return 30 + agaw * LEVEL_STRIDE;
88 }
89
90 static inline int width_to_agaw(int width)
91 {
92         return (width - 30) / LEVEL_STRIDE;
93 }
94
95 static inline unsigned int level_to_offset_bits(int level)
96 {
97         return (level - 1) * LEVEL_STRIDE;
98 }
99
100 static inline int pfn_level_offset(unsigned long pfn, int level)
101 {
102         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
103 }
104
105 static inline unsigned long level_mask(int level)
106 {
107         return -1UL << level_to_offset_bits(level);
108 }
109
110 static inline unsigned long level_size(int level)
111 {
112         return 1UL << level_to_offset_bits(level);
113 }
114
115 static inline unsigned long align_to_level(unsigned long pfn, int level)
116 {
117         return (pfn + level_size(level) - 1) & level_mask(level);
118 }
119
120 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
121 {
122         return  1 << ((lvl - 1) * LEVEL_STRIDE);
123 }
124
125 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
126    are never going to work. */
127 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
128 {
129         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
130 }
131
132 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
133 {
134         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
135 }
136 static inline unsigned long page_to_dma_pfn(struct page *pg)
137 {
138         return mm_to_dma_pfn(page_to_pfn(pg));
139 }
140 static inline unsigned long virt_to_dma_pfn(void *p)
141 {
142         return page_to_dma_pfn(virt_to_page(p));
143 }
144
145 /* global iommu list, set NULL for ignored DMAR units */
146 static struct intel_iommu **g_iommus;
147
148 static void __init check_tylersburg_isoch(void);
149 static int rwbf_quirk;
150
151 /*
152  * set to 1 to panic the kernel if VT-d cannot be enabled successfully
153  * (used when the kernel is launched with TXT)
154  */
155 static int force_on = 0;
156
157 /*
158  * 0: Present
159  * 1-11: Reserved
160  * 12-63: Context Ptr (12 - (haw-1))
161  * 64-127: Reserved
162  */
163 struct root_entry {
164         u64     val;
165         u64     rsvd1;
166 };
167 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
168 static inline bool root_present(struct root_entry *root)
169 {
170         return (root->val & 1);
171 }
172 static inline void set_root_present(struct root_entry *root)
173 {
174         root->val |= 1;
175 }
176 static inline void set_root_value(struct root_entry *root, unsigned long value)
177 {
178         root->val |= value & VTD_PAGE_MASK;
179 }
180
181 static inline struct context_entry *
182 get_context_addr_from_root(struct root_entry *root)
183 {
184         return (struct context_entry *)
185                 (root_present(root)?phys_to_virt(
186                 root->val & VTD_PAGE_MASK) :
187                 NULL);
188 }
189
190 /*
191  * low 64 bits:
192  * 0: present
193  * 1: fault processing disable
194  * 2-3: translation type
195  * 12-63: address space root
196  * high 64 bits:
197  * 0-2: address width
198  * 3-6: avail
199  * 8-23: domain id
200  */
201 struct context_entry {
202         u64 lo;
203         u64 hi;
204 };
205
206 static inline bool context_present(struct context_entry *context)
207 {
208         return (context->lo & 1);
209 }
210 static inline void context_set_present(struct context_entry *context)
211 {
212         context->lo |= 1;
213 }
214
215 static inline void context_set_fault_enable(struct context_entry *context)
216 {
217         context->lo &= (((u64)-1) << 2) | 1;
218 }
219
220 static inline void context_set_translation_type(struct context_entry *context,
221                                                 unsigned long value)
222 {
223         context->lo &= (((u64)-1) << 4) | 3;
224         context->lo |= (value & 3) << 2;
225 }
226
227 static inline void context_set_address_root(struct context_entry *context,
228                                             unsigned long value)
229 {
230         context->lo |= value & VTD_PAGE_MASK;
231 }
232
233 static inline void context_set_address_width(struct context_entry *context,
234                                              unsigned long value)
235 {
236         context->hi |= value & 7;
237 }
238
239 static inline void context_set_domain_id(struct context_entry *context,
240                                          unsigned long value)
241 {
242         context->hi |= (value & ((1 << 16) - 1)) << 8;
243 }
244
245 static inline void context_clear_entry(struct context_entry *context)
246 {
247         context->lo = 0;
248         context->hi = 0;
249 }
250
251 /*
252  * 0: readable
253  * 1: writable
254  * 2-6: reserved
255  * 7: super page
256  * 8-10: available
257  * 11: snoop behavior
258  * 12-63: Host physcial address
259  */
260 struct dma_pte {
261         u64 val;
262 };
263
264 static inline void dma_clear_pte(struct dma_pte *pte)
265 {
266         pte->val = 0;
267 }
268
269 static inline void dma_set_pte_readable(struct dma_pte *pte)
270 {
271         pte->val |= DMA_PTE_READ;
272 }
273
274 static inline void dma_set_pte_writable(struct dma_pte *pte)
275 {
276         pte->val |= DMA_PTE_WRITE;
277 }
278
279 static inline void dma_set_pte_snp(struct dma_pte *pte)
280 {
281         pte->val |= DMA_PTE_SNP;
282 }
283
284 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
285 {
286         pte->val = (pte->val & ~3) | (prot & 3);
287 }
288
289 static inline u64 dma_pte_addr(struct dma_pte *pte)
290 {
291 #ifdef CONFIG_64BIT
292         return pte->val & VTD_PAGE_MASK;
293 #else
294         /* Must have a full atomic 64-bit read */
295         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
296 #endif
297 }
298
299 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
300 {
301         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
302 }
303
304 static inline bool dma_pte_present(struct dma_pte *pte)
305 {
306         return (pte->val & 3) != 0;
307 }
308
309 static inline bool dma_pte_superpage(struct dma_pte *pte)
310 {
311         return (pte->val & (1 << 7));
312 }
313
314 static inline int first_pte_in_page(struct dma_pte *pte)
315 {
316         return !((unsigned long)pte & ~VTD_PAGE_MASK);
317 }
318
319 /*
320  * This domain is a static identity mapping domain.
321  *      1. This domain creates a static 1:1 mapping of all usable memory.
322  *      2. It maps to each iommu if successful.
323  *      3. Each iommu maps to this domain if successful.
324  */
325 static struct dmar_domain *si_domain;
326 static int hw_pass_through = 1;
327
328 /* devices under the same p2p bridge are owned in one domain */
329 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
330
331 /* domain represents a virtual machine; more than one device
332  * across iommus may be owned by one domain, e.g. a kvm guest.
333  */
334 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
335
336 /* si_domain contains multiple devices */
337 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
338
339 struct dmar_domain {
340         int     id;                     /* domain id */
341         int     nid;                    /* node id */
342         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
343
344         struct list_head devices;       /* all devices' list */
345         struct iova_domain iovad;       /* iova's that belong to this domain */
346
347         struct dma_pte  *pgd;           /* virtual address */
348         int             gaw;            /* max guest address width */
349
350         /* adjusted guest address width, 0 is level 2 30-bit */
351         int             agaw;
352
353         int             flags;          /* flags to find out type of domain */
354
355         int             iommu_coherency;/* indicate coherency of iommu access */
356         int             iommu_snooping; /* indicate snooping control feature*/
357         int             iommu_count;    /* reference count of iommu */
358         int             iommu_superpage;/* Level of superpages supported:
359                                            0 == 4KiB (no superpages), 1 == 2MiB,
360                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
361         spinlock_t      iommu_lock;     /* protect iommu set in domain */
362         u64             max_addr;       /* maximum mapped address */
363 };
364
365 /* PCI domain-device relationship */
366 struct device_domain_info {
367         struct list_head link;  /* link to domain siblings */
368         struct list_head global; /* link to global list */
369         int segment;            /* PCI domain */
370         u8 bus;                 /* PCI bus number */
371         u8 devfn;               /* PCI devfn number */
372         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
373         struct intel_iommu *iommu; /* IOMMU used by this device */
374         struct dmar_domain *domain; /* pointer to domain */
375 };
376
377 static void flush_unmaps_timeout(unsigned long data);
378
379 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
380
381 #define HIGH_WATER_MARK 250
382 struct deferred_flush_tables {
383         int next;
384         struct iova *iova[HIGH_WATER_MARK];
385         struct dmar_domain *domain[HIGH_WATER_MARK];
386 };
387
388 static struct deferred_flush_tables *deferred_flush;
389
390 /* number of IOMMUs in the system; used to size the g_iommus array */
391 static int g_num_of_iommus;
392
393 static DEFINE_SPINLOCK(async_umap_flush_lock);
394 static LIST_HEAD(unmaps_to_do);
395
396 static int timer_on;
397 static long list_size;
398
399 static void domain_remove_dev_info(struct dmar_domain *domain);
400
401 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
402 int dmar_disabled = 0;
403 #else
404 int dmar_disabled = 1;
405 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
406
407 static int dmar_map_gfx = 1;
408 static int dmar_forcedac;
409 static int intel_iommu_strict;
410 static int intel_iommu_superpage = 1;
411
412 int intel_iommu_gfx_mapped;
413 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
414
415 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
416 static DEFINE_SPINLOCK(device_domain_lock);
417 static LIST_HEAD(device_domain_list);
418
419 static struct iommu_ops intel_iommu_ops;
420
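/*
 * Parse the "intel_iommu=" boot parameter.  Options are comma separated,
 * so, for example, booting with "intel_iommu=on,strict" enables the IOMMU
 * and disables batched IOTLB flushing.
 */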
421 static int __init intel_iommu_setup(char *str)
422 {
423         if (!str)
424                 return -EINVAL;
425         while (*str) {
426                 if (!strncmp(str, "on", 2)) {
427                         dmar_disabled = 0;
428                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
429                 } else if (!strncmp(str, "off", 3)) {
430                         dmar_disabled = 1;
431                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
432                 } else if (!strncmp(str, "igfx_off", 8)) {
433                         dmar_map_gfx = 0;
434                         printk(KERN_INFO
435                                 "Intel-IOMMU: disable GFX device mapping\n");
436                 } else if (!strncmp(str, "forcedac", 8)) {
437                         printk(KERN_INFO
438                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
439                         dmar_forcedac = 1;
440                 } else if (!strncmp(str, "strict", 6)) {
441                         printk(KERN_INFO
442                                 "Intel-IOMMU: disable batched IOTLB flush\n");
443                         intel_iommu_strict = 1;
444                 } else if (!strncmp(str, "sp_off", 6)) {
445                         printk(KERN_INFO
446                                 "Intel-IOMMU: disable supported super page\n");
447                         intel_iommu_superpage = 0;
448                 }
449
450                 str += strcspn(str, ",");
451                 while (*str == ',')
452                         str++;
453         }
454         return 0;
455 }
456 __setup("intel_iommu=", intel_iommu_setup);
457
458 static struct kmem_cache *iommu_domain_cache;
459 static struct kmem_cache *iommu_devinfo_cache;
460 static struct kmem_cache *iommu_iova_cache;
461
462 static inline void *alloc_pgtable_page(int node)
463 {
464         struct page *page;
465         void *vaddr = NULL;
466
467         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
468         if (page)
469                 vaddr = page_address(page);
470         return vaddr;
471 }
472
473 static inline void free_pgtable_page(void *vaddr)
474 {
475         free_page((unsigned long)vaddr);
476 }
477
478 static inline void *alloc_domain_mem(void)
479 {
480         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
481 }
482
483 static void free_domain_mem(void *vaddr)
484 {
485         kmem_cache_free(iommu_domain_cache, vaddr);
486 }
487
488 static inline void * alloc_devinfo_mem(void)
489 {
490         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
491 }
492
493 static inline void free_devinfo_mem(void *vaddr)
494 {
495         kmem_cache_free(iommu_devinfo_cache, vaddr);
496 }
497
498 struct iova *alloc_iova_mem(void)
499 {
500         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
501 }
502
503 void free_iova_mem(struct iova *iova)
504 {
505         kmem_cache_free(iommu_iova_cache, iova);
506 }
507
508
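/*
 * Return the largest adjusted guest address width (agaw) that does not
 * exceed max_gaw and is advertised in this IOMMU's SAGAW capability
 * field, or -1 if no supported agaw fits.
 */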
509 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
510 {
511         unsigned long sagaw;
512         int agaw = -1;
513
514         sagaw = cap_sagaw(iommu->cap);
515         for (agaw = width_to_agaw(max_gaw);
516              agaw >= 0; agaw--) {
517                 if (test_bit(agaw, &sagaw))
518                         break;
519         }
520
521         return agaw;
522 }
523
524 /*
525  * Calculate max SAGAW for each iommu.
526  */
527 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
528 {
529         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
530 }
531
532 /*
533  * calculate agaw for each iommu.
534  * "SAGAW" may be different across iommus: use a default agaw, and
535  * fall back to a smaller supported agaw for iommus that don't support the default.
536  */
537 int iommu_calculate_agaw(struct intel_iommu *iommu)
538 {
539         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
540 }
541
542 /* This function only returns a single iommu in a domain */
543 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
544 {
545         int iommu_id;
546
547         /* si_domain and vm domain should not get here. */
548         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
549         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
550
551         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
552         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
553                 return NULL;
554
555         return g_iommus[iommu_id];
556 }
557
558 static void domain_update_iommu_coherency(struct dmar_domain *domain)
559 {
560         int i;
561
562         domain->iommu_coherency = 1;
563
564         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
565                 if (!ecap_coherent(g_iommus[i]->ecap)) {
566                         domain->iommu_coherency = 0;
567                         break;
568                 }
569         }
570 }
571
572 static void domain_update_iommu_snooping(struct dmar_domain *domain)
573 {
574         int i;
575
576         domain->iommu_snooping = 1;
577
578         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
579                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
580                         domain->iommu_snooping = 0;
581                         break;
582                 }
583         }
584 }
585
586 static void domain_update_iommu_superpage(struct dmar_domain *domain)
587 {
588         struct dmar_drhd_unit *drhd;
589         struct intel_iommu *iommu = NULL;
590         int mask = 0xf;
591
592         if (!intel_iommu_superpage) {
593                 domain->iommu_superpage = 0;
594                 return;
595         }
596
597         /* set iommu_superpage to the smallest common denominator */
598         for_each_active_iommu(iommu, drhd) {
599                 mask &= cap_super_page_val(iommu->cap);
600                 if (!mask) {
601                         break;
602                 }
603         }
604         domain->iommu_superpage = fls(mask);
605 }
606
607 /* Some capabilities may be different across iommus */
608 static void domain_update_iommu_cap(struct dmar_domain *domain)
609 {
610         domain_update_iommu_coherency(domain);
611         domain_update_iommu_snooping(domain);
612         domain_update_iommu_superpage(domain);
613 }
614
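/*
 * Find the IOMMU that translates the PCI device segment:bus.devfn by
 * walking the DRHD units: an exact bus/devfn match wins, a bridge matches
 * if 'bus' falls within its subordinate bus range, and the catch-all
 * (include_all) unit covers anything left over.
 */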
615 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
616 {
617         struct dmar_drhd_unit *drhd = NULL;
618         int i;
619
620         for_each_drhd_unit(drhd) {
621                 if (drhd->ignored)
622                         continue;
623                 if (segment != drhd->segment)
624                         continue;
625
626                 for (i = 0; i < drhd->devices_cnt; i++) {
627                         if (drhd->devices[i] &&
628                             drhd->devices[i]->bus->number == bus &&
629                             drhd->devices[i]->devfn == devfn)
630                                 return drhd->iommu;
631                         if (drhd->devices[i] &&
632                             drhd->devices[i]->subordinate &&
633                             drhd->devices[i]->subordinate->number <= bus &&
634                             drhd->devices[i]->subordinate->subordinate >= bus)
635                                 return drhd->iommu;
636                 }
637
638                 if (drhd->include_all)
639                         return drhd->iommu;
640         }
641
642         return NULL;
643 }
644
645 static void domain_flush_cache(struct dmar_domain *domain,
646                                void *addr, int size)
647 {
648         if (!domain->iommu_coherency)
649                 clflush_cache_range(addr, size);
650 }
651
652 /* Gets context entry for a given bus and devfn */
653 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
654                 u8 bus, u8 devfn)
655 {
656         struct root_entry *root;
657         struct context_entry *context;
658         unsigned long phy_addr;
659         unsigned long flags;
660
661         spin_lock_irqsave(&iommu->lock, flags);
662         root = &iommu->root_entry[bus];
663         context = get_context_addr_from_root(root);
664         if (!context) {
665                 context = (struct context_entry *)
666                                 alloc_pgtable_page(iommu->node);
667                 if (!context) {
668                         spin_unlock_irqrestore(&iommu->lock, flags);
669                         return NULL;
670                 }
671                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
672                 phy_addr = virt_to_phys((void *)context);
673                 set_root_value(root, phy_addr);
674                 set_root_present(root);
675                 __iommu_flush_cache(iommu, root, sizeof(*root));
676         }
677         spin_unlock_irqrestore(&iommu->lock, flags);
678         return &context[devfn];
679 }
680
681 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
682 {
683         struct root_entry *root;
684         struct context_entry *context;
685         int ret;
686         unsigned long flags;
687
688         spin_lock_irqsave(&iommu->lock, flags);
689         root = &iommu->root_entry[bus];
690         context = get_context_addr_from_root(root);
691         if (!context) {
692                 ret = 0;
693                 goto out;
694         }
695         ret = context_present(&context[devfn]);
696 out:
697         spin_unlock_irqrestore(&iommu->lock, flags);
698         return ret;
699 }
700
701 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
702 {
703         struct root_entry *root;
704         struct context_entry *context;
705         unsigned long flags;
706
707         spin_lock_irqsave(&iommu->lock, flags);
708         root = &iommu->root_entry[bus];
709         context = get_context_addr_from_root(root);
710         if (context) {
711                 context_clear_entry(&context[devfn]);
712                 __iommu_flush_cache(iommu, &context[devfn], \
713                         sizeof(*context));
714         }
715         spin_unlock_irqrestore(&iommu->lock, flags);
716 }
717
718 static void free_context_table(struct intel_iommu *iommu)
719 {
720         struct root_entry *root;
721         int i;
722         unsigned long flags;
723         struct context_entry *context;
724
725         spin_lock_irqsave(&iommu->lock, flags);
726         if (!iommu->root_entry) {
727                 goto out;
728         }
729         for (i = 0; i < ROOT_ENTRY_NR; i++) {
730                 root = &iommu->root_entry[i];
731                 context = get_context_addr_from_root(root);
732                 if (context)
733                         free_pgtable_page(context);
734         }
735         free_pgtable_page(iommu->root_entry);
736         iommu->root_entry = NULL;
737 out:
738         spin_unlock_irqrestore(&iommu->lock, flags);
739 }
740
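/*
 * Walk the page table down to 'target_level' for the given DMA pfn,
 * allocating missing intermediate tables along the way (a target_level of
 * 0 means "stop at the lowest present level or a superpage").  New tables
 * are published with cmpxchg64() so a racing walker's page is freed
 * instead of leaked.
 */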
741 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
742                                       unsigned long pfn, int target_level)
743 {
744         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
745         struct dma_pte *parent, *pte = NULL;
746         int level = agaw_to_level(domain->agaw);
747         int offset;
748
749         BUG_ON(!domain->pgd);
750         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
751         parent = domain->pgd;
752
753         while (level > 0) {
754                 void *tmp_page;
755
756                 offset = pfn_level_offset(pfn, level);
757                 pte = &parent[offset];
758                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
759                         break;
760                 if (level == target_level)
761                         break;
762
763                 if (!dma_pte_present(pte)) {
764                         uint64_t pteval;
765
766                         tmp_page = alloc_pgtable_page(domain->nid);
767
768                         if (!tmp_page)
769                                 return NULL;
770
771                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
772                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
773                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
774                                 /* Someone else set it while we were thinking; use theirs. */
775                                 free_pgtable_page(tmp_page);
776                         } else {
777                                 dma_pte_addr(pte);
778                                 domain_flush_cache(domain, pte, sizeof(*pte));
779                         }
780                 }
781                 parent = phys_to_virt(dma_pte_addr(pte));
782                 level--;
783         }
784
785         return pte;
786 }
787
788
789 /* return the pte for an address at a specific level */
790 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
791                                          unsigned long pfn,
792                                          int level, int *large_page)
793 {
794         struct dma_pte *parent, *pte = NULL;
795         int total = agaw_to_level(domain->agaw);
796         int offset;
797
798         parent = domain->pgd;
799         while (level <= total) {
800                 offset = pfn_level_offset(pfn, total);
801                 pte = &parent[offset];
802                 if (level == total)
803                         return pte;
804
805                 if (!dma_pte_present(pte)) {
806                         *large_page = total;
807                         break;
808                 }
809
810                 if (pte->val & DMA_PTE_LARGE_PAGE) {
811                         *large_page = total;
812                         return pte;
813                 }
814
815                 parent = phys_to_virt(dma_pte_addr(pte));
816                 total--;
817         }
818         return NULL;
819 }
820
821 /* clear last level pte, a tlb flush should follow */
822 static int dma_pte_clear_range(struct dmar_domain *domain,
823                                 unsigned long start_pfn,
824                                 unsigned long last_pfn)
825 {
826         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
827         unsigned int large_page = 1;
828         struct dma_pte *first_pte, *pte;
829         int order;
830
831         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
832         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
833         BUG_ON(start_pfn > last_pfn);
834
835         /* we don't need lock here; nobody else touches the iova range */
836         do {
837                 large_page = 1;
838                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
839                 if (!pte) {
840                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
841                         continue;
842                 }
843                 do {
844                         dma_clear_pte(pte);
845                         start_pfn += lvl_to_nr_pages(large_page);
846                         pte++;
847                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
848
849                 domain_flush_cache(domain, first_pte,
850                                    (void *)pte - (void *)first_pte);
851
852         } while (start_pfn && start_pfn <= last_pfn);
853
854         order = (large_page - 1) * 9;
855         return order;
856 }
857
858 /* free page table pages. last level pte should already be cleared */
859 static void dma_pte_free_pagetable(struct dmar_domain *domain,
860                                    unsigned long start_pfn,
861                                    unsigned long last_pfn)
862 {
863         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
864         struct dma_pte *first_pte, *pte;
865         int total = agaw_to_level(domain->agaw);
866         int level;
867         unsigned long tmp;
868         int large_page = 2;
869
870         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
871         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
872         BUG_ON(start_pfn > last_pfn);
873
874         /* We don't need lock here; nobody else touches the iova range */
875         level = 2;
876         while (level <= total) {
877                 tmp = align_to_level(start_pfn, level);
878
879                 /* If we can't even clear one PTE at this level, we're done */
880                 if (tmp + level_size(level) - 1 > last_pfn)
881                         return;
882
883                 do {
884                         large_page = level;
885                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
886                         if (large_page > level)
887                                 level = large_page + 1;
888                         if (!pte) {
889                                 tmp = align_to_level(tmp + 1, level + 1);
890                                 continue;
891                         }
892                         do {
893                                 if (dma_pte_present(pte)) {
894                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
895                                         dma_clear_pte(pte);
896                                 }
897                                 pte++;
898                                 tmp += level_size(level);
899                         } while (!first_pte_in_page(pte) &&
900                                  tmp + level_size(level) - 1 <= last_pfn);
901
902                         domain_flush_cache(domain, first_pte,
903                                            (void *)pte - (void *)first_pte);
904
905                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
906                 level++;
907         }
908         /* free pgd */
909         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
910                 free_pgtable_page(domain->pgd);
911                 domain->pgd = NULL;
912         }
913 }
914
915 /* iommu handling */
916 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
917 {
918         struct root_entry *root;
919         unsigned long flags;
920
921         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
922         if (!root)
923                 return -ENOMEM;
924
925         __iommu_flush_cache(iommu, root, ROOT_SIZE);
926
927         spin_lock_irqsave(&iommu->lock, flags);
928         iommu->root_entry = root;
929         spin_unlock_irqrestore(&iommu->lock, flags);
930
931         return 0;
932 }
933
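/*
 * Point the hardware at the root-entry table: write its physical address
 * to DMAR_RTADDR_REG, issue the Set Root Table Pointer command and wait
 * for the RTPS status bit to confirm the hardware has latched it.
 */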
934 static void iommu_set_root_entry(struct intel_iommu *iommu)
935 {
936         void *addr;
937         u32 sts;
938         unsigned long flag;
939
940         addr = iommu->root_entry;
941
942         raw_spin_lock_irqsave(&iommu->register_lock, flag);
943         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
944
945         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
946
947         /* Make sure hardware completes it */
948         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
949                       readl, (sts & DMA_GSTS_RTPS), sts);
950
951         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
952 }
953
954 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
955 {
956         u32 val;
957         unsigned long flag;
958
959         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
960                 return;
961
962         raw_spin_lock_irqsave(&iommu->register_lock, flag);
963         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
964
965         /* Make sure hardware completes it */
966         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
967                       readl, (!(val & DMA_GSTS_WBFS)), val);
968
969         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
970 }
971
972 /* invalidate context-cache entries; scope (global, domain or device) is selected by 'type' */
973 static void __iommu_flush_context(struct intel_iommu *iommu,
974                                   u16 did, u16 source_id, u8 function_mask,
975                                   u64 type)
976 {
977         u64 val = 0;
978         unsigned long flag;
979
980         switch (type) {
981         case DMA_CCMD_GLOBAL_INVL:
982                 val = DMA_CCMD_GLOBAL_INVL;
983                 break;
984         case DMA_CCMD_DOMAIN_INVL:
985                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
986                 break;
987         case DMA_CCMD_DEVICE_INVL:
988                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
989                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
990                 break;
991         default:
992                 BUG();
993         }
994         val |= DMA_CCMD_ICC;
995
996         raw_spin_lock_irqsave(&iommu->register_lock, flag);
997         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
998
999         /* Make sure hardware completes it */
1000         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1001                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1002
1003         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1004 }
1005
1006 /* invalidate IOTLB entries; scope (global, DSI or PSI) is selected by 'type' */
1007 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1008                                 u64 addr, unsigned int size_order, u64 type)
1009 {
1010         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1011         u64 val = 0, val_iva = 0;
1012         unsigned long flag;
1013
1014         switch (type) {
1015         case DMA_TLB_GLOBAL_FLUSH:
1016                 /* global flush doesn't need set IVA_REG */
1017                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1018                 break;
1019         case DMA_TLB_DSI_FLUSH:
1020                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1021                 break;
1022         case DMA_TLB_PSI_FLUSH:
1023                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1024                 /* Note: always flush non-leaf currently */
1025                 val_iva = size_order | addr;
1026                 break;
1027         default:
1028                 BUG();
1029         }
1030         /* Note: set drain read/write */
1031 #if 0
1032         /*
1033          * This is probably just to be extra safe; it looks like we can
1034          * ignore it without any impact.
1035          */
1036         if (cap_read_drain(iommu->cap))
1037                 val |= DMA_TLB_READ_DRAIN;
1038 #endif
1039         if (cap_write_drain(iommu->cap))
1040                 val |= DMA_TLB_WRITE_DRAIN;
1041
1042         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1043         /* Note: Only uses first TLB reg currently */
1044         if (val_iva)
1045                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1046         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1047
1048         /* Make sure hardware completes it */
1049         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1050                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1051
1052         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1053
1054         /* check IOTLB invalidation granularity */
1055         if (DMA_TLB_IAIG(val) == 0)
1056                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1057         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1058                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1059                         (unsigned long long)DMA_TLB_IIRG(type),
1060                         (unsigned long long)DMA_TLB_IAIG(val));
1061 }
1062
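/*
 * Return the device_domain_info for (segment, bus, devfn) if the device
 * can use a device IOTLB: the IOMMU must support Device-TLB invalidation
 * and have queued invalidation enabled, and the device must expose the
 * PCIe ATS capability and be covered by an ATSR unit.  Returns NULL
 * otherwise.
 */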
1063 static struct device_domain_info *iommu_support_dev_iotlb(
1064         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1065 {
1066         int found = 0;
1067         unsigned long flags;
1068         struct device_domain_info *info;
1069         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1070
1071         if (!ecap_dev_iotlb_support(iommu->ecap))
1072                 return NULL;
1073
1074         if (!iommu->qi)
1075                 return NULL;
1076
1077         spin_lock_irqsave(&device_domain_lock, flags);
1078         list_for_each_entry(info, &domain->devices, link)
1079                 if (info->bus == bus && info->devfn == devfn) {
1080                         found = 1;
1081                         break;
1082                 }
1083         spin_unlock_irqrestore(&device_domain_lock, flags);
1084
1085         if (!found || !info->dev)
1086                 return NULL;
1087
1088         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1089                 return NULL;
1090
1091         if (!dmar_find_matched_atsr_unit(info->dev))
1092                 return NULL;
1093
1094         info->iommu = iommu;
1095
1096         return info;
1097 }
1098
1099 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1100 {
1101         if (!info)
1102                 return;
1103
1104         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1105 }
1106
1107 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1108 {
1109         if (!info->dev || !pci_ats_enabled(info->dev))
1110                 return;
1111
1112         pci_disable_ats(info->dev);
1113 }
1114
1115 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1116                                   u64 addr, unsigned mask)
1117 {
1118         u16 sid, qdep;
1119         unsigned long flags;
1120         struct device_domain_info *info;
1121
1122         spin_lock_irqsave(&device_domain_lock, flags);
1123         list_for_each_entry(info, &domain->devices, link) {
1124                 if (!info->dev || !pci_ats_enabled(info->dev))
1125                         continue;
1126
1127                 sid = info->bus << 8 | info->devfn;
1128                 qdep = pci_ats_queue_depth(info->dev);
1129                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1130         }
1131         spin_unlock_irqrestore(&device_domain_lock, flags);
1132 }
1133
1134 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1135                                   unsigned long pfn, unsigned int pages, int map)
1136 {
1137         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1138         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1139
1140         BUG_ON(pages == 0);
1141
1142         /*
1143          * Fall back to a domain-selective flush if there is no PSI support
1144          * or the size is too big.
1145          * PSI requires the page count to be a power of two, and the base
1146          * address to be naturally aligned to that size.
1147          */
1148         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1149                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1150                                                 DMA_TLB_DSI_FLUSH);
1151         else
1152                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1153                                                 DMA_TLB_PSI_FLUSH);
1154
1155         /*
1156          * In caching mode, changes of pages from non-present to present require
1157          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1158          */
1159         if (!cap_caching_mode(iommu->cap) || !map)
1160                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1161 }
1162
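/*
 * Disable the hardware's protected memory regions: clear the Enable
 * Protected Memory bit in DMAR_PMEN_REG and wait for the Protected Region
 * Status bit to report that protection is off.
 */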
1163 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1164 {
1165         u32 pmen;
1166         unsigned long flags;
1167
1168         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1169         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1170         pmen &= ~DMA_PMEN_EPM;
1171         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1172
1173         /* wait for the protected region status bit to clear */
1174         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1175                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1176
1177         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1178 }
1179
1180 static int iommu_enable_translation(struct intel_iommu *iommu)
1181 {
1182         u32 sts;
1183         unsigned long flags;
1184
1185         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1186         iommu->gcmd |= DMA_GCMD_TE;
1187         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1188
1189         /* Make sure hardware completes it */
1190         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1191                       readl, (sts & DMA_GSTS_TES), sts);
1192
1193         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1194         return 0;
1195 }
1196
1197 static int iommu_disable_translation(struct intel_iommu *iommu)
1198 {
1199         u32 sts;
1200         unsigned long flag;
1201
1202         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1203         iommu->gcmd &= ~DMA_GCMD_TE;
1204         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1205
1206         /* Make sure hardware completes it */
1207         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1208                       readl, (!(sts & DMA_GSTS_TES)), sts);
1209
1210         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1211         return 0;
1212 }
1213
1214
1215 static int iommu_init_domains(struct intel_iommu *iommu)
1216 {
1217         unsigned long ndomains;
1218         unsigned long nlongs;
1219
1220         ndomains = cap_ndoms(iommu->cap);
1221         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1222                         ndomains);
1223         nlongs = BITS_TO_LONGS(ndomains);
1224
1225         spin_lock_init(&iommu->lock);
1226
1227         /* TBD: there might be 64K domains,
1228          * consider another allocation scheme for future chips
1229          */
1230         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1231         if (!iommu->domain_ids) {
1232                 printk(KERN_ERR "Allocating domain id array failed\n");
1233                 return -ENOMEM;
1234         }
1235         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1236                         GFP_KERNEL);
1237         if (!iommu->domains) {
1238                 printk(KERN_ERR "Allocating domain array failed\n");
1239                 return -ENOMEM;
1240         }
1241
1242         /*
1243          * If caching mode is set, then invalid translations are tagged
1244          * with domain id 0. Hence we need to pre-allocate it.
1245          */
1246         if (cap_caching_mode(iommu->cap))
1247                 set_bit(0, iommu->domain_ids);
1248         return 0;
1249 }
1250
1251
1252 static void domain_exit(struct dmar_domain *domain);
1253 static void vm_domain_exit(struct dmar_domain *domain);
1254
1255 void free_dmar_iommu(struct intel_iommu *iommu)
1256 {
1257         struct dmar_domain *domain;
1258         int i;
1259         unsigned long flags;
1260
1261         if ((iommu->domains) && (iommu->domain_ids)) {
1262                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1263                         domain = iommu->domains[i];
1264                         clear_bit(i, iommu->domain_ids);
1265
1266                         spin_lock_irqsave(&domain->iommu_lock, flags);
1267                         if (--domain->iommu_count == 0) {
1268                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1269                                         vm_domain_exit(domain);
1270                                 else
1271                                         domain_exit(domain);
1272                         }
1273                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1274                 }
1275         }
1276
1277         if (iommu->gcmd & DMA_GCMD_TE)
1278                 iommu_disable_translation(iommu);
1279
1280         if (iommu->irq) {
1281                 irq_set_handler_data(iommu->irq, NULL);
1282                 /* This will mask the irq */
1283                 free_irq(iommu->irq, iommu);
1284                 destroy_irq(iommu->irq);
1285         }
1286
1287         kfree(iommu->domains);
1288         kfree(iommu->domain_ids);
1289
1290         g_iommus[iommu->seq_id] = NULL;
1291
1292         /* if all iommus are freed, free g_iommus */
1293         for (i = 0; i < g_num_of_iommus; i++) {
1294                 if (g_iommus[i])
1295                         break;
1296         }
1297
1298         if (i == g_num_of_iommus)
1299                 kfree(g_iommus);
1300
1301         /* free context mapping */
1302         free_context_table(iommu);
1303 }
1304
1305 static struct dmar_domain *alloc_domain(void)
1306 {
1307         struct dmar_domain *domain;
1308
1309         domain = alloc_domain_mem();
1310         if (!domain)
1311                 return NULL;
1312
1313         domain->nid = -1;
1314         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1315         domain->flags = 0;
1316
1317         return domain;
1318 }
1319
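/*
 * Bind 'domain' to 'iommu': allocate a free domain id from the IOMMU's
 * domain_ids bitmap, record the domain in its domains[] array and mark
 * this IOMMU in the domain's iommu bitmap.  Fails if the IOMMU has no
 * free domain ids left.
 */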
1320 static int iommu_attach_domain(struct dmar_domain *domain,
1321                                struct intel_iommu *iommu)
1322 {
1323         int num;
1324         unsigned long ndomains;
1325         unsigned long flags;
1326
1327         ndomains = cap_ndoms(iommu->cap);
1328
1329         spin_lock_irqsave(&iommu->lock, flags);
1330
1331         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1332         if (num >= ndomains) {
1333                 spin_unlock_irqrestore(&iommu->lock, flags);
1334                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1335                 return -ENOMEM;
1336         }
1337
1338         domain->id = num;
1339         set_bit(num, iommu->domain_ids);
1340         set_bit(iommu->seq_id, &domain->iommu_bmp);
1341         iommu->domains[num] = domain;
1342         spin_unlock_irqrestore(&iommu->lock, flags);
1343
1344         return 0;
1345 }
1346
1347 static void iommu_detach_domain(struct dmar_domain *domain,
1348                                 struct intel_iommu *iommu)
1349 {
1350         unsigned long flags;
1351         int num, ndomains;
1352         int found = 0;
1353
1354         spin_lock_irqsave(&iommu->lock, flags);
1355         ndomains = cap_ndoms(iommu->cap);
1356         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1357                 if (iommu->domains[num] == domain) {
1358                         found = 1;
1359                         break;
1360                 }
1361         }
1362
1363         if (found) {
1364                 clear_bit(num, iommu->domain_ids);
1365                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1366                 iommu->domains[num] = NULL;
1367         }
1368         spin_unlock_irqrestore(&iommu->lock, flags);
1369 }
1370
1371 static struct iova_domain reserved_iova_list;
1372 static struct lock_class_key reserved_rbtree_key;
1373
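/*
 * Build the global list of IOVA ranges that must never be handed out to
 * devices: the IOAPIC MMIO window and every PCI memory BAR, so that DMA
 * can't be claimed by peer-to-peer decoding.
 */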
1374 static int dmar_init_reserved_ranges(void)
1375 {
1376         struct pci_dev *pdev = NULL;
1377         struct iova *iova;
1378         int i;
1379
1380         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1381
1382         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1383                 &reserved_rbtree_key);
1384
1385         /* IOAPIC ranges shouldn't be accessed by DMA */
1386         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1387                 IOVA_PFN(IOAPIC_RANGE_END));
1388         if (!iova) {
1389                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1390                 return -ENODEV;
1391         }
1392
1393         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1394         for_each_pci_dev(pdev) {
1395                 struct resource *r;
1396
1397                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1398                         r = &pdev->resource[i];
1399                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1400                                 continue;
1401                         iova = reserve_iova(&reserved_iova_list,
1402                                             IOVA_PFN(r->start),
1403                                             IOVA_PFN(r->end));
1404                         if (!iova) {
1405                                 printk(KERN_ERR "Reserve iova failed\n");
1406                                 return -ENODEV;
1407                         }
1408                 }
1409         }
1410         return 0;
1411 }
1412
1413 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1414 {
1415         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1416 }
1417
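/*
 * Round a guest address width up to a width the page-table format can
 * represent: 12 offset bits plus a whole number of 9-bit levels, capped
 * at 64.  For example a 36-bit guest width becomes 39 bits, while 48
 * stays 48.
 */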
1418 static inline int guestwidth_to_adjustwidth(int gaw)
1419 {
1420         int agaw;
1421         int r = (gaw - 12) % 9;
1422
1423         if (r == 0)
1424                 agaw = gaw;
1425         else
1426                 agaw = gaw + 9 - r;
1427         if (agaw > 64)
1428                 agaw = 64;
1429         return agaw;
1430 }
1431
1432 static int domain_init(struct dmar_domain *domain, int guest_width)
1433 {
1434         struct intel_iommu *iommu;
1435         int adjust_width, agaw;
1436         unsigned long sagaw;
1437
1438         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1439         spin_lock_init(&domain->iommu_lock);
1440
1441         domain_reserve_special_ranges(domain);
1442
1443         /* calculate AGAW */
1444         iommu = domain_get_iommu(domain);
1445         if (guest_width > cap_mgaw(iommu->cap))
1446                 guest_width = cap_mgaw(iommu->cap);
1447         domain->gaw = guest_width;
1448         adjust_width = guestwidth_to_adjustwidth(guest_width);
1449         agaw = width_to_agaw(adjust_width);
1450         sagaw = cap_sagaw(iommu->cap);
1451         if (!test_bit(agaw, &sagaw)) {
1452                 /* hardware doesn't support it, choose a bigger one */
1453                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1454                 agaw = find_next_bit(&sagaw, 5, agaw);
1455                 if (agaw >= 5)
1456                         return -ENODEV;
1457         }
1458         domain->agaw = agaw;
1459         INIT_LIST_HEAD(&domain->devices);
1460
1461         if (ecap_coherent(iommu->ecap))
1462                 domain->iommu_coherency = 1;
1463         else
1464                 domain->iommu_coherency = 0;
1465
1466         if (ecap_sc_support(iommu->ecap))
1467                 domain->iommu_snooping = 1;
1468         else
1469                 domain->iommu_snooping = 0;
1470
1471         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1472         domain->iommu_count = 1;
1473         domain->nid = iommu->node;
1474
1475         /* always allocate the top pgd */
1476         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1477         if (!domain->pgd)
1478                 return -ENOMEM;
1479         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1480         return 0;
1481 }
1482
1483 static void domain_exit(struct dmar_domain *domain)
1484 {
1485         struct dmar_drhd_unit *drhd;
1486         struct intel_iommu *iommu;
1487
1488         /* Domain 0 is reserved, so don't process it */
1489         if (!domain)
1490                 return;
1491
1492         /* Flush any lazy unmaps that may reference this domain */
1493         if (!intel_iommu_strict)
1494                 flush_unmaps_timeout(0);
1495
1496         domain_remove_dev_info(domain);
1497         /* destroy iovas */
1498         put_iova_domain(&domain->iovad);
1499
1500         /* clear ptes */
1501         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1502
1503         /* free page tables */
1504         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1505
1506         for_each_active_iommu(iommu, drhd)
1507                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1508                         iommu_detach_domain(domain, iommu);
1509
1510         free_domain_mem(domain);
1511 }
1512
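/*
 * Install the context entry for (segment, bus, devfn) so the device uses
 * this domain's page tables (or pass-through translation).  VM and static
 * identity domains get a per-IOMMU domain id allocated on demand; once
 * the entry is written, the context cache and IOTLB (in caching mode) or
 * the write buffer are flushed, and the domain's per-IOMMU bookkeeping is
 * updated.
 */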
1513 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1514                                  u8 bus, u8 devfn, int translation)
1515 {
1516         struct context_entry *context;
1517         unsigned long flags;
1518         struct intel_iommu *iommu;
1519         struct dma_pte *pgd;
1520         unsigned long num;
1521         unsigned long ndomains;
1522         int id;
1523         int agaw;
1524         struct device_domain_info *info = NULL;
1525
1526         pr_debug("Set context mapping for %02x:%02x.%d\n",
1527                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1528
1529         BUG_ON(!domain->pgd);
1530         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1531                translation != CONTEXT_TT_MULTI_LEVEL);
1532
1533         iommu = device_to_iommu(segment, bus, devfn);
1534         if (!iommu)
1535                 return -ENODEV;
1536
1537         context = device_to_context_entry(iommu, bus, devfn);
1538         if (!context)
1539                 return -ENOMEM;
1540         spin_lock_irqsave(&iommu->lock, flags);
1541         if (context_present(context)) {
1542                 spin_unlock_irqrestore(&iommu->lock, flags);
1543                 return 0;
1544         }
1545
1546         id = domain->id;
1547         pgd = domain->pgd;
1548
1549         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1550             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1551                 int found = 0;
1552
1553                 /* find an available domain id for this device in iommu */
1554                 ndomains = cap_ndoms(iommu->cap);
1555                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1556                         if (iommu->domains[num] == domain) {
1557                                 id = num;
1558                                 found = 1;
1559                                 break;
1560                         }
1561                 }
1562
1563                 if (found == 0) {
1564                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1565                         if (num >= ndomains) {
1566                                 spin_unlock_irqrestore(&iommu->lock, flags);
1567                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1568                                 return -EFAULT;
1569                         }
1570
1571                         set_bit(num, iommu->domain_ids);
1572                         iommu->domains[num] = domain;
1573                         id = num;
1574                 }
1575
1576                 /* Skip top levels of page tables for
1577                  * an iommu whose agaw is smaller than the default.
1578                  * Unnecessary for PT mode.
1579                  */
1580                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1581                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1582                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1583                                 if (!dma_pte_present(pgd)) {
1584                                         spin_unlock_irqrestore(&iommu->lock, flags);
1585                                         return -ENOMEM;
1586                                 }
1587                         }
1588                 }
1589         }
1590
1591         context_set_domain_id(context, id);
1592
1593         if (translation != CONTEXT_TT_PASS_THROUGH) {
1594                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1595                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1596                                      CONTEXT_TT_MULTI_LEVEL;
1597         }
1598         /*
1599          * In pass-through mode, AW must be programmed to indicate the largest
1600          * AGAW value supported by the hardware, and ASR is ignored.
1601          */
1602         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1603                 context_set_address_width(context, iommu->msagaw);
1604         else {
1605                 context_set_address_root(context, virt_to_phys(pgd));
1606                 context_set_address_width(context, iommu->agaw);
1607         }
1608
1609         context_set_translation_type(context, translation);
1610         context_set_fault_enable(context);
1611         context_set_present(context);
1612         domain_flush_cache(domain, context, sizeof(*context));
1613
1614         /*
1615          * It's a non-present to present mapping. If the hardware doesn't cache
1616          * non-present entries we only need to flush the write-buffer. If it
1617          * _does_ cache non-present entries, then it does so in the special
1618          * domain #0, which we have to flush:
1619          */
1620         if (cap_caching_mode(iommu->cap)) {
1621                 iommu->flush.flush_context(iommu, 0,
1622                                            (((u16)bus) << 8) | devfn,
1623                                            DMA_CCMD_MASK_NOBIT,
1624                                            DMA_CCMD_DEVICE_INVL);
1625                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1626         } else {
1627                 iommu_flush_write_buffer(iommu);
1628         }
1629         iommu_enable_dev_iotlb(info);
1630         spin_unlock_irqrestore(&iommu->lock, flags);
1631
1632         spin_lock_irqsave(&domain->iommu_lock, flags);
1633         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1634                 domain->iommu_count++;
1635                 if (domain->iommu_count == 1)
1636                         domain->nid = iommu->node;
1637                 domain_update_iommu_cap(domain);
1638         }
1639         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1640         return 0;
1641 }
1642
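/*
 * Set up context entries for @pdev and, if it sits behind a PCIe-to-PCI
 * bridge, for every bridge on the path up to and including that bridge,
 * since devices behind it are seen with the bridge's source-id.
 */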
1643 static int
1644 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1645                         int translation)
1646 {
1647         int ret;
1648         struct pci_dev *tmp, *parent;
1649
1650         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1651                                          pdev->bus->number, pdev->devfn,
1652                                          translation);
1653         if (ret)
1654                 return ret;
1655
1656         /* dependent device mapping */
1657         tmp = pci_find_upstream_pcie_bridge(pdev);
1658         if (!tmp)
1659                 return 0;
1660         /* Secondary interface's bus number and devfn 0 */
1661         parent = pdev->bus->self;
1662         while (parent != tmp) {
1663                 ret = domain_context_mapping_one(domain,
1664                                                  pci_domain_nr(parent->bus),
1665                                                  parent->bus->number,
1666                                                  parent->devfn, translation);
1667                 if (ret)
1668                         return ret;
1669                 parent = parent->bus->self;
1670         }
1671         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1672                 return domain_context_mapping_one(domain,
1673                                         pci_domain_nr(tmp->subordinate),
1674                                         tmp->subordinate->number, 0,
1675                                         translation);
1676         else /* this is a legacy PCI bridge */
1677                 return domain_context_mapping_one(domain,
1678                                                   pci_domain_nr(tmp->bus),
1679                                                   tmp->bus->number,
1680                                                   tmp->devfn,
1681                                                   translation);
1682 }
1683
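/*
 * Check whether context entries already exist for @pdev and for every
 * bridge on the path to its upstream PCIe-to-PCI bridge, mirroring the
 * walk performed by domain_context_mapping().
 */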
1684 static int domain_context_mapped(struct pci_dev *pdev)
1685 {
1686         int ret;
1687         struct pci_dev *tmp, *parent;
1688         struct intel_iommu *iommu;
1689
1690         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1691                                 pdev->devfn);
1692         if (!iommu)
1693                 return -ENODEV;
1694
1695         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1696         if (!ret)
1697                 return ret;
1698         /* dependent device mapping */
1699         tmp = pci_find_upstream_pcie_bridge(pdev);
1700         if (!tmp)
1701                 return ret;
1702         /* Secondary interface's bus number and devfn 0 */
1703         parent = pdev->bus->self;
1704         while (parent != tmp) {
1705                 ret = device_context_mapped(iommu, parent->bus->number,
1706                                             parent->devfn);
1707                 if (!ret)
1708                         return ret;
1709                 parent = parent->bus->self;
1710         }
1711         if (pci_is_pcie(tmp))
1712                 return device_context_mapped(iommu, tmp->subordinate->number,
1713                                              0);
1714         else
1715                 return device_context_mapped(iommu, tmp->bus->number,
1716                                              tmp->devfn);
1717 }
1718
1719 /* Returns a number of VTD pages, but aligned to MM page size */
1720 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1721                                             size_t size)
1722 {
1723         host_addr &= ~PAGE_MASK;
1724         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1725 }
1726
1727 /* Return largest possible superpage level for a given mapping */
1728 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1729                                           unsigned long iov_pfn,
1730                                           unsigned long phy_pfn,
1731                                           unsigned long pages)
1732 {
1733         int support, level = 1;
1734         unsigned long pfnmerge;
1735
1736         support = domain->iommu_superpage;
1737
1738         /* To use a large page, the virtual *and* physical addresses
1739            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1740            of them will mean we have to use smaller pages. So just
1741            merge them and check both at once. */
1742         pfnmerge = iov_pfn | phy_pfn;
1743
1744         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1745                 pages >>= VTD_STRIDE_SHIFT;
1746                 if (!pages)
1747                         break;
1748                 pfnmerge >>= VTD_STRIDE_SHIFT;
1749                 level++;
1750                 support--;
1751         }
1752         return level;
1753 }
1754
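/*
 * Install PTEs for @nr_pages of IOVA space starting at @iov_pfn, taking the
 * backing pages either from the scatterlist @sg or from a contiguous range
 * starting at @phys_pfn.  Superpages are used where the hardware supports
 * them and the alignment of both addresses allows it.
 */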
1755 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1756                             struct scatterlist *sg, unsigned long phys_pfn,
1757                             unsigned long nr_pages, int prot)
1758 {
1759         struct dma_pte *first_pte = NULL, *pte = NULL;
1760         phys_addr_t uninitialized_var(pteval);
1761         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1762         unsigned long sg_res;
1763         unsigned int largepage_lvl = 0;
1764         unsigned long lvl_pages = 0;
1765
1766         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1767
1768         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1769                 return -EINVAL;
1770
1771         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1772
1773         if (sg)
1774                 sg_res = 0;
1775         else {
1776                 sg_res = nr_pages + 1;
1777                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1778         }
1779
1780         while (nr_pages > 0) {
1781                 uint64_t tmp;
1782
1783                 if (!sg_res) {
1784                         sg_res = aligned_nrpages(sg->offset, sg->length);
1785                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1786                         sg->dma_length = sg->length;
1787                         pteval = page_to_phys(sg_page(sg)) | prot;
1788                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1789                 }
1790
1791                 if (!pte) {
1792                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1793
1794                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1795                         if (!pte)
1796                                 return -ENOMEM;
1797                         /* It is a large page */
1798                         if (largepage_lvl > 1)
1799                                 pteval |= DMA_PTE_LARGE_PAGE;
1800                         else
1801                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1802
1803                 }
1804                 /* We don't need a lock here; nobody else
1805                  * touches this iova range
1806                  */
1807                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1808                 if (tmp) {
1809                         static int dumps = 5;
1810                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1811                                iov_pfn, tmp, (unsigned long long)pteval);
1812                         if (dumps) {
1813                                 dumps--;
1814                                 debug_dma_dump_mappings(NULL);
1815                         }
1816                         WARN_ON(1);
1817                 }
1818
1819                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1820
1821                 BUG_ON(nr_pages < lvl_pages);
1822                 BUG_ON(sg_res < lvl_pages);
1823
1824                 nr_pages -= lvl_pages;
1825                 iov_pfn += lvl_pages;
1826                 phys_pfn += lvl_pages;
1827                 pteval += lvl_pages * VTD_PAGE_SIZE;
1828                 sg_res -= lvl_pages;
1829
1830                 /* If the next PTE would be the first in a new page, then we
1831                    need to flush the cache on the entries we've just written.
1832                    And then we'll need to recalculate 'pte', so clear it and
1833                    let it get set again in the if (!pte) block above.
1834
1835                    If we're done (!nr_pages) we need to flush the cache too.
1836
1837                    Also if we've been setting superpages, we may need to
1838                    recalculate 'pte' and switch back to smaller pages for the
1839                    end of the mapping, if the trailing size is not enough to
1840                    use another superpage (i.e. sg_res < lvl_pages). */
1841                 pte++;
1842                 if (!nr_pages || first_pte_in_page(pte) ||
1843                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1844                         domain_flush_cache(domain, first_pte,
1845                                            (void *)pte - (void *)first_pte);
1846                         pte = NULL;
1847                 }
1848
1849                 if (!sg_res && nr_pages)
1850                         sg = sg_next(sg);
1851         }
1852         return 0;
1853 }
1854
1855 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1856                                     struct scatterlist *sg, unsigned long nr_pages,
1857                                     int prot)
1858 {
1859         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1860 }
1861
1862 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1863                                      unsigned long phys_pfn, unsigned long nr_pages,
1864                                      int prot)
1865 {
1866         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1867 }
1868
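/*
 * Clear the context entry for bus/devfn on @iommu and flush the context
 * and IOTLB caches so the hardware stops using the stale translation.
 */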
1869 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1870 {
1871         if (!iommu)
1872                 return;
1873
1874         clear_context_table(iommu, bus, devfn);
1875         iommu->flush.flush_context(iommu, 0, 0, 0,
1876                                            DMA_CCMD_GLOBAL_INVL);
1877         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1878 }
1879
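/*
 * Detach every device from @domain: unlink its device_domain_info, disable
 * its device-IOTLB, clear its context entry and free the bookkeeping.
 */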
1880 static void domain_remove_dev_info(struct dmar_domain *domain)
1881 {
1882         struct device_domain_info *info;
1883         unsigned long flags;
1884         struct intel_iommu *iommu;
1885
1886         spin_lock_irqsave(&device_domain_lock, flags);
1887         while (!list_empty(&domain->devices)) {
1888                 info = list_entry(domain->devices.next,
1889                         struct device_domain_info, link);
1890                 list_del(&info->link);
1891                 list_del(&info->global);
1892                 if (info->dev)
1893                         info->dev->dev.archdata.iommu = NULL;
1894                 spin_unlock_irqrestore(&device_domain_lock, flags);
1895
1896                 iommu_disable_dev_iotlb(info);
1897                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1898                 iommu_detach_dev(iommu, info->bus, info->devfn);
1899                 free_devinfo_mem(info);
1900
1901                 spin_lock_irqsave(&device_domain_lock, flags);
1902         }
1903         spin_unlock_irqrestore(&device_domain_lock, flags);
1904 }
1905
1906 /*
1907  * find_domain
1908  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1909  */
1910 static struct dmar_domain *
1911 find_domain(struct pci_dev *pdev)
1912 {
1913         struct device_domain_info *info;
1914
1915         /* No lock here; we assume no domain exits in the normal case */
1916         info = pdev->dev.archdata.iommu;
1917         if (info)
1918                 return info->domain;
1919         return NULL;
1920 }
1921
1922 /* domain is initialized */
1923 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1924 {
1925         struct dmar_domain *domain, *found = NULL;
1926         struct intel_iommu *iommu;
1927         struct dmar_drhd_unit *drhd;
1928         struct device_domain_info *info, *tmp;
1929         struct pci_dev *dev_tmp;
1930         unsigned long flags;
1931         int bus = 0, devfn = 0;
1932         int segment;
1933         int ret;
1934
1935         domain = find_domain(pdev);
1936         if (domain)
1937                 return domain;
1938
1939         segment = pci_domain_nr(pdev->bus);
1940
1941         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1942         if (dev_tmp) {
1943                 if (pci_is_pcie(dev_tmp)) {
1944                         bus = dev_tmp->subordinate->number;
1945                         devfn = 0;
1946                 } else {
1947                         bus = dev_tmp->bus->number;
1948                         devfn = dev_tmp->devfn;
1949                 }
1950                 spin_lock_irqsave(&device_domain_lock, flags);
1951                 list_for_each_entry(info, &device_domain_list, global) {
1952                         if (info->segment == segment &&
1953                             info->bus == bus && info->devfn == devfn) {
1954                                 found = info->domain;
1955                                 break;
1956                         }
1957                 }
1958                 spin_unlock_irqrestore(&device_domain_lock, flags);
1959                 /* pcie-pci bridge already has a domain, use it */
1960                 if (found) {
1961                         domain = found;
1962                         goto found_domain;
1963                 }
1964         }
1965
1966         domain = alloc_domain();
1967         if (!domain)
1968                 goto error;
1969
1970         /* Allocate new domain for the device */
1971         drhd = dmar_find_matched_drhd_unit(pdev);
1972         if (!drhd) {
1973                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1974                         pci_name(pdev));
1975                 return NULL;
1976         }
1977         iommu = drhd->iommu;
1978
1979         ret = iommu_attach_domain(domain, iommu);
1980         if (ret) {
1981                 free_domain_mem(domain);
1982                 goto error;
1983         }
1984
1985         if (domain_init(domain, gaw)) {
1986                 domain_exit(domain);
1987                 goto error;
1988         }
1989
1990         /* register pcie-to-pci device */
1991         if (dev_tmp) {
1992                 info = alloc_devinfo_mem();
1993                 if (!info) {
1994                         domain_exit(domain);
1995                         goto error;
1996                 }
1997                 info->segment = segment;
1998                 info->bus = bus;
1999                 info->devfn = devfn;
2000                 info->dev = NULL;
2001                 info->domain = domain;
2002                 /* This domain is shared by devices under the p2p bridge */
2003                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2004
2005                 /* pcie-to-pci bridge already has a domain, use it */
2006                 found = NULL;
2007                 spin_lock_irqsave(&device_domain_lock, flags);
2008                 list_for_each_entry(tmp, &device_domain_list, global) {
2009                         if (tmp->segment == segment &&
2010                             tmp->bus == bus && tmp->devfn == devfn) {
2011                                 found = tmp->domain;
2012                                 break;
2013                         }
2014                 }
2015                 if (found) {
2016                         spin_unlock_irqrestore(&device_domain_lock, flags);
2017                         free_devinfo_mem(info);
2018                         domain_exit(domain);
2019                         domain = found;
2020                 } else {
2021                         list_add(&info->link, &domain->devices);
2022                         list_add(&info->global, &device_domain_list);
2023                         spin_unlock_irqrestore(&device_domain_lock, flags);
2024                 }
2025         }
2026
2027 found_domain:
2028         info = alloc_devinfo_mem();
2029         if (!info)
2030                 goto error;
2031         info->segment = segment;
2032         info->bus = pdev->bus->number;
2033         info->devfn = pdev->devfn;
2034         info->dev = pdev;
2035         info->domain = domain;
2036         spin_lock_irqsave(&device_domain_lock, flags);
2037         /* somebody else set it up while we weren't holding the lock */
2038         found = find_domain(pdev);
2039         if (found != NULL) {
2040                 spin_unlock_irqrestore(&device_domain_lock, flags);
2041                 if (found != domain) {
2042                         domain_exit(domain);
2043                         domain = found;
2044                 }
2045                 free_devinfo_mem(info);
2046                 return domain;
2047         }
2048         list_add(&info->link, &domain->devices);
2049         list_add(&info->global, &device_domain_list);
2050         pdev->dev.archdata.iommu = info;
2051         spin_unlock_irqrestore(&device_domain_lock, flags);
2052         return domain;
2053 error:
2054         /* recheck it here; another thread may have set it in the meantime */
2055         return find_domain(pdev);
2056 }
2057
2058 static int iommu_identity_mapping;
2059 #define IDENTMAP_ALL            1
2060 #define IDENTMAP_GFX            2
2061 #define IDENTMAP_AZALIA         4
2062
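/*
 * Identity map the physical range [start, end] into @domain: reserve the
 * matching IOVA range, clear any stale PTEs (an RMRR may overlap memory
 * that was already mapped) and install read/write 1:1 mappings.
 */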
2063 static int iommu_domain_identity_map(struct dmar_domain *domain,
2064                                      unsigned long long start,
2065                                      unsigned long long end)
2066 {
2067         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2068         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2069
2070         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2071                           dma_to_mm_pfn(last_vpfn))) {
2072                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2073                 return -ENOMEM;
2074         }
2075
2076         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2077                  start, end, domain->id);
2078         /*
2079          * RMRR range might have overlap with physical memory range,
2080          * clear it first
2081          */
2082         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2083
2084         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2085                                   last_vpfn - first_vpfn + 1,
2086                                   DMA_PTE_READ|DMA_PTE_WRITE);
2087 }
2088
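/*
 * Create an identity mapping for an RMRR-style range on @pdev's domain and
 * program the context entries, after sanity-checking the range against the
 * domain's address width.  Hardware pass-through devices in si_domain are
 * skipped.
 */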
2089 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2090                                       unsigned long long start,
2091                                       unsigned long long end)
2092 {
2093         struct dmar_domain *domain;
2094         int ret;
2095
2096         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2097         if (!domain)
2098                 return -ENOMEM;
2099
2100         /* For _hardware_ passthrough, don't bother. But for software
2101            passthrough, we do it anyway -- it may indicate a memory
2102            range which is reserved in E820 and thus didn't get set
2103            up in si_domain to start with */
2104         if (domain == si_domain && hw_pass_through) {
2105                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2106                        pci_name(pdev), start, end);
2107                 return 0;
2108         }
2109
2110         printk(KERN_INFO
2111                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2112                pci_name(pdev), start, end);
2113
2114         if (end < start) {
2115                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2116                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2117                         dmi_get_system_info(DMI_BIOS_VENDOR),
2118                         dmi_get_system_info(DMI_BIOS_VERSION),
2119                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2120                 ret = -EIO;
2121                 goto error;
2122         }
2123
2124         if (end >> agaw_to_width(domain->agaw)) {
2125                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2126                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2127                      agaw_to_width(domain->agaw),
2128                      dmi_get_system_info(DMI_BIOS_VENDOR),
2129                      dmi_get_system_info(DMI_BIOS_VERSION),
2130                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2131                 ret = -EIO;
2132                 goto error;
2133         }
2134
2135         ret = iommu_domain_identity_map(domain, start, end);
2136         if (ret)
2137                 goto error;
2138
2139         /* context entry init */
2140         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2141         if (ret)
2142                 goto error;
2143
2144         return 0;
2145
2146  error:
2147         domain_exit(domain);
2148         return ret;
2149 }
2150
2151 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2152         struct pci_dev *pdev)
2153 {
2154         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2155                 return 0;
2156         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2157                 rmrr->end_address);
2158 }
2159
2160 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2161 static inline void iommu_prepare_isa(void)
2162 {
2163         struct pci_dev *pdev;
2164         int ret;
2165
2166         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2167         if (!pdev)
2168                 return;
2169
2170         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2171         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2172
2173         if (ret)
2174                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2175                        "floppy might not work\n");
2176
2177 }
2178 #else
2179 static inline void iommu_prepare_isa(void)
2180 {
2181         return;
2182 }
2183 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2184
2185 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2186
2187 static int __init si_domain_work_fn(unsigned long start_pfn,
2188                                     unsigned long end_pfn, void *datax)
2189 {
2190         int *ret = datax;
2191
2192         *ret = iommu_domain_identity_map(si_domain,
2193                                          (uint64_t)start_pfn << PAGE_SHIFT,
2194                                          (uint64_t)end_pfn << PAGE_SHIFT);
2195         return *ret;
2196
2197 }
2198
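/*
 * Allocate the static identity (si) domain, attach it to every active
 * IOMMU and, unless hardware pass-through is used, identity map all usable
 * physical memory into it.
 */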
2199 static int __init si_domain_init(int hw)
2200 {
2201         struct dmar_drhd_unit *drhd;
2202         struct intel_iommu *iommu;
2203         int nid, ret = 0;
2204
2205         si_domain = alloc_domain();
2206         if (!si_domain)
2207                 return -EFAULT;
2208
2209         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2210
2211         for_each_active_iommu(iommu, drhd) {
2212                 ret = iommu_attach_domain(si_domain, iommu);
2213                 if (ret) {
2214                         domain_exit(si_domain);
2215                         return -EFAULT;
2216                 }
2217         }
2218
2219         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2220                 domain_exit(si_domain);
2221                 return -EFAULT;
2222         }
2223
2224         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2225
2226         if (hw)
2227                 return 0;
2228
2229         for_each_online_node(nid) {
2230                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2231                 if (ret)
2232                         return ret;
2233         }
2234
2235         return 0;
2236 }
2237
2238 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2239                                           struct pci_dev *pdev);
2240 static int identity_mapping(struct pci_dev *pdev)
2241 {
2242         struct device_domain_info *info;
2243
2244         if (likely(!iommu_identity_mapping))
2245                 return 0;
2246
2247         info = pdev->dev.archdata.iommu;
2248         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2249                 return (info->domain == si_domain);
2250
2251         return 0;
2252 }
2253
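/*
 * Attach @pdev to @domain: program its context entries with the requested
 * translation type and record the device on the domain's device list and
 * in pdev->dev.archdata.iommu.
 */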
2254 static int domain_add_dev_info(struct dmar_domain *domain,
2255                                struct pci_dev *pdev,
2256                                int translation)
2257 {
2258         struct device_domain_info *info;
2259         unsigned long flags;
2260         int ret;
2261
2262         info = alloc_devinfo_mem();
2263         if (!info)
2264                 return -ENOMEM;
2265
2266         ret = domain_context_mapping(domain, pdev, translation);
2267         if (ret) {
2268                 free_devinfo_mem(info);
2269                 return ret;
2270         }
2271
2272         info->segment = pci_domain_nr(pdev->bus);
2273         info->bus = pdev->bus->number;
2274         info->devfn = pdev->devfn;
2275         info->dev = pdev;
2276         info->domain = domain;
2277
2278         spin_lock_irqsave(&device_domain_lock, flags);
2279         list_add(&info->link, &domain->devices);
2280         list_add(&info->global, &device_domain_list);
2281         pdev->dev.archdata.iommu = info;
2282         spin_unlock_irqrestore(&device_domain_lock, flags);
2283
2284         return 0;
2285 }
2286
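/*
 * Decide whether @pdev is a candidate for the static identity (1:1)
 * domain.  @startup distinguishes the optimistic boot-time pass from
 * run-time rechecks, which also honour the device's DMA mask.
 */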
2287 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2288 {
2289         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2290                 return 1;
2291
2292         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2293                 return 1;
2294
2295         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2296                 return 0;
2297
2298         /*
2299          * We want to start off with all devices in the 1:1 domain, and
2300          * take them out later if we find they can't access all of memory.
2301          *
2302          * However, we can't do this for PCI devices behind bridges,
2303          * because all PCI devices behind the same bridge will end up
2304          * with the same source-id on their transactions.
2305          *
2306          * Practically speaking, we can't change things around for these
2307          * devices at run-time, because we can't be sure there'll be no
2308          * DMA transactions in flight for any of their siblings.
2309          * 
2310          * So PCI devices (unless they're on the root bus) as well as
2311          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2312          * the 1:1 domain, just in _case_ one of their siblings turns out
2313          * not to be able to map all of memory.
2314          */
2315         if (!pci_is_pcie(pdev)) {
2316                 if (!pci_is_root_bus(pdev->bus))
2317                         return 0;
2318                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2319                         return 0;
2320         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2321                 return 0;
2322
2323         /*
2324          * At boot time, we don't yet know if devices will be 64-bit capable.
2325          * Assume that they will -- if they turn out not to be, then we can
2326          * take them out of the 1:1 domain later.
2327          */
2328         if (!startup) {
2329                 /*
2330                  * If the device's dma_mask is less than the system's memory
2331                  * size then this is not a candidate for identity mapping.
2332                  */
2333                 u64 dma_mask = pdev->dma_mask;
2334
2335                 if (pdev->dev.coherent_dma_mask &&
2336                     pdev->dev.coherent_dma_mask < dma_mask)
2337                         dma_mask = pdev->dev.coherent_dma_mask;
2338
2339                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2340         }
2341
2342         return 1;
2343 }
2344
2345 static int __init iommu_prepare_static_identity_mapping(int hw)
2346 {
2347         struct pci_dev *pdev = NULL;
2348         int ret;
2349
2350         ret = si_domain_init(hw);
2351         if (ret)
2352                 return -EFAULT;
2353
2354         for_each_pci_dev(pdev) {
2355                 /* Skip Host/PCI Bridge devices */
2356                 if (IS_BRIDGE_HOST_DEVICE(pdev))
2357                         continue;
2358                 if (iommu_should_identity_map(pdev, 1)) {
2359                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2360                                hw ? "hardware" : "software", pci_name(pdev));
2361
2362                         ret = domain_add_dev_info(si_domain, pdev,
2363                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2364                                                      CONTEXT_TT_MULTI_LEVEL);
2365                         if (ret)
2366                                 return ret;
2367                 }
2368         }
2369
2370         return 0;
2371 }
2372
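/*
 * One-time initialisation of all DMAR units: allocate the per-IOMMU
 * arrays, set up invalidation, create the static identity, RMRR and ISA
 * mappings as needed, and finally enable translation on every unit.
 */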
2373 static int __init init_dmars(void)
2374 {
2375         struct dmar_drhd_unit *drhd;
2376         struct dmar_rmrr_unit *rmrr;
2377         struct pci_dev *pdev;
2378         struct intel_iommu *iommu;
2379         int i, ret;
2380
2381         /*
2382          * for each drhd
2383          *    allocate root
2384          *    initialize and program root entry to not present
2385          * endfor
2386          */
2387         for_each_drhd_unit(drhd) {
2388                 g_num_of_iommus++;
2389                 /*
2390                  * lock not needed as this is only incremented in the single
2391                  * threaded kernel __init code path; all other accesses are
2392                  * read only
2393                  */
2394         }
2395
2396         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2397                         GFP_KERNEL);
2398         if (!g_iommus) {
2399                 printk(KERN_ERR "Allocating global iommu array failed\n");
2400                 ret = -ENOMEM;
2401                 goto error;
2402         }
2403
2404         deferred_flush = kzalloc(g_num_of_iommus *
2405                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2406         if (!deferred_flush) {
2407                 ret = -ENOMEM;
2408                 goto error;
2409         }
2410
2411         for_each_drhd_unit(drhd) {
2412                 if (drhd->ignored)
2413                         continue;
2414
2415                 iommu = drhd->iommu;
2416                 g_iommus[iommu->seq_id] = iommu;
2417
2418                 ret = iommu_init_domains(iommu);
2419                 if (ret)
2420                         goto error;
2421
2422                 /*
2423                  * TBD:
2424                  * we could share the same root & context tables
2425                  * among all IOMMUs. Need to split this later.
2426                  */
2427                 ret = iommu_alloc_root_entry(iommu);
2428                 if (ret) {
2429                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2430                         goto error;
2431                 }
2432                 if (!ecap_pass_through(iommu->ecap))
2433                         hw_pass_through = 0;
2434         }
2435
2436         /*
2437          * Start from a sane iommu hardware state.
2438          */
2439         for_each_drhd_unit(drhd) {
2440                 if (drhd->ignored)
2441                         continue;
2442
2443                 iommu = drhd->iommu;
2444
2445                 /*
2446                  * If the queued invalidation is already initialized by us
2447                  * (for example, while enabling interrupt-remapping) then
2448                  * things are already rolling from a sane state.
2449                  */
2450                 if (iommu->qi)
2451                         continue;
2452
2453                 /*
2454                  * Clear any previous faults.
2455                  */
2456                 dmar_fault(-1, iommu);
2457                 /*
2458                  * Disable queued invalidation if supported and already enabled
2459                  * before OS handover.
2460                  */
2461                 dmar_disable_qi(iommu);
2462         }
2463
2464         for_each_drhd_unit(drhd) {
2465                 if (drhd->ignored)
2466                         continue;
2467
2468                 iommu = drhd->iommu;
2469
2470                 if (dmar_enable_qi(iommu)) {
2471                         /*
2472                          * Queued invalidation could not be enabled; fall back
2473                          * to register-based invalidation
2474                          */
2475                         iommu->flush.flush_context = __iommu_flush_context;
2476                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2477                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2478                                "invalidation\n",
2479                                 iommu->seq_id,
2480                                (unsigned long long)drhd->reg_base_addr);
2481                 } else {
2482                         iommu->flush.flush_context = qi_flush_context;
2483                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2484                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2485                                "invalidation\n",
2486                                 iommu->seq_id,
2487                                (unsigned long long)drhd->reg_base_addr);
2488                 }
2489         }
2490
2491         if (iommu_pass_through)
2492                 iommu_identity_mapping |= IDENTMAP_ALL;
2493
2494 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2495         iommu_identity_mapping |= IDENTMAP_GFX;
2496 #endif
2497
2498         check_tylersburg_isoch();
2499
2500         /*
2501          * If any identity-mapping mode is requested (pass-through, or the
2502          * gfx/azalia workarounds), set up the static identity domain and
2503          * its context entries now.
2504          */
2505         if (iommu_identity_mapping) {
2506                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2507                 if (ret) {
2508                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2509                         goto error;
2510                 }
2511         }
2512         /*
2513          * For each rmrr
2514          *   for each dev attached to rmrr
2515          *   do
2516          *     locate drhd for dev, alloc domain for dev
2517          *     allocate free domain
2518          *     allocate page table entries for rmrr
2519          *     if context not allocated for bus
2520          *           allocate and init context
2521          *           set present in root table for this bus
2522          *     init context with domain, translation etc
2523          *    endfor
2524          * endfor
2525          */
2526         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2527         for_each_rmrr_units(rmrr) {
2528                 for (i = 0; i < rmrr->devices_cnt; i++) {
2529                         pdev = rmrr->devices[i];
2530                         /*
2531                          * some BIOSes list non-existent devices in the
2532                          * DMAR table.
2533                          */
2534                         if (!pdev)
2535                                 continue;
2536                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2537                         if (ret)
2538                                 printk(KERN_ERR
2539                                        "IOMMU: mapping reserved region failed\n");
2540                 }
2541         }
2542
2543         iommu_prepare_isa();
2544
2545         /*
2546          * for each drhd
2547          *   enable fault log
2548          *   global invalidate context cache
2549          *   global invalidate iotlb
2550          *   enable translation
2551          */
2552         for_each_drhd_unit(drhd) {
2553                 if (drhd->ignored) {
2554                         /*
2555                          * we always have to disable PMRs or DMA may fail on
2556                          * this device
2557                          */
2558                         if (force_on)
2559                                 iommu_disable_protect_mem_regions(drhd->iommu);
2560                         continue;
2561                 }
2562                 iommu = drhd->iommu;
2563
2564                 iommu_flush_write_buffer(iommu);
2565
2566                 ret = dmar_set_interrupt(iommu);
2567                 if (ret)
2568                         goto error;
2569
2570                 iommu_set_root_entry(iommu);
2571
2572                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2573                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2574
2575                 ret = iommu_enable_translation(iommu);
2576                 if (ret)
2577                         goto error;
2578
2579                 iommu_disable_protect_mem_regions(iommu);
2580         }
2581
2582         return 0;
2583 error:
2584         for_each_drhd_unit(drhd) {
2585                 if (drhd->ignored)
2586                         continue;
2587                 iommu = drhd->iommu;
2588                 free_iommu(iommu);
2589         }
2590         kfree(g_iommus);
2591         return ret;
2592 }
2593
2594 /* This takes a number of _MM_ pages, not VTD pages */
2595 static struct iova *intel_alloc_iova(struct device *dev,
2596                                      struct dmar_domain *domain,
2597                                      unsigned long nrpages, uint64_t dma_mask)
2598 {
2599         struct pci_dev *pdev = to_pci_dev(dev);
2600         struct iova *iova = NULL;
2601
2602         /* Restrict dma_mask to the width that the iommu can handle */
2603         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2604
2605         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2606                 /*
2607                  * First try to allocate an io virtual address in
2608                  * DMA_BIT_MASK(32) and if that fails then try allocating
2609                  * from higher range
2610                  */
2611                 iova = alloc_iova(&domain->iovad, nrpages,
2612                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2613                 if (iova)
2614                         return iova;
2615         }
2616         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2617         if (unlikely(!iova)) {
2618                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2619                        nrpages, pci_name(pdev));
2620                 return NULL;
2621         }
2622
2623         return iova;
2624 }
2625
2626 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2627 {
2628         struct dmar_domain *domain;
2629         int ret;
2630
2631         domain = get_domain_for_dev(pdev,
2632                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2633         if (!domain) {
2634                 printk(KERN_ERR
2635                         "Allocating domain for %s failed\n", pci_name(pdev));
2636                 return NULL;
2637         }
2638
2639         /* make sure context mapping is ok */
2640         if (unlikely(!domain_context_mapped(pdev))) {
2641                 ret = domain_context_mapping(domain, pdev,
2642                                              CONTEXT_TT_MULTI_LEVEL);
2643                 if (ret) {
2644                         printk(KERN_ERR
2645                                 "Domain context map for %s failed\n",
2646                                 pci_name(pdev));
2647                         return NULL;
2648                 }
2649         }
2650
2651         return domain;
2652 }
2653
2654 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2655 {
2656         struct device_domain_info *info;
2657
2658         /* No lock here; we assume no domain exits in the normal case */
2659         info = dev->dev.archdata.iommu;
2660         if (likely(info))
2661                 return info->domain;
2662
2663         return __get_valid_domain_for_dev(dev);
2664 }
2665
2666 static int iommu_dummy(struct pci_dev *pdev)
2667 {
2668         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2669 }
2670
2671 /* Check whether the pdev needs to go through the non-identity map/unmap path. */
2672 static int iommu_no_mapping(struct device *dev)
2673 {
2674         struct pci_dev *pdev;
2675         int found;
2676
2677         if (unlikely(dev->bus != &pci_bus_type))
2678                 return 1;
2679
2680         pdev = to_pci_dev(dev);
2681         if (iommu_dummy(pdev))
2682                 return 1;
2683
2684         if (!iommu_identity_mapping)
2685                 return 0;
2686
2687         found = identity_mapping(pdev);
2688         if (found) {
2689                 if (iommu_should_identity_map(pdev, 0))
2690                         return 1;
2691                 else {
2692                         /*
2693                          * A 32-bit-only DMA device is removed from si_domain
2694                          * and falls back to non-identity mapping.
2695                          */
2696                         domain_remove_one_dev_info(si_domain, pdev);
2697                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2698                                pci_name(pdev));
2699                         return 0;
2700                 }
2701         } else {
2702                 /*
2703                  * A 64-bit DMA capable device that was detached from a VM
2704                  * domain is put back into si_domain for identity mapping.
2705                  */
2706                 if (iommu_should_identity_map(pdev, 0)) {
2707                         int ret;
2708                         ret = domain_add_dev_info(si_domain, pdev,
2709                                                   hw_pass_through ?
2710                                                   CONTEXT_TT_PASS_THROUGH :
2711                                                   CONTEXT_TT_MULTI_LEVEL);
2712                         if (!ret) {
2713                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2714                                        pci_name(pdev));
2715                                 return 1;
2716                         }
2717                 }
2718         }
2719
2720         return 0;
2721 }
2722
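/*
 * Core single-range mapping helper: allocate an IOVA range that fits under
 * @dma_mask, install the PTEs for it and return the resulting bus address
 * (or the physical address unchanged for devices that bypass translation,
 * or 0 on failure).
 */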
2723 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2724                                      size_t size, int dir, u64 dma_mask)
2725 {
2726         struct pci_dev *pdev = to_pci_dev(hwdev);
2727         struct dmar_domain *domain;
2728         phys_addr_t start_paddr;
2729         struct iova *iova;
2730         int prot = 0;
2731         int ret;
2732         struct intel_iommu *iommu;
2733         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2734
2735         BUG_ON(dir == DMA_NONE);
2736
2737         if (iommu_no_mapping(hwdev))
2738                 return paddr;
2739
2740         domain = get_valid_domain_for_dev(pdev);
2741         if (!domain)
2742                 return 0;
2743
2744         iommu = domain_get_iommu(domain);
2745         size = aligned_nrpages(paddr, size);
2746
2747         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2748         if (!iova)
2749                 goto error;
2750
2751         /*
2752          * Check if DMAR supports zero-length reads on write-only
2753          * mappings.
2754          */
2755         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2756                         !cap_zlr(iommu->cap))
2757                 prot |= DMA_PTE_READ;
2758         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2759                 prot |= DMA_PTE_WRITE;
2760         /*
2761          * paddr .. paddr + size might cover only part of a page, but we must
2762          * map whole pages.  Note: if two parts of one page are mapped
2763          * separately, we might get two guest addresses mapping to the same
2764          * host paddr, but this is not a big problem
2765          */
2766         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2767                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2768         if (ret)
2769                 goto error;
2770
2771         /* it's a non-present to present mapping. Only flush if caching mode */
2772         if (cap_caching_mode(iommu->cap))
2773                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2774         else
2775                 iommu_flush_write_buffer(iommu);
2776
2777         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2778         start_paddr += paddr & ~PAGE_MASK;
2779         return start_paddr;
2780
2781 error:
2782         if (iova)
2783                 __free_iova(&domain->iovad, iova);
2784         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2785                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2786         return 0;
2787 }
2788
2789 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2790                                  unsigned long offset, size_t size,
2791                                  enum dma_data_direction dir,
2792                                  struct dma_attrs *attrs)
2793 {
2794         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2795                                   dir, to_pci_dev(dev)->dma_mask);
2796 }
2797
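/*
 * Drain the deferred-unmap lists: in caching mode do a page-selective
 * IOTLB flush per queued entry, otherwise one global flush per IOMMU plus
 * the device-IOTLB flushes, then release the queued IOVAs.  Called with
 * async_umap_flush_lock held.
 */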
2798 static void flush_unmaps(void)
2799 {
2800         int i, j;
2801
2802         timer_on = 0;
2803
2804         /* just flush them all */
2805         for (i = 0; i < g_num_of_iommus; i++) {
2806                 struct intel_iommu *iommu = g_iommus[i];
2807                 if (!iommu)
2808                         continue;
2809
2810                 if (!deferred_flush[i].next)
2811                         continue;
2812
2813                 /* In caching mode, global flushes make emulation expensive */
2814                 if (!cap_caching_mode(iommu->cap))
2815                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2816                                          DMA_TLB_GLOBAL_FLUSH);
2817                 for (j = 0; j < deferred_flush[i].next; j++) {
2818                         unsigned long mask;
2819                         struct iova *iova = deferred_flush[i].iova[j];
2820                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2821
2822                         /* On real hardware multiple invalidations are expensive */
2823                         if (cap_caching_mode(iommu->cap))
2824                                 iommu_flush_iotlb_psi(iommu, domain->id,
2825                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2826                         else {
2827                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2828                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2829                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2830                         }
2831                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2832                 }
2833                 deferred_flush[i].next = 0;
2834         }
2835
2836         list_size = 0;
2837 }
2838
2839 static void flush_unmaps_timeout(unsigned long data)
2840 {
2841         unsigned long flags;
2842
2843         spin_lock_irqsave(&async_umap_flush_lock, flags);
2844         flush_unmaps();
2845         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2846 }
2847
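/*
 * Queue an IOVA range for lazy release on the owning IOMMU's deferred
 * flush list and arm the unmap timer; the list is drained synchronously
 * once HIGH_WATER_MARK entries have accumulated.
 */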
2848 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2849 {
2850         unsigned long flags;
2851         int next, iommu_id;
2852         struct intel_iommu *iommu;
2853
2854         spin_lock_irqsave(&async_umap_flush_lock, flags);
2855         if (list_size == HIGH_WATER_MARK)
2856                 flush_unmaps();
2857
2858         iommu = domain_get_iommu(dom);
2859         iommu_id = iommu->seq_id;
2860
2861         next = deferred_flush[iommu_id].next;
2862         deferred_flush[iommu_id].domain[next] = dom;
2863         deferred_flush[iommu_id].iova[next] = iova;
2864         deferred_flush[iommu_id].next++;
2865
2866         if (!timer_on) {
2867                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2868                 timer_on = 1;
2869         }
2870         list_size++;
2871         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2872 }
2873
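/*
 * dma_map_ops unmap_page callback: clear the PTEs and free the page tables
 * backing the IOVA range, then either flush the IOTLB and free the IOVA
 * immediately (strict mode) or defer both via add_unmap().
 */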
2874 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2875                              size_t size, enum dma_data_direction dir,
2876                              struct dma_attrs *attrs)
2877 {
2878         struct pci_dev *pdev = to_pci_dev(dev);
2879         struct dmar_domain *domain;
2880         unsigned long start_pfn, last_pfn;
2881         struct iova *iova;
2882         struct intel_iommu *iommu;
2883
2884         if (iommu_no_mapping(dev))
2885                 return;
2886
2887         domain = find_domain(pdev);
2888         BUG_ON(!domain);
2889
2890         iommu = domain_get_iommu(domain);
2891
2892         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2893         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2894                       (unsigned long long)dev_addr))
2895                 return;
2896
2897         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2898         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2899
2900         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2901                  pci_name(pdev), start_pfn, last_pfn);
2902
2903         /* clear the PTEs for the whole range */
2904         dma_pte_clear_range(domain, start_pfn, last_pfn);
2905
2906         /* free page tables */
2907         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2908
2909         if (intel_iommu_strict) {
2910                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2911                                       last_pfn - start_pfn + 1, 0);
2912                 /* free iova */
2913                 __free_iova(&domain->iovad, iova);
2914         } else {
2915                 add_unmap(domain, iova);
2916                 /*
2917                  * queue up the release of the unmap to save the roughly 1/6th
2918                  * of the cpu time used up by the iotlb flush operation...
2919                  */
2920         }
2921 }
2922
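/*
 * dma_map_ops alloc_coherent callback: allocate and zero pages, then map
 * them bidirectionally through __intel_map_single() under the device's
 * coherent DMA mask.
 */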
2923 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2924                                   dma_addr_t *dma_handle, gfp_t flags)
2925 {
2926         void *vaddr;
2927         int order;
2928
2929         size = PAGE_ALIGN(size);
2930         order = get_order(size);
2931
2932         if (!iommu_no_mapping(hwdev))
2933                 flags &= ~(GFP_DMA | GFP_DMA32);
2934         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2935                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2936                         flags |= GFP_DMA;
2937                 else
2938                         flags |= GFP_DMA32;
2939         }
2940
2941         vaddr = (void *)__get_free_pages(flags, order);
2942         if (!vaddr)
2943                 return NULL;
2944         memset(vaddr, 0, size);
2945
2946         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2947                                          DMA_BIDIRECTIONAL,
2948                                          hwdev->coherent_dma_mask);
2949         if (*dma_handle)
2950                 return vaddr;
2951         free_pages((unsigned long)vaddr, order);
2952         return NULL;
2953 }
2954
2955 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2956                                 dma_addr_t dma_handle)
2957 {
2958         int order;
2959
2960         size = PAGE_ALIGN(size);
2961         order = get_order(size);
2962
2963         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2964         free_pages((unsigned long)vaddr, order);
2965 }
2966
2967 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2968                            int nelems, enum dma_data_direction dir,
2969                            struct dma_attrs *attrs)
2970 {
2971         struct pci_dev *pdev = to_pci_dev(hwdev);
2972         struct dmar_domain *domain;
2973         unsigned long start_pfn, last_pfn;
2974         struct iova *iova;
2975         struct intel_iommu *iommu;
2976
2977         if (iommu_no_mapping(hwdev))
2978                 return;
2979
2980         domain = find_domain(pdev);
2981         BUG_ON(!domain);
2982
2983         iommu = domain_get_iommu(domain);
2984
2985         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2986         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2987                       (unsigned long long)sglist[0].dma_address))
2988                 return;
2989
2990         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2991         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2992
2993         /*  clear the whole page */
2994         dma_pte_clear_range(domain, start_pfn, last_pfn);
2995
2996         /* free page tables */
2997         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2998
2999         if (intel_iommu_strict) {
3000                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3001                                       last_pfn - start_pfn + 1, 0);
3002                 /* free iova */
3003                 __free_iova(&domain->iovad, iova);
3004         } else {
3005                 add_unmap(domain, iova);
3006                 /*
3007                  * Queue up the release of the mapping; flushing the
3008                  * IOTLB on every unmap costs roughly 1/6th of a CPU.
3009                  */
3010         }
3011 }
3012
3013 static int intel_nontranslate_map_sg(struct device *hwdev,
3014         struct scatterlist *sglist, int nelems, int dir)
3015 {
3016         int i;
3017         struct scatterlist *sg;
3018
3019         for_each_sg(sglist, sg, nelems, i) {
3020                 BUG_ON(!sg_page(sg));
3021                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3022                 sg->dma_length = sg->length;
3023         }
3024         return nelems;
3025 }
3026
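/*
 * Map a scatterlist in one pass: add up the page-aligned length of every
 * entry, allocate a single iova range large enough for all of them and let
 * domain_sg_mapping() fill in the PTEs.  As this is a not-present to
 * present transition, the IOTLB only needs flushing in caching mode;
 * otherwise flushing the write buffer is enough.
 */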
3027 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3028                         enum dma_data_direction dir, struct dma_attrs *attrs)
3029 {
3030         int i;
3031         struct pci_dev *pdev = to_pci_dev(hwdev);
3032         struct dmar_domain *domain;
3033         size_t size = 0;
3034         int prot = 0;
3035         struct iova *iova = NULL;
3036         int ret;
3037         struct scatterlist *sg;
3038         unsigned long start_vpfn;
3039         struct intel_iommu *iommu;
3040
3041         BUG_ON(dir == DMA_NONE);
3042         if (iommu_no_mapping(hwdev))
3043                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3044
3045         domain = get_valid_domain_for_dev(pdev);
3046         if (!domain)
3047                 return 0;
3048
3049         iommu = domain_get_iommu(domain);
3050
3051         for_each_sg(sglist, sg, nelems, i)
3052                 size += aligned_nrpages(sg->offset, sg->length);
3053
3054         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3055                                 pdev->dma_mask);
3056         if (!iova) {
3057                 sglist->dma_length = 0;
3058                 return 0;
3059         }
3060
3061         /*
3062          * Check if DMAR supports zero-length reads on write-only
3063          * mappings.
3064          */
3065         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3066                         !cap_zlr(iommu->cap))
3067                 prot |= DMA_PTE_READ;
3068         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3069                 prot |= DMA_PTE_WRITE;
3070
3071         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3072
3073         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3074         if (unlikely(ret)) {
3075                 /*  clear the page */
3076                 dma_pte_clear_range(domain, start_vpfn,
3077                                     start_vpfn + size - 1);
3078                 /* free page tables */
3079                 dma_pte_free_pagetable(domain, start_vpfn,
3080                                        start_vpfn + size - 1);
3081                 /* free iova */
3082                 __free_iova(&domain->iovad, iova);
3083                 return 0;
3084         }
3085
3086         /* it's a non-present to present mapping. Only flush if caching mode */
3087         if (cap_caching_mode(iommu->cap))
3088                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3089         else
3090                 iommu_flush_write_buffer(iommu);
3091
3092         return nelems;
3093 }
3094
3095 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3096 {
3097         return !dma_addr;
3098 }
3099
3100 struct dma_map_ops intel_dma_ops = {
3101         .alloc_coherent = intel_alloc_coherent,
3102         .free_coherent = intel_free_coherent,
3103         .map_sg = intel_map_sg,
3104         .unmap_sg = intel_unmap_sg,
3105         .map_page = intel_map_page,
3106         .unmap_page = intel_unmap_page,
3107         .mapping_error = intel_mapping_error,
3108 };
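
/*
 * Once intel_iommu_init() points dma_ops at intel_dma_ops, drivers reach
 * the callbacks above through the generic DMA API rather than by calling
 * them directly.  A minimal sketch of that path (hypothetical driver code,
 * not part of this file; 'pdev', 'buf' and 'len' are assumed to exist):
 *
 *	dma_addr_t handle;
 *
 *	handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
 *	if (dma_mapping_error(&pdev->dev, handle))	// -> intel_mapping_error()
 *		return -ENOMEM;
 *	// ... program the device with 'handle', wait for the DMA to finish ...
 *	dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
 *
 * dma_map_single()/dma_unmap_single() resolve to .map_page/.unmap_page,
 * dma_map_sg()/dma_unmap_sg() to .map_sg/.unmap_sg, and
 * dma_alloc_coherent()/dma_free_coherent() to .alloc_coherent/.free_coherent.
 */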
3109
3110 static inline int iommu_domain_cache_init(void)
3111 {
3112         int ret = 0;
3113
3114         iommu_domain_cache = kmem_cache_create("iommu_domain",
3115                                          sizeof(struct dmar_domain),
3116                                          0,
3117                                          SLAB_HWCACHE_ALIGN,
3119                                          NULL);
3120         if (!iommu_domain_cache) {
3121                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3122                 ret = -ENOMEM;
3123         }
3124
3125         return ret;
3126 }
3127
3128 static inline int iommu_devinfo_cache_init(void)
3129 {
3130         int ret = 0;
3131
3132         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3133                                          sizeof(struct device_domain_info),
3134                                          0,
3135                                          SLAB_HWCACHE_ALIGN,
3136                                          NULL);
3137         if (!iommu_devinfo_cache) {
3138                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3139                 ret = -ENOMEM;
3140         }
3141
3142         return ret;
3143 }
3144
3145 static inline int iommu_iova_cache_init(void)
3146 {
3147         int ret = 0;
3148
3149         iommu_iova_cache = kmem_cache_create("iommu_iova",
3150                                          sizeof(struct iova),
3151                                          0,
3152                                          SLAB_HWCACHE_ALIGN,
3153                                          NULL);
3154         if (!iommu_iova_cache) {
3155                 printk(KERN_ERR "Couldn't create iova cache\n");
3156                 ret = -ENOMEM;
3157         }
3158
3159         return ret;
3160 }
3161
3162 static int __init iommu_init_mempool(void)
3163 {
3164         int ret;
3165         ret = iommu_iova_cache_init();
3166         if (ret)
3167                 return ret;
3168
3169         ret = iommu_domain_cache_init();
3170         if (ret)
3171                 goto domain_error;
3172
3173         ret = iommu_devinfo_cache_init();
3174         if (!ret)
3175                 return ret;
3176
3177         kmem_cache_destroy(iommu_domain_cache);
3178 domain_error:
3179         kmem_cache_destroy(iommu_iova_cache);
3180
3181         return -ENOMEM;
3182 }
3183
3184 static void __init iommu_exit_mempool(void)
3185 {
3186         kmem_cache_destroy(iommu_devinfo_cache);
3187         kmem_cache_destroy(iommu_domain_cache);
3188         kmem_cache_destroy(iommu_iova_cache);
3190 }
3191
3192 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3193 {
3194         struct dmar_drhd_unit *drhd;
3195         u32 vtbar;
3196         int rc;
3197
3198         /* We know that this device on this chipset has its own IOMMU.
3199          * If we find it under a different IOMMU, then the BIOS is lying
3200          * to us. Hope that the IOMMU for this device is actually
3201          * disabled, and it needs no translation...
3202          */
3203         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3204         if (rc) {
3205                 /* "can't" happen */
3206                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3207                 return;
3208         }
3209         vtbar &= 0xffff0000;
3210
3211         /* we know that this iommu should be at offset 0xa000 from vtbar */
3212         drhd = dmar_find_matched_drhd_unit(pdev);
3213         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3214                             TAINT_FIRMWARE_WORKAROUND,
3215                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3216                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3217 }
3218 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3219
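/*
 * Mark DMAR units that need no handling: a unit whose device scope matched
 * no PCI devices at all is simply ignored, and a unit that covers *only*
 * graphics devices is either noted via intel_iommu_gfx_mapped or, when
 * dmar_map_gfx is clear, ignored with its devices marked to bypass
 * translation (DUMMY_DEVICE_DOMAIN_INFO).
 */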
3220 static void __init init_no_remapping_devices(void)
3221 {
3222         struct dmar_drhd_unit *drhd;
3223
3224         for_each_drhd_unit(drhd) {
3225                 if (!drhd->include_all) {
3226                         int i;
3227                         for (i = 0; i < drhd->devices_cnt; i++)
3228                                 if (drhd->devices[i] != NULL)
3229                                         break;
3230                         /* ignore DMAR unit if no pci devices exist */
3231                         if (i == drhd->devices_cnt)
3232                                 drhd->ignored = 1;
3233                 }
3234         }
3235
3236         for_each_drhd_unit(drhd) {
3237                 int i;
3238                 if (drhd->ignored || drhd->include_all)
3239                         continue;
3240
3241                 for (i = 0; i < drhd->devices_cnt; i++)
3242                         if (drhd->devices[i] &&
3243                             !IS_GFX_DEVICE(drhd->devices[i]))
3244                                 break;
3245
3246                 if (i < drhd->devices_cnt)
3247                         continue;
3248
3249                 /* This IOMMU has *only* gfx devices. Either bypass it or
3250                    set the gfx_mapped flag, as appropriate */
3251                 if (dmar_map_gfx) {
3252                         intel_iommu_gfx_mapped = 1;
3253                 } else {
3254                         drhd->ignored = 1;
3255                         for (i = 0; i < drhd->devices_cnt; i++) {
3256                                 if (!drhd->devices[i])
3257                                         continue;
3258                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3259                         }
3260                 }
3261         }
3262 }
3263
3264 #ifdef CONFIG_SUSPEND
3265 static int init_iommu_hw(void)
3266 {
3267         struct dmar_drhd_unit *drhd;
3268         struct intel_iommu *iommu = NULL;
3269
3270         for_each_active_iommu(iommu, drhd)
3271                 if (iommu->qi)
3272                         dmar_reenable_qi(iommu);
3273
3274         for_each_iommu(iommu, drhd) {
3275                 if (drhd->ignored) {
3276                         /*
3277                          * we always have to disable PMRs or DMA may fail on
3278                          * this device
3279                          */
3280                         if (force_on)
3281                                 iommu_disable_protect_mem_regions(iommu);
3282                         continue;
3283                 }
3284
3285                 iommu_flush_write_buffer(iommu);
3286
3287                 iommu_set_root_entry(iommu);
3288
3289                 iommu->flush.flush_context(iommu, 0, 0, 0,
3290                                            DMA_CCMD_GLOBAL_INVL);
3291                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3292                                          DMA_TLB_GLOBAL_FLUSH);
3293                 if (iommu_enable_translation(iommu))
3294                         return 1;
3295                 iommu_disable_protect_mem_regions(iommu);
3296         }
3297
3298         return 0;
3299 }
3300
3301 static void iommu_flush_all(void)
3302 {
3303         struct dmar_drhd_unit *drhd;
3304         struct intel_iommu *iommu;
3305
3306         for_each_active_iommu(iommu, drhd) {
3307                 iommu->flush.flush_context(iommu, 0, 0, 0,
3308                                            DMA_CCMD_GLOBAL_INVL);
3309                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3310                                          DMA_TLB_GLOBAL_FLUSH);
3311         }
3312 }
3313
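/*
 * Save the fault-event control, data and address registers of every active
 * IOMMU and disable translation before the system sleeps; iommu_resume()
 * re-enables translation via init_iommu_hw() and then restores those
 * registers.
 */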
3314 static int iommu_suspend(void)
3315 {
3316         struct dmar_drhd_unit *drhd;
3317         struct intel_iommu *iommu = NULL;
3318         unsigned long flag;
3319
3320         for_each_active_iommu(iommu, drhd) {
3321                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3322                                                  GFP_ATOMIC);
3323                 if (!iommu->iommu_state)
3324                         goto nomem;
3325         }
3326
3327         iommu_flush_all();
3328
3329         for_each_active_iommu(iommu, drhd) {
3330                 iommu_disable_translation(iommu);
3331
3332                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3333
3334                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3335                         readl(iommu->reg + DMAR_FECTL_REG);
3336                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3337                         readl(iommu->reg + DMAR_FEDATA_REG);
3338                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3339                         readl(iommu->reg + DMAR_FEADDR_REG);
3340                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3341                         readl(iommu->reg + DMAR_FEUADDR_REG);
3342
3343                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3344         }
3345         return 0;
3346
3347 nomem:
3348         for_each_active_iommu(iommu, drhd)
3349                 kfree(iommu->iommu_state);
3350
3351         return -ENOMEM;
3352 }
3353
3354 static void iommu_resume(void)
3355 {
3356         struct dmar_drhd_unit *drhd;
3357         struct intel_iommu *iommu = NULL;
3358         unsigned long flag;
3359
3360         if (init_iommu_hw()) {
3361                 if (force_on)
3362                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3363                 else
3364                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3365                 return;
3366         }
3367
3368         for_each_active_iommu(iommu, drhd) {
3370                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3371
3372                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3373                         iommu->reg + DMAR_FECTL_REG);
3374                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3375                         iommu->reg + DMAR_FEDATA_REG);
3376                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3377                         iommu->reg + DMAR_FEADDR_REG);
3378                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3379                         iommu->reg + DMAR_FEUADDR_REG);
3380
3381                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3382         }
3383
3384         for_each_active_iommu(iommu, drhd)
3385                 kfree(iommu->iommu_state);
3386 }
3387
3388 static struct syscore_ops iommu_syscore_ops = {
3389         .resume         = iommu_resume,
3390         .suspend        = iommu_suspend,
3391 };
3392
3393 static void __init init_iommu_pm_ops(void)
3394 {
3395         register_syscore_ops(&iommu_syscore_ops);
3396 }
3397
3398 #else
3399 static inline void init_iommu_pm_ops(void) {}
3400 #endif  /* CONFIG_SUSPEND */
3401
3402 LIST_HEAD(dmar_rmrr_units);
3403
3404 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3405 {
3406         list_add(&rmrr->list, &dmar_rmrr_units);
3407 }
3408
3410 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3411 {
3412         struct acpi_dmar_reserved_memory *rmrr;
3413         struct dmar_rmrr_unit *rmrru;
3414
3415         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3416         if (!rmrru)
3417                 return -ENOMEM;
3418
3419         rmrru->hdr = header;
3420         rmrr = (struct acpi_dmar_reserved_memory *)header;
3421         rmrru->base_address = rmrr->base_address;
3422         rmrru->end_address = rmrr->end_address;
3423
3424         dmar_register_rmrr_unit(rmrru);
3425         return 0;
3426 }
3427
3428 static int __init
3429 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3430 {
3431         struct acpi_dmar_reserved_memory *rmrr;
3432         int ret;
3433
3434         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3435         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3436                 ((void *)rmrr) + rmrr->header.length,
3437                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3438
3439         if (ret || (rmrru->devices_cnt == 0)) {
3440                 list_del(&rmrru->list);
3441                 kfree(rmrru);
3442         }
3443         return ret;
3444 }
3445
3446 static LIST_HEAD(dmar_atsr_units);
3447
3448 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3449 {
3450         struct acpi_dmar_atsr *atsr;
3451         struct dmar_atsr_unit *atsru;
3452
3453         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3454         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3455         if (!atsru)
3456                 return -ENOMEM;
3457
3458         atsru->hdr = hdr;
3459         atsru->include_all = atsr->flags & 0x1;
3460
3461         list_add(&atsru->list, &dmar_atsr_units);
3462
3463         return 0;
3464 }
3465
3466 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3467 {
3468         int rc;
3469         struct acpi_dmar_atsr *atsr;
3470
3471         if (atsru->include_all)
3472                 return 0;
3473
3474         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3475         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3476                                 (void *)atsr + atsr->header.length,
3477                                 &atsru->devices_cnt, &atsru->devices,
3478                                 atsr->segment);
3479         if (rc || !atsru->devices_cnt) {
3480                 list_del(&atsru->list);
3481                 kfree(atsru);
3482         }
3483
3484         return rc;
3485 }
3486
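/*
 * Return non-zero if the device (or, for an SR-IOV virtual function, its
 * physical function) falls under an ATSR unit on its PCI segment: either
 * the root port above it appears in the ATSR device scope, or the ATSR
 * covers all ports (include_all).  This is consulted when deciding whether
 * ATS (the device IOTLB) may be used for the device.
 */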
3487 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3488 {
3489         int i;
3490         struct pci_bus *bus;
3491         struct acpi_dmar_atsr *atsr;
3492         struct dmar_atsr_unit *atsru;
3493
3494         dev = pci_physfn(dev);
3495
3496         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3497                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3498                 if (atsr->segment == pci_domain_nr(dev->bus))
3499                         goto found;
3500         }
3501
3502         return 0;
3503
3504 found:
3505         for (bus = dev->bus; bus; bus = bus->parent) {
3506                 struct pci_dev *bridge = bus->self;
3507
3508                 if (!bridge || !pci_is_pcie(bridge) ||
3509                     bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3510                         return 0;
3511
3512                 if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3513                         for (i = 0; i < atsru->devices_cnt; i++)
3514                                 if (atsru->devices[i] == bridge)
3515                                         return 1;
3516                         break;
3517                 }
3518         }
3519
3520         if (atsru->include_all)
3521                 return 1;
3522
3523         return 0;
3524 }
3525
3526 int dmar_parse_rmrr_atsr_dev(void)
3527 {
3528         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3529         struct dmar_atsr_unit *atsr, *atsr_n;
3530         int ret = 0;
3531
3532         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3533                 ret = rmrr_parse_dev(rmrr);
3534                 if (ret)
3535                         return ret;
3536         }
3537
3538         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3539                 ret = atsr_parse_dev(atsr);
3540                 if (ret)
3541                         return ret;
3542         }
3543
3544         return ret;
3545 }
3546
3547 /*
3548  * Here we only respond to the driver-unbind action for a device.
3549  *
3550  * A newly added device is not attached to its DMAR domain here yet; that
3551  * happens when the device is first mapped to an iova.
3552  */
3553 static int device_notifier(struct notifier_block *nb,
3554                                   unsigned long action, void *data)
3555 {
3556         struct device *dev = data;
3557         struct pci_dev *pdev = to_pci_dev(dev);
3558         struct dmar_domain *domain;
3559
3560         if (iommu_no_mapping(dev))
3561                 return 0;
3562
3563         domain = find_domain(pdev);
3564         if (!domain)
3565                 return 0;
3566
3567         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3568                 domain_remove_one_dev_info(domain, pdev);
3569
3570                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3571                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3572                     list_empty(&domain->devices))
3573                         domain_exit(domain);
3574         }
3575
3576         return 0;
3577 }
3578
3579 static struct notifier_block device_nb = {
3580         .notifier_call = device_notifier,
3581 };
3582
3583 int __init intel_iommu_init(void)
3584 {
3585         int ret = 0;
3586
3587         /* VT-d is required for a TXT/tboot launch, so enforce that */
3588         force_on = tboot_force_iommu();
3589
3590         if (dmar_table_init()) {
3591                 if (force_on)
3592                         panic("tboot: Failed to initialize DMAR table\n");
3593                 return  -ENODEV;
3594         }
3595
3596         if (dmar_dev_scope_init() < 0) {
3597                 if (force_on)
3598                         panic("tboot: Failed to initialize DMAR device scope\n");
3599                 return  -ENODEV;
3600         }
3601
3602         if (no_iommu || dmar_disabled)
3603                 return -ENODEV;
3604
3605         if (iommu_init_mempool()) {
3606                 if (force_on)
3607                         panic("tboot: Failed to initialize iommu memory\n");
3608                 return  -ENODEV;
3609         }
3610
3611         if (list_empty(&dmar_rmrr_units))
3612                 printk(KERN_INFO "DMAR: No RMRR found\n");
3613
3614         if (list_empty(&dmar_atsr_units))
3615                 printk(KERN_INFO "DMAR: No ATSR found\n");
3616
3617         if (dmar_init_reserved_ranges()) {
3618                 if (force_on)
3619                         panic("tboot: Failed to reserve iommu ranges\n");
3620                 return  -ENODEV;
3621         }
3622
3623         init_no_remapping_devices();
3624
3625         ret = init_dmars();
3626         if (ret) {
3627                 if (force_on)
3628                         panic("tboot: Failed to initialize DMARs\n");
3629                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3630                 put_iova_domain(&reserved_iova_list);
3631                 iommu_exit_mempool();
3632                 return ret;
3633         }
3634         printk(KERN_INFO
3635         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3636
3637         init_timer(&unmap_timer);
3638 #ifdef CONFIG_SWIOTLB
3639         swiotlb = 0;
3640 #endif
3641         dma_ops = &intel_dma_ops;
3642
3643         init_iommu_pm_ops();
3644
3645         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3646
3647         bus_register_notifier(&pci_bus_type, &device_nb);
3648
3649         return 0;
3650 }
3651
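/*
 * When the device sits behind PCI(e) bridges, the IOMMU may see its DMA
 * with the requester ID of a bridge above it rather than the device's own,
 * so the context entries programmed along the bridge chain must be torn
 * down together with the device's own entry.
 */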
3652 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3653                                            struct pci_dev *pdev)
3654 {
3655         struct pci_dev *tmp, *parent;
3656
3657         if (!iommu || !pdev)
3658                 return;
3659
3660         /* dependent device detach */
3661         tmp = pci_find_upstream_pcie_bridge(pdev);
3662         /* Secondary interface's bus number and devfn 0 */
3663         if (tmp) {
3664                 parent = pdev->bus->self;
3665                 while (parent != tmp) {
3666                         iommu_detach_dev(iommu, parent->bus->number,
3667                                          parent->devfn);
3668                         parent = parent->bus->self;
3669                 }
3670                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3671                         iommu_detach_dev(iommu,
3672                                 tmp->subordinate->number, 0);
3673                 else /* this is a legacy PCI bridge */
3674                         iommu_detach_dev(iommu, tmp->bus->number,
3675                                          tmp->devfn);
3676         }
3677 }
3678
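/*
 * Detach one device from the domain: drop its device_domain_info, clear its
 * context entry (and those of any bridges it depends on) and, if no other
 * device on the same IOMMU still belongs to the domain, drop that IOMMU
 * from the domain's bookkeeping, releasing the domain id on it for ordinary
 * DMA domains.
 */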
3679 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3680                                           struct pci_dev *pdev)
3681 {
3682         struct device_domain_info *info;
3683         struct intel_iommu *iommu;
3684         unsigned long flags;
3685         int found = 0;
3686         struct list_head *entry, *tmp;
3687
3688         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3689                                 pdev->devfn);
3690         if (!iommu)
3691                 return;
3692
3693         spin_lock_irqsave(&device_domain_lock, flags);
3694         list_for_each_safe(entry, tmp, &domain->devices) {
3695                 info = list_entry(entry, struct device_domain_info, link);
3696                 if (info->segment == pci_domain_nr(pdev->bus) &&
3697                     info->bus == pdev->bus->number &&
3698                     info->devfn == pdev->devfn) {
3699                         list_del(&info->link);
3700                         list_del(&info->global);
3701                         if (info->dev)
3702                                 info->dev->dev.archdata.iommu = NULL;
3703                         spin_unlock_irqrestore(&device_domain_lock, flags);
3704
3705                         iommu_disable_dev_iotlb(info);
3706                         iommu_detach_dev(iommu, info->bus, info->devfn);
3707                         iommu_detach_dependent_devices(iommu, pdev);
3708                         free_devinfo_mem(info);
3709
3710                         spin_lock_irqsave(&device_domain_lock, flags);
3711
3712                         if (found)
3713                                 break;
3714                         else
3715                                 continue;
3716                 }
3717
3718                 /* if there are no other devices under the same iommu
3719                  * owned by this domain, clear this iommu in iommu_bmp and
3720                  * update the iommu count and coherency
3721                  */
3722                 if (iommu == device_to_iommu(info->segment, info->bus,
3723                                             info->devfn))
3724                         found = 1;
3725         }
3726
3727         spin_unlock_irqrestore(&device_domain_lock, flags);
3728
3729         if (found == 0) {
3730                 unsigned long tmp_flags;
3731                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3732                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3733                 domain->iommu_count--;
3734                 domain_update_iommu_cap(domain);
3735                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3736
3737                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3738                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3739                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3740                         clear_bit(domain->id, iommu->domain_ids);
3741                         iommu->domains[domain->id] = NULL;
3742                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3743                 }
3744         }
3745 }
3746
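/*
 * Detach every device from a virtual-machine domain, clearing each context
 * entry and pruning the domain's per-IOMMU bookkeeping as the last device
 * on each IOMMU goes away.
 */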
3747 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3748 {
3749         struct device_domain_info *info;
3750         struct intel_iommu *iommu;
3751         unsigned long flags1, flags2;
3752
3753         spin_lock_irqsave(&device_domain_lock, flags1);
3754         while (!list_empty(&domain->devices)) {
3755                 info = list_entry(domain->devices.next,
3756                         struct device_domain_info, link);
3757                 list_del(&info->link);
3758                 list_del(&info->global);
3759                 if (info->dev)
3760                         info->dev->dev.archdata.iommu = NULL;
3761
3762                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3763
3764                 iommu_disable_dev_iotlb(info);
3765                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3766                 iommu_detach_dev(iommu, info->bus, info->devfn);
3767                 iommu_detach_dependent_devices(iommu, info->dev);
3768
3769                 /* clear this iommu in iommu_bmp, update iommu count
3770                  * and capabilities
3771                  */
3772                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3773                 if (test_and_clear_bit(iommu->seq_id,
3774                                        &domain->iommu_bmp)) {
3775                         domain->iommu_count--;
3776                         domain_update_iommu_cap(domain);
3777                 }
3778                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3779
3780                 free_devinfo_mem(info);
3781                 spin_lock_irqsave(&device_domain_lock, flags1);
3782         }
3783         spin_unlock_irqrestore(&device_domain_lock, flags1);
3784 }
3785
3786 /* domain id for virtual machine; it won't be set in a context entry */
3787 static unsigned long vm_domid;
3788
3789 static struct dmar_domain *iommu_alloc_vm_domain(void)
3790 {
3791         struct dmar_domain *domain;
3792
3793         domain = alloc_domain_mem();
3794         if (!domain)
3795                 return NULL;
3796
3797         domain->id = vm_domid++;
3798         domain->nid = -1;
3799         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3800         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3801
3802         return domain;
3803 }
3804
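/*
 * Initialize a domain created through the IOMMU API: set up its iova
 * allocator and reserved ranges, derive the AGAW from the requested guest
 * address width and allocate the top-level page directory.
 */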
3805 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3806 {
3807         int adjust_width;
3808
3809         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3810         spin_lock_init(&domain->iommu_lock);
3811
3812         domain_reserve_special_ranges(domain);
3813
3814         /* calculate AGAW */
3815         domain->gaw = guest_width;
3816         adjust_width = guestwidth_to_adjustwidth(guest_width);
3817         domain->agaw = width_to_agaw(adjust_width);
3818
3819         INIT_LIST_HEAD(&domain->devices);
3820
3821         domain->iommu_count = 0;
3822         domain->iommu_coherency = 0;
3823         domain->iommu_snooping = 0;
3824         domain->iommu_superpage = 0;
3825         domain->max_addr = 0;
3826         domain->nid = -1;
3827
3828         /* always allocate the top pgd */
3829         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3830         if (!domain->pgd)
3831                 return -ENOMEM;
3832         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3833         return 0;
3834 }
3835
3836 static void iommu_free_vm_domain(struct dmar_domain *domain)
3837 {
3838         unsigned long flags;
3839         struct dmar_drhd_unit *drhd;
3840         struct intel_iommu *iommu;
3841         unsigned long i;
3842         unsigned long ndomains;
3843
3844         for_each_drhd_unit(drhd) {
3845                 if (drhd->ignored)
3846                         continue;
3847                 iommu = drhd->iommu;
3848
3849                 ndomains = cap_ndoms(iommu->cap);
3850                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3851                         if (iommu->domains[i] == domain) {
3852                                 spin_lock_irqsave(&iommu->lock, flags);
3853                                 clear_bit(i, iommu->domain_ids);
3854                                 iommu->domains[i] = NULL;
3855                                 spin_unlock_irqrestore(&iommu->lock, flags);
3856                                 break;
3857                         }
3858                 }
3859         }
3860 }
3861
3862 static void vm_domain_exit(struct dmar_domain *domain)
3863 {
3864         /* Domain 0 is reserved, so don't process it */
3865         if (!domain)
3866                 return;
3867
3868         vm_domain_remove_all_dev_info(domain);
3869         /* destroy iovas */
3870         put_iova_domain(&domain->iovad);
3871
3872         /* clear ptes */
3873         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3874
3875         /* free page tables */
3876         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3877
3878         iommu_free_vm_domain(domain);
3879         free_domain_mem(domain);
3880 }
3881
3882 static int intel_iommu_domain_init(struct iommu_domain *domain)
3883 {
3884         struct dmar_domain *dmar_domain;
3885
3886         dmar_domain = iommu_alloc_vm_domain();
3887         if (!dmar_domain) {
3888                 printk(KERN_ERR
3889                         "intel_iommu_domain_init: failed to allocate dmar_domain\n");
3890                 return -ENOMEM;
3891         }
3892         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3893                 printk(KERN_ERR
3894                         "intel_iommu_domain_init() failed\n");
3895                 vm_domain_exit(dmar_domain);
3896                 return -ENOMEM;
3897         }
3898         domain_update_iommu_cap(dmar_domain);
3899         domain->priv = dmar_domain;
3900
3901         return 0;
3902 }
3903
3904 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3905 {
3906         struct dmar_domain *dmar_domain = domain->priv;
3907
3908         domain->priv = NULL;
3909         vm_domain_exit(dmar_domain);
3910 }
3911
3912 static int intel_iommu_attach_device(struct iommu_domain *domain,
3913                                      struct device *dev)
3914 {
3915         struct dmar_domain *dmar_domain = domain->priv;
3916         struct pci_dev *pdev = to_pci_dev(dev);
3917         struct intel_iommu *iommu;
3918         int addr_width;
3919
3920         /* normally pdev is not mapped */
3921         if (unlikely(domain_context_mapped(pdev))) {
3922                 struct dmar_domain *old_domain;
3923
3924                 old_domain = find_domain(pdev);
3925                 if (old_domain) {
3926                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3927                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3928                                 domain_remove_one_dev_info(old_domain, pdev);
3929                         else
3930                                 domain_remove_dev_info(old_domain);
3931                 }
3932         }
3933
3934         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3935                                 pdev->devfn);
3936         if (!iommu)
3937                 return -ENODEV;
3938
3939         /* check if this iommu agaw is sufficient for max mapped address */
3940         addr_width = agaw_to_width(iommu->agaw);
3941         if (addr_width > cap_mgaw(iommu->cap))
3942                 addr_width = cap_mgaw(iommu->cap);
3943
3944         if (dmar_domain->max_addr > (1LL << addr_width)) {
3945                 printk(KERN_ERR "%s: iommu width (%d) is not "
3946                        "sufficient for the mapped address (%llx)\n",
3947                        __func__, addr_width, dmar_domain->max_addr);
3948                 return -EFAULT;
3949         }
3950         dmar_domain->gaw = addr_width;
3951
3952         /*
3953          * Knock out extra levels of page tables if necessary
3954          */
3955         while (iommu->agaw < dmar_domain->agaw) {
3956                 struct dma_pte *pte;
3957
3958                 pte = dmar_domain->pgd;
3959                 if (dma_pte_present(pte)) {
3960                         dmar_domain->pgd = (struct dma_pte *)
3961                                 phys_to_virt(dma_pte_addr(pte));
3962                         free_pgtable_page(pte);
3963                 }
3964                 dmar_domain->agaw--;
3965         }
3966
3967         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3968 }
3969
3970 static void intel_iommu_detach_device(struct iommu_domain *domain,
3971                                       struct device *dev)
3972 {
3973         struct dmar_domain *dmar_domain = domain->priv;
3974         struct pci_dev *pdev = to_pci_dev(dev);
3975
3976         domain_remove_one_dev_info(dmar_domain, pdev);
3977 }
3978
3979 static int intel_iommu_map(struct iommu_domain *domain,
3980                            unsigned long iova, phys_addr_t hpa,
3981                            int gfp_order, int iommu_prot)
3982 {
3983         struct dmar_domain *dmar_domain = domain->priv;
3984         u64 max_addr;
3985         int prot = 0;
3986         size_t size;
3987         int ret;
3988
3989         if (iommu_prot & IOMMU_READ)
3990                 prot |= DMA_PTE_READ;
3991         if (iommu_prot & IOMMU_WRITE)
3992                 prot |= DMA_PTE_WRITE;
3993         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3994                 prot |= DMA_PTE_SNP;
3995
3996         size     = PAGE_SIZE << gfp_order;
3997         max_addr = iova + size;
3998         if (dmar_domain->max_addr < max_addr) {
3999                 u64 end;
4000
4001                 /* check if minimum agaw is sufficient for mapped address */
4002                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4003                 if (end < max_addr) {
4004                         printk(KERN_ERR "%s: iommu width (%d) is not "
4005                                "sufficient for the mapped address (%llx)\n",
4006                                __func__, dmar_domain->gaw, max_addr);
4007                         return -EFAULT;
4008                 }
4009                 dmar_domain->max_addr = max_addr;
4010         }
4011         /* Round up size to next multiple of PAGE_SIZE, if it and
4012            the low bits of hpa would take us onto the next page */
4013         size = aligned_nrpages(hpa, size);
4014         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4015                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4016         return ret;
4017 }
4018
4019 static int intel_iommu_unmap(struct iommu_domain *domain,
4020                              unsigned long iova, int gfp_order)
4021 {
4022         struct dmar_domain *dmar_domain = domain->priv;
4023         size_t size = PAGE_SIZE << gfp_order;
4024         int order;
4025
4026         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4027                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4028
4029         if (dmar_domain->max_addr == iova + size)
4030                 dmar_domain->max_addr = iova;
4031
4032         return order;
4033 }
4034
4035 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4036                                             unsigned long iova)
4037 {
4038         struct dmar_domain *dmar_domain = domain->priv;
4039         struct dma_pte *pte;
4040         u64 phys = 0;
4041
4042         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4043         if (pte)
4044                 phys = dma_pte_addr(pte);
4045
4046         return phys;
4047 }
4048
4049 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4050                                       unsigned long cap)
4051 {
4052         struct dmar_domain *dmar_domain = domain->priv;
4053
4054         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4055                 return dmar_domain->iommu_snooping;
4056         if (cap == IOMMU_CAP_INTR_REMAP)
4057                 return intr_remapping_enabled;
4058
4059         return 0;
4060 }
4061
4062 static struct iommu_ops intel_iommu_ops = {
4063         .domain_init    = intel_iommu_domain_init,
4064         .domain_destroy = intel_iommu_domain_destroy,
4065         .attach_dev     = intel_iommu_attach_device,
4066         .detach_dev     = intel_iommu_detach_device,
4067         .map            = intel_iommu_map,
4068         .unmap          = intel_iommu_unmap,
4069         .iova_to_phys   = intel_iommu_iova_to_phys,
4070         .domain_has_cap = intel_iommu_domain_has_cap,
4071 };
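
/*
 * These callbacks are reached through the generic IOMMU API once
 * bus_set_iommu(&pci_bus_type, &intel_iommu_ops) has run in
 * intel_iommu_init().  A minimal sketch of a consumer such as device
 * assignment (hypothetical code, not part of this file; 'dev', 'iova' and
 * 'paddr' are assumed to exist):
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *	if (!dom)
 *		return -ENOMEM;
 *	if (iommu_attach_device(dom, dev)) {	// -> intel_iommu_attach_device()
 *		iommu_domain_free(dom);		// -> intel_iommu_domain_destroy()
 *		return -ENODEV;
 *	}
 *	// gfp_order 0 == one 4KiB page; order n == 2^n contiguous pages
 *	if (iommu_map(dom, iova, paddr, 0, IOMMU_READ | IOMMU_WRITE))
 *		goto fail;	// unmap/detach/free on error (labels omitted)
 *
 * iommu_unmap() and iommu_iova_to_phys() reach .unmap and .iova_to_phys the
 * same way, and iommu_domain_free() ends up in intel_iommu_domain_destroy().
 */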
4072
4073 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4074 {
4075         /*
4076          * Mobile 4 Series Chipset neglects to set RWBF capability,
4077          * but needs it:
4078          */
4079         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4080         rwbf_quirk = 1;
4081
4082         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4083         if (dev->revision == 0x07) {
4084                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4085                 dmar_map_gfx = 0;
4086         }
4087 }
4088
4089 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4090
4091 #define GGC 0x52
4092 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4093 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4094 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4095 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4096 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4097 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4098 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4099 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4100
4101 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4102 {
4103         unsigned short ggc;
4104
4105         if (pci_read_config_word(dev, GGC, &ggc))
4106                 return;
4107
4108         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4109                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4110                 dmar_map_gfx = 0;
4111         } else if (dmar_map_gfx) {
4112                 /* we have to ensure the gfx device is idle before we flush */
4113                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4114                 intel_iommu_strict = 1;
4115         }
4116 }
4117 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4118 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4119 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4120 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4121
4122 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4123    ISOCH DMAR unit for the Azalia sound device, but not give it any
4124    TLB entries, which causes it to deadlock. Check for that.  We do
4125    this in a function called from init_dmars(), instead of in a PCI
4126    quirk, because we don't want to print the obnoxious "BIOS broken"
4127    message if VT-d is actually disabled.
4128 */
4129 static void __init check_tylersburg_isoch(void)
4130 {
4131         struct pci_dev *pdev;
4132         uint32_t vtisochctrl;
4133
4134         /* If there's no Azalia in the system anyway, forget it. */
4135         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4136         if (!pdev)
4137                 return;
4138         pci_dev_put(pdev);
4139
4140         /* System Management Registers. Might be hidden, in which case
4141            we can't do the sanity check. But that's OK, because the
4142            known-broken BIOSes _don't_ actually hide it, so far. */
4143         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4144         if (!pdev)
4145                 return;
4146
4147         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4148                 pci_dev_put(pdev);
4149                 return;
4150         }
4151
4152         pci_dev_put(pdev);
4153
4154         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4155         if (vtisochctrl & 1)
4156                 return;
4157
4158         /* Drop all bits other than the number of TLB entries */
4159         vtisochctrl &= 0x1c;
4160
4161         /* If we have the recommended number of TLB entries (16), fine. */
4162         if (vtisochctrl == 0x10)
4163                 return;
4164
4165         /* Zero TLB entries? You get to ride the short bus to school. */
4166         if (!vtisochctrl) {
4167                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4168                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4169                      dmi_get_system_info(DMI_BIOS_VENDOR),
4170                      dmi_get_system_info(DMI_BIOS_VERSION),
4171                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4172                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4173                 return;
4174         }
4175
4176         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4177                vtisochctrl);
4178 }