[pandora-kernel.git] drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/syscore_ops.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
45
46 #define ROOT_SIZE               VTD_PAGE_SIZE
47 #define CONTEXT_SIZE            VTD_PAGE_SIZE
48
49 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
50 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
51 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
52
53 #define IOAPIC_RANGE_START      (0xfee00000)
54 #define IOAPIC_RANGE_END        (0xfeefffff)
55 #define IOVA_START_ADDR         (0x1000)
56
57 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
58
59 #define MAX_AGAW_WIDTH 64
60
61 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
62 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
63
64 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
65    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
66 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
67                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
68 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
69
70 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
71 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
72 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
73
74 /* page table handling */
75 #define LEVEL_STRIDE            (9)
76 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
77
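/*
 * AGAW ("adjusted guest address width") encodes the page-table depth:
 * each level resolves LEVEL_STRIDE (9) bits, and agaw 0 corresponds to a
 * two-level table covering 30 bits.  Hence width = 30 + agaw * 9 and
 * level = agaw + 2, which is what the helpers below compute.
 */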
78 static inline int agaw_to_level(int agaw)
79 {
80         return agaw + 2;
81 }
82
83 static inline int agaw_to_width(int agaw)
84 {
85         return 30 + agaw * LEVEL_STRIDE;
86 }
87
88 static inline int width_to_agaw(int width)
89 {
90         return (width - 30) / LEVEL_STRIDE;
91 }
92
93 static inline unsigned int level_to_offset_bits(int level)
94 {
95         return (level - 1) * LEVEL_STRIDE;
96 }
97
98 static inline int pfn_level_offset(unsigned long pfn, int level)
99 {
100         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
101 }
102
103 static inline unsigned long level_mask(int level)
104 {
105         return -1UL << level_to_offset_bits(level);
106 }
107
108 static inline unsigned long level_size(int level)
109 {
110         return 1UL << level_to_offset_bits(level);
111 }
112
113 static inline unsigned long align_to_level(unsigned long pfn, int level)
114 {
115         return (pfn + level_size(level) - 1) & level_mask(level);
116 }
117
118 /* VT-d pages must never be larger than MM pages. Otherwise things
119    are never going to work. */
120 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
121 {
122         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
123 }
124
125 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
126 {
127         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
128 }
129 static inline unsigned long page_to_dma_pfn(struct page *pg)
130 {
131         return mm_to_dma_pfn(page_to_pfn(pg));
132 }
133 static inline unsigned long virt_to_dma_pfn(void *p)
134 {
135         return page_to_dma_pfn(virt_to_page(p));
136 }
137
138 /* global iommu list, set NULL for ignored DMAR units */
139 static struct intel_iommu **g_iommus;
140
141 static void __init check_tylersburg_isoch(void);
142 static int rwbf_quirk;
143
144 /*
145  * 0: Present
146  * 1-11: Reserved
147  * 12-63: Context Ptr (12 - (haw-1))
148  * 64-127: Reserved
149  */
150 struct root_entry {
151         u64     val;
152         u64     rsvd1;
153 };
154 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
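/*
 * The root table occupies one VT-d page and is indexed by PCI bus number:
 * VTD_PAGE_SIZE / sizeof(struct root_entry) = 4096 / 16 = 256 entries.
 * Each present entry points to the context table for that bus.
 */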
155 static inline bool root_present(struct root_entry *root)
156 {
157         return (root->val & 1);
158 }
159 static inline void set_root_present(struct root_entry *root)
160 {
161         root->val |= 1;
162 }
163 static inline void set_root_value(struct root_entry *root, unsigned long value)
164 {
165         root->val |= value & VTD_PAGE_MASK;
166 }
167
168 static inline struct context_entry *
169 get_context_addr_from_root(struct root_entry *root)
170 {
171         return (struct context_entry *)
172                 (root_present(root)?phys_to_virt(
173                 root->val & VTD_PAGE_MASK) :
174                 NULL);
175 }
176
177 /*
178  * low 64 bits:
179  * 0: present
180  * 1: fault processing disable
181  * 2-3: translation type
182  * 12-63: address space root
183  * high 64 bits:
184  * 0-2: address width
185  * 3-6: avail
186  * 8-23: domain id
187  */
188 struct context_entry {
189         u64 lo;
190         u64 hi;
191 };
192
193 static inline bool context_present(struct context_entry *context)
194 {
195         return (context->lo & 1);
196 }
197 static inline void context_set_present(struct context_entry *context)
198 {
199         context->lo |= 1;
200 }
201
202 static inline void context_set_fault_enable(struct context_entry *context)
203 {
204         context->lo &= (((u64)-1) << 2) | 1;
205 }
206
207 static inline void context_set_translation_type(struct context_entry *context,
208                                                 unsigned long value)
209 {
210         context->lo &= (((u64)-1) << 4) | 3;
211         context->lo |= (value & 3) << 2;
212 }
213
214 static inline void context_set_address_root(struct context_entry *context,
215                                             unsigned long value)
216 {
217         context->lo |= value & VTD_PAGE_MASK;
218 }
219
220 static inline void context_set_address_width(struct context_entry *context,
221                                              unsigned long value)
222 {
223         context->hi |= value & 7;
224 }
225
226 static inline void context_set_domain_id(struct context_entry *context,
227                                          unsigned long value)
228 {
229         context->hi |= (value & ((1 << 16) - 1)) << 8;
230 }
231
232 static inline void context_clear_entry(struct context_entry *context)
233 {
234         context->lo = 0;
235         context->hi = 0;
236 }
237
238 /*
239  * 0: readable
240  * 1: writable
241  * 2-6: reserved
242  * 7: super page
243  * 8-10: available
244  * 11: snoop behavior
245  * 12-63: Host physical address
246  */
247 struct dma_pte {
248         u64 val;
249 };
250
251 static inline void dma_clear_pte(struct dma_pte *pte)
252 {
253         pte->val = 0;
254 }
255
256 static inline void dma_set_pte_readable(struct dma_pte *pte)
257 {
258         pte->val |= DMA_PTE_READ;
259 }
260
261 static inline void dma_set_pte_writable(struct dma_pte *pte)
262 {
263         pte->val |= DMA_PTE_WRITE;
264 }
265
266 static inline void dma_set_pte_snp(struct dma_pte *pte)
267 {
268         pte->val |= DMA_PTE_SNP;
269 }
270
271 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
272 {
273         pte->val = (pte->val & ~3) | (prot & 3);
274 }
275
276 static inline u64 dma_pte_addr(struct dma_pte *pte)
277 {
278 #ifdef CONFIG_64BIT
279         return pte->val & VTD_PAGE_MASK;
280 #else
281         /* Must have a full atomic 64-bit read */
282         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
283 #endif
284 }
285
286 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
287 {
288         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
289 }
290
291 static inline bool dma_pte_present(struct dma_pte *pte)
292 {
293         return (pte->val & 3) != 0;
294 }
295
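/*
 * True when the PTE pointer is VT-d page aligned, i.e. @pte is the first
 * entry in its page-table page.  Used below to stop batched PTE walks at
 * page boundaries.
 */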
296 static inline int first_pte_in_page(struct dma_pte *pte)
297 {
298         return !((unsigned long)pte & ~VTD_PAGE_MASK);
299 }
300
301 /*
302  * This domain is a static identity mapping domain.
303  *      1. This domain creates a static 1:1 mapping of all usable memory.
304  *      2. It maps to each iommu if successful.
305  *      3. Each iommu maps to this domain if successful.
306  */
307 static struct dmar_domain *si_domain;
308 static int hw_pass_through = 1;
309
310 /* devices under the same p2p bridge are owned in one domain */
311 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
312
313 /* domain represents a virtual machine; more than one device
314  * across iommus may be owned by one domain, e.g. a kvm guest.
315  */
316 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
317
318 /* si_domain contains multiple devices */
319 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
320
321 struct dmar_domain {
322         int     id;                     /* domain id */
323         int     nid;                    /* node id */
324         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
325
326         struct list_head devices;       /* all devices' list */
327         struct iova_domain iovad;       /* iova's that belong to this domain */
328
329         struct dma_pte  *pgd;           /* virtual address */
330         int             gaw;            /* max guest address width */
331
332         /* adjusted guest address width, 0 is level 2 30-bit */
333         int             agaw;
334
335         int             flags;          /* flags to find out type of domain */
336
337         int             iommu_coherency;/* indicate coherency of iommu access */
338         int             iommu_snooping; /* indicate snooping control feature*/
339         int             iommu_count;    /* reference count of iommu */
340         spinlock_t      iommu_lock;     /* protect iommu set in domain */
341         u64             max_addr;       /* maximum mapped address */
342 };
343
344 /* PCI domain-device relationship */
345 struct device_domain_info {
346         struct list_head link;  /* link to domain siblings */
347         struct list_head global; /* link to global list */
348         int segment;            /* PCI domain */
349         u8 bus;                 /* PCI bus number */
350         u8 devfn;               /* PCI devfn number */
351         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
352         struct intel_iommu *iommu; /* IOMMU used by this device */
353         struct dmar_domain *domain; /* pointer to domain */
354 };
355
356 static void flush_unmaps_timeout(unsigned long data);
357
358 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
359
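/*
 * Deferred IOTLB flushing: unless intel_iommu_strict is set, IOVAs freed
 * by DMA unmap are queued in the deferred_flush tables below and released
 * in batches from the unmap_timer callback (flush_unmaps_timeout) instead
 * of flushing the IOTLB on every single unmap.
 */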
360 #define HIGH_WATER_MARK 250
361 struct deferred_flush_tables {
362         int next;
363         struct iova *iova[HIGH_WATER_MARK];
364         struct dmar_domain *domain[HIGH_WATER_MARK];
365 };
366
367 static struct deferred_flush_tables *deferred_flush;
368
369 /* number of IOMMUs; bounds g_iommus[] and the per-domain iommu bitmaps */
370 static int g_num_of_iommus;
371
372 static DEFINE_SPINLOCK(async_umap_flush_lock);
373 static LIST_HEAD(unmaps_to_do);
374
375 static int timer_on;
376 static long list_size;
377
378 static void domain_remove_dev_info(struct dmar_domain *domain);
379
380 #ifdef CONFIG_DMAR_DEFAULT_ON
381 int dmar_disabled = 0;
382 #else
383 int dmar_disabled = 1;
384 #endif /*CONFIG_DMAR_DEFAULT_ON*/
385
386 static int dmar_map_gfx = 1;
387 static int dmar_forcedac;
388 static int intel_iommu_strict;
389
390 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
391 static DEFINE_SPINLOCK(device_domain_lock);
392 static LIST_HEAD(device_domain_list);
393
394 static struct iommu_ops intel_iommu_ops;
395
396 static int __init intel_iommu_setup(char *str)
397 {
398         if (!str)
399                 return -EINVAL;
400         while (*str) {
401                 if (!strncmp(str, "on", 2)) {
402                         dmar_disabled = 0;
403                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
404                 } else if (!strncmp(str, "off", 3)) {
405                         dmar_disabled = 1;
406                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
407                 } else if (!strncmp(str, "igfx_off", 8)) {
408                         dmar_map_gfx = 0;
409                         printk(KERN_INFO
410                                 "Intel-IOMMU: disable GFX device mapping\n");
411                 } else if (!strncmp(str, "forcedac", 8)) {
412                         printk(KERN_INFO
413                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
414                         dmar_forcedac = 1;
415                 } else if (!strncmp(str, "strict", 6)) {
416                         printk(KERN_INFO
417                                 "Intel-IOMMU: disable batched IOTLB flush\n");
418                         intel_iommu_strict = 1;
419                 }
420
421                 str += strcspn(str, ",");
422                 while (*str == ',')
423                         str++;
424         }
425         return 0;
426 }
427 __setup("intel_iommu=", intel_iommu_setup);
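/*
 * Options are comma separated, e.g. booting with
 * "intel_iommu=on,igfx_off,strict" enables DMA remapping, disables
 * remapping for the graphics device and turns off batched IOTLB flushing.
 */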
428
429 static struct kmem_cache *iommu_domain_cache;
430 static struct kmem_cache *iommu_devinfo_cache;
431 static struct kmem_cache *iommu_iova_cache;
432
433 static inline void *alloc_pgtable_page(int node)
434 {
435         struct page *page;
436         void *vaddr = NULL;
437
438         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
439         if (page)
440                 vaddr = page_address(page);
441         return vaddr;
442 }
443
444 static inline void free_pgtable_page(void *vaddr)
445 {
446         free_page((unsigned long)vaddr);
447 }
448
449 static inline void *alloc_domain_mem(void)
450 {
451         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
452 }
453
454 static void free_domain_mem(void *vaddr)
455 {
456         kmem_cache_free(iommu_domain_cache, vaddr);
457 }
458
459 static inline void * alloc_devinfo_mem(void)
460 {
461         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
462 }
463
464 static inline void free_devinfo_mem(void *vaddr)
465 {
466         kmem_cache_free(iommu_devinfo_cache, vaddr);
467 }
468
469 struct iova *alloc_iova_mem(void)
470 {
471         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
472 }
473
474 void free_iova_mem(struct iova *iova)
475 {
476         kmem_cache_free(iommu_iova_cache, iova);
477 }
478
479
480 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
481 {
482         unsigned long sagaw;
483         int agaw = -1;
484
485         sagaw = cap_sagaw(iommu->cap);
486         for (agaw = width_to_agaw(max_gaw);
487              agaw >= 0; agaw--) {
488                 if (test_bit(agaw, &sagaw))
489                         break;
490         }
491
492         return agaw;
493 }
494
495 /*
496  * Calculate max SAGAW for each iommu.
497  */
498 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
499 {
500         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
501 }
502
503 /*
504  * Calculate the agaw for each iommu.
505  * "SAGAW" may be different across iommus, so use a default agaw and fall
506  * back to a smaller supported agaw for iommus that don't support the default.
507  */
508 int iommu_calculate_agaw(struct intel_iommu *iommu)
509 {
510         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
511 }
512
513 /* This function only returns a single iommu in a domain */
514 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
515 {
516         int iommu_id;
517
518         /* si_domain and vm domain should not get here. */
519         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
520         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
521
522         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
523         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
524                 return NULL;
525
526         return g_iommus[iommu_id];
527 }
528
529 static void domain_update_iommu_coherency(struct dmar_domain *domain)
530 {
531         int i;
532
533         domain->iommu_coherency = 1;
534
535         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
536                 if (!ecap_coherent(g_iommus[i]->ecap)) {
537                         domain->iommu_coherency = 0;
538                         break;
539                 }
540         }
541 }
542
543 static void domain_update_iommu_snooping(struct dmar_domain *domain)
544 {
545         int i;
546
547         domain->iommu_snooping = 1;
548
549         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
550                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
551                         domain->iommu_snooping = 0;
552                         break;
553                 }
554         }
555 }
556
557 /* Some capabilities may be different across iommus */
558 static void domain_update_iommu_cap(struct dmar_domain *domain)
559 {
560         domain_update_iommu_coherency(domain);
561         domain_update_iommu_snooping(domain);
562 }
563
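/*
 * Map a (segment, bus, devfn) triple to the DMAR unit that covers it:
 * either an explicitly listed device in the DRHD scope, a listed bridge
 * whose secondary bus range contains @bus, or the catch-all INCLUDE_ALL
 * unit for the segment.
 */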
564 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
565 {
566         struct dmar_drhd_unit *drhd = NULL;
567         int i;
568
569         for_each_drhd_unit(drhd) {
570                 if (drhd->ignored)
571                         continue;
572                 if (segment != drhd->segment)
573                         continue;
574
575                 for (i = 0; i < drhd->devices_cnt; i++) {
576                         if (drhd->devices[i] &&
577                             drhd->devices[i]->bus->number == bus &&
578                             drhd->devices[i]->devfn == devfn)
579                                 return drhd->iommu;
580                         if (drhd->devices[i] &&
581                             drhd->devices[i]->subordinate &&
582                             drhd->devices[i]->subordinate->number <= bus &&
583                             drhd->devices[i]->subordinate->subordinate >= bus)
584                                 return drhd->iommu;
585                 }
586
587                 if (drhd->include_all)
588                         return drhd->iommu;
589         }
590
591         return NULL;
592 }
593
594 static void domain_flush_cache(struct dmar_domain *domain,
595                                void *addr, int size)
596 {
597         if (!domain->iommu_coherency)
598                 clflush_cache_range(addr, size);
599 }
600
601 /* Gets context entry for a given bus and devfn */
602 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
603                 u8 bus, u8 devfn)
604 {
605         struct root_entry *root;
606         struct context_entry *context;
607         unsigned long phy_addr;
608         unsigned long flags;
609
610         spin_lock_irqsave(&iommu->lock, flags);
611         root = &iommu->root_entry[bus];
612         context = get_context_addr_from_root(root);
613         if (!context) {
614                 context = (struct context_entry *)
615                                 alloc_pgtable_page(iommu->node);
616                 if (!context) {
617                         spin_unlock_irqrestore(&iommu->lock, flags);
618                         return NULL;
619                 }
620                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
621                 phy_addr = virt_to_phys((void *)context);
622                 set_root_value(root, phy_addr);
623                 set_root_present(root);
624                 __iommu_flush_cache(iommu, root, sizeof(*root));
625         }
626         spin_unlock_irqrestore(&iommu->lock, flags);
627         return &context[devfn];
628 }
629
630 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
631 {
632         struct root_entry *root;
633         struct context_entry *context;
634         int ret;
635         unsigned long flags;
636
637         spin_lock_irqsave(&iommu->lock, flags);
638         root = &iommu->root_entry[bus];
639         context = get_context_addr_from_root(root);
640         if (!context) {
641                 ret = 0;
642                 goto out;
643         }
644         ret = context_present(&context[devfn]);
645 out:
646         spin_unlock_irqrestore(&iommu->lock, flags);
647         return ret;
648 }
649
650 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
651 {
652         struct root_entry *root;
653         struct context_entry *context;
654         unsigned long flags;
655
656         spin_lock_irqsave(&iommu->lock, flags);
657         root = &iommu->root_entry[bus];
658         context = get_context_addr_from_root(root);
659         if (context) {
660                 context_clear_entry(&context[devfn]);
661                 __iommu_flush_cache(iommu, &context[devfn],
662                         sizeof(*context));
663         }
664         spin_unlock_irqrestore(&iommu->lock, flags);
665 }
666
667 static void free_context_table(struct intel_iommu *iommu)
668 {
669         struct root_entry *root;
670         int i;
671         unsigned long flags;
672         struct context_entry *context;
673
674         spin_lock_irqsave(&iommu->lock, flags);
675         if (!iommu->root_entry) {
676                 goto out;
677         }
678         for (i = 0; i < ROOT_ENTRY_NR; i++) {
679                 root = &iommu->root_entry[i];
680                 context = get_context_addr_from_root(root);
681                 if (context)
682                         free_pgtable_page(context);
683         }
684         free_pgtable_page(iommu->root_entry);
685         iommu->root_entry = NULL;
686 out:
687         spin_unlock_irqrestore(&iommu->lock, flags);
688 }
689
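/*
 * Walk the domain's page table down to the level-1 PTE covering @pfn,
 * allocating intermediate table pages on demand.  New intermediate
 * entries are installed with cmpxchg64, so concurrent walkers racing
 * on the same slot simply adopt whichever page got installed first.
 */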
690 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
691                                       unsigned long pfn)
692 {
693         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
694         struct dma_pte *parent, *pte = NULL;
695         int level = agaw_to_level(domain->agaw);
696         int offset;
697
698         BUG_ON(!domain->pgd);
699         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
700         parent = domain->pgd;
701
702         while (level > 0) {
703                 void *tmp_page;
704
705                 offset = pfn_level_offset(pfn, level);
706                 pte = &parent[offset];
707                 if (level == 1)
708                         break;
709
710                 if (!dma_pte_present(pte)) {
711                         uint64_t pteval;
712
713                         tmp_page = alloc_pgtable_page(domain->nid);
714
715                         if (!tmp_page)
716                                 return NULL;
717
718                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
719                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
720                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
721                                 /* Someone else set it while we were thinking; use theirs. */
722                                 free_pgtable_page(tmp_page);
723                         } else {
724                                 dma_pte_addr(pte);
725                                 domain_flush_cache(domain, pte, sizeof(*pte));
726                         }
727                 }
728                 parent = phys_to_virt(dma_pte_addr(pte));
729                 level--;
730         }
731
732         return pte;
733 }
734
735 /* return address's pte at specific level */
736 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
737                                          unsigned long pfn,
738                                          int level)
739 {
740         struct dma_pte *parent, *pte = NULL;
741         int total = agaw_to_level(domain->agaw);
742         int offset;
743
744         parent = domain->pgd;
745         while (level <= total) {
746                 offset = pfn_level_offset(pfn, total);
747                 pte = &parent[offset];
748                 if (level == total)
749                         return pte;
750
751                 if (!dma_pte_present(pte))
752                         break;
753                 parent = phys_to_virt(dma_pte_addr(pte));
754                 total--;
755         }
756         return NULL;
757 }
758
759 /* clear last level ptes; a tlb flush should follow */
760 static void dma_pte_clear_range(struct dmar_domain *domain,
761                                 unsigned long start_pfn,
762                                 unsigned long last_pfn)
763 {
764         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
765         struct dma_pte *first_pte, *pte;
766
767         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
768         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
769         BUG_ON(start_pfn > last_pfn);
770
771         /* we don't need lock here; nobody else touches the iova range */
772         do {
773                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
774                 if (!pte) {
775                         start_pfn = align_to_level(start_pfn + 1, 2);
776                         continue;
777                 }
778                 do { 
779                         dma_clear_pte(pte);
780                         start_pfn++;
781                         pte++;
782                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
783
784                 domain_flush_cache(domain, first_pte,
785                                    (void *)pte - (void *)first_pte);
786
787         } while (start_pfn && start_pfn <= last_pfn);
788 }
789
790 /* free page table pages. last level pte should already be cleared */
791 static void dma_pte_free_pagetable(struct dmar_domain *domain,
792                                    unsigned long start_pfn,
793                                    unsigned long last_pfn)
794 {
795         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
796         struct dma_pte *first_pte, *pte;
797         int total = agaw_to_level(domain->agaw);
798         int level;
799         unsigned long tmp;
800
801         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
802         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
803         BUG_ON(start_pfn > last_pfn);
804
805         /* We don't need lock here; nobody else touches the iova range */
806         level = 2;
807         while (level <= total) {
808                 tmp = align_to_level(start_pfn, level);
809
810                 /* If we can't even clear one PTE at this level, we're done */
811                 if (tmp + level_size(level) - 1 > last_pfn)
812                         return;
813
814                 do {
815                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
816                         if (!pte) {
817                                 tmp = align_to_level(tmp + 1, level + 1);
818                                 continue;
819                         }
820                         do {
821                                 if (dma_pte_present(pte)) {
822                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
823                                         dma_clear_pte(pte);
824                                 }
825                                 pte++;
826                                 tmp += level_size(level);
827                         } while (!first_pte_in_page(pte) &&
828                                  tmp + level_size(level) - 1 <= last_pfn);
829
830                         domain_flush_cache(domain, first_pte,
831                                            (void *)pte - (void *)first_pte);
832                         
833                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
834                 level++;
835         }
836         /* free pgd */
837         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
838                 free_pgtable_page(domain->pgd);
839                 domain->pgd = NULL;
840         }
841 }
842
843 /* iommu handling */
844 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
845 {
846         struct root_entry *root;
847         unsigned long flags;
848
849         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
850         if (!root)
851                 return -ENOMEM;
852
853         __iommu_flush_cache(iommu, root, ROOT_SIZE);
854
855         spin_lock_irqsave(&iommu->lock, flags);
856         iommu->root_entry = root;
857         spin_unlock_irqrestore(&iommu->lock, flags);
858
859         return 0;
860 }
861
862 static void iommu_set_root_entry(struct intel_iommu *iommu)
863 {
864         void *addr;
865         u32 sts;
866         unsigned long flag;
867
868         addr = iommu->root_entry;
869
870         spin_lock_irqsave(&iommu->register_lock, flag);
871         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
872
873         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
874
875         /* Make sure hardware completes it */
876         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
877                       readl, (sts & DMA_GSTS_RTPS), sts);
878
879         spin_unlock_irqrestore(&iommu->register_lock, flag);
880 }
881
882 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
883 {
884         u32 val;
885         unsigned long flag;
886
887         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
888                 return;
889
890         spin_lock_irqsave(&iommu->register_lock, flag);
891         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
892
893         /* Make sure hardware completes it */
894         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
895                       readl, (!(val & DMA_GSTS_WBFS)), val);
896
897         spin_unlock_irqrestore(&iommu->register_lock, flag);
898 }
899
900 /* return value determines if we need a write buffer flush */
901 static void __iommu_flush_context(struct intel_iommu *iommu,
902                                   u16 did, u16 source_id, u8 function_mask,
903                                   u64 type)
904 {
905         u64 val = 0;
906         unsigned long flag;
907
908         switch (type) {
909         case DMA_CCMD_GLOBAL_INVL:
910                 val = DMA_CCMD_GLOBAL_INVL;
911                 break;
912         case DMA_CCMD_DOMAIN_INVL:
913                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
914                 break;
915         case DMA_CCMD_DEVICE_INVL:
916                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
917                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
918                 break;
919         default:
920                 BUG();
921         }
922         val |= DMA_CCMD_ICC;
923
924         spin_lock_irqsave(&iommu->register_lock, flag);
925         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
926
927         /* Make sure hardware completes it */
928         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
929                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
930
931         spin_unlock_irqrestore(&iommu->register_lock, flag);
932 }
933
934 /* return value determines if we need a write buffer flush */
935 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
936                                 u64 addr, unsigned int size_order, u64 type)
937 {
938         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
939         u64 val = 0, val_iva = 0;
940         unsigned long flag;
941
942         switch (type) {
943         case DMA_TLB_GLOBAL_FLUSH:
944                 /* global flush doesn't need to set IVA_REG */
945                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
946                 break;
947         case DMA_TLB_DSI_FLUSH:
948                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
949                 break;
950         case DMA_TLB_PSI_FLUSH:
951                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
952                 /* Note: always flush non-leaf currently */
953                 val_iva = size_order | addr;
954                 break;
955         default:
956                 BUG();
957         }
958         /* Note: set drain read/write */
959 #if 0
960         /*
961          * This is probably just to be extra safe.  Looks like we can
962          * ignore it without any impact.
963          */
964         if (cap_read_drain(iommu->cap))
965                 val |= DMA_TLB_READ_DRAIN;
966 #endif
967         if (cap_write_drain(iommu->cap))
968                 val |= DMA_TLB_WRITE_DRAIN;
969
970         spin_lock_irqsave(&iommu->register_lock, flag);
971         /* Note: Only uses first TLB reg currently */
972         if (val_iva)
973                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
974         dmar_writeq(iommu->reg + tlb_offset + 8, val);
975
976         /* Make sure hardware completes it */
977         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
978                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
979
980         spin_unlock_irqrestore(&iommu->register_lock, flag);
981
982         /* check IOTLB invalidation granularity */
983         if (DMA_TLB_IAIG(val) == 0)
984                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
985         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
986                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
987                         (unsigned long long)DMA_TLB_IIRG(type),
988                         (unsigned long long)DMA_TLB_IAIG(val));
989 }
990
991 static struct device_domain_info *iommu_support_dev_iotlb(
992         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
993 {
994         int found = 0;
995         unsigned long flags;
996         struct device_domain_info *info;
997         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
998
999         if (!ecap_dev_iotlb_support(iommu->ecap))
1000                 return NULL;
1001
1002         if (!iommu->qi)
1003                 return NULL;
1004
1005         spin_lock_irqsave(&device_domain_lock, flags);
1006         list_for_each_entry(info, &domain->devices, link)
1007                 if (info->bus == bus && info->devfn == devfn) {
1008                         found = 1;
1009                         break;
1010                 }
1011         spin_unlock_irqrestore(&device_domain_lock, flags);
1012
1013         if (!found || !info->dev)
1014                 return NULL;
1015
1016         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1017                 return NULL;
1018
1019         if (!dmar_find_matched_atsr_unit(info->dev))
1020                 return NULL;
1021
1022         info->iommu = iommu;
1023
1024         return info;
1025 }
1026
1027 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1028 {
1029         if (!info)
1030                 return;
1031
1032         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1033 }
1034
1035 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1036 {
1037         if (!info->dev || !pci_ats_enabled(info->dev))
1038                 return;
1039
1040         pci_disable_ats(info->dev);
1041 }
1042
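/*
 * Invalidate the device IOTLB (ATS) entries for every ATS-enabled device
 * attached to the domain.  The source-id is (bus << 8 | devfn) and the
 * invalidation is issued through the queued invalidation interface.
 */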
1043 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1044                                   u64 addr, unsigned mask)
1045 {
1046         u16 sid, qdep;
1047         unsigned long flags;
1048         struct device_domain_info *info;
1049
1050         spin_lock_irqsave(&device_domain_lock, flags);
1051         list_for_each_entry(info, &domain->devices, link) {
1052                 if (!info->dev || !pci_ats_enabled(info->dev))
1053                         continue;
1054
1055                 sid = info->bus << 8 | info->devfn;
1056                 qdep = pci_ats_queue_depth(info->dev);
1057                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1058         }
1059         spin_unlock_irqrestore(&device_domain_lock, flags);
1060 }
1061
1062 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1063                                   unsigned long pfn, unsigned int pages, int map)
1064 {
1065         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1066         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1067
1068         BUG_ON(pages == 0);
1069
1070         /*
1071          * Fall back to domain selective flush if no PSI support or the size is
1072          * too big.
1073          * PSI requires page size to be 2 ^ x, and the base address is naturally
1074          * aligned to the size
1075          */
1076         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1077                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1078                                                 DMA_TLB_DSI_FLUSH);
1079         else
1080                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1081                                                 DMA_TLB_PSI_FLUSH);
1082
1083         /*
1084          * In caching mode, changes of pages from non-present to present require
1085          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1086          */
1087         if (!cap_caching_mode(iommu->cap) || !map)
1088                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1089 }
1090
1091 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1092 {
1093         u32 pmen;
1094         unsigned long flags;
1095
1096         spin_lock_irqsave(&iommu->register_lock, flags);
1097         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1098         pmen &= ~DMA_PMEN_EPM;
1099         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1100
1101         /* wait for the protected region status bit to clear */
1102         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1103                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1104
1105         spin_unlock_irqrestore(&iommu->register_lock, flags);
1106 }
1107
1108 static int iommu_enable_translation(struct intel_iommu *iommu)
1109 {
1110         u32 sts;
1111         unsigned long flags;
1112
1113         spin_lock_irqsave(&iommu->register_lock, flags);
1114         iommu->gcmd |= DMA_GCMD_TE;
1115         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1116
1117         /* Make sure hardware completes it */
1118         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1119                       readl, (sts & DMA_GSTS_TES), sts);
1120
1121         spin_unlock_irqrestore(&iommu->register_lock, flags);
1122         return 0;
1123 }
1124
1125 static int iommu_disable_translation(struct intel_iommu *iommu)
1126 {
1127         u32 sts;
1128         unsigned long flag;
1129
1130         spin_lock_irqsave(&iommu->register_lock, flag);
1131         iommu->gcmd &= ~DMA_GCMD_TE;
1132         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1133
1134         /* Make sure hardware completes it */
1135         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1136                       readl, (!(sts & DMA_GSTS_TES)), sts);
1137
1138         spin_unlock_irqrestore(&iommu->register_lock, flag);
1139         return 0;
1140 }
1141
1142
1143 static int iommu_init_domains(struct intel_iommu *iommu)
1144 {
1145         unsigned long ndomains;
1146         unsigned long nlongs;
1147
1148         ndomains = cap_ndoms(iommu->cap);
1149         pr_debug("IOMMU %d: Number of Domains supportd <%ld>\n", iommu->seq_id,
1150                         ndomains);
1151         nlongs = BITS_TO_LONGS(ndomains);
1152
1153         spin_lock_init(&iommu->lock);
1154
1155         /* TBD: there might be 64K domains,
1156          * consider a different allocation scheme for future chips
1157          */
1158         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1159         if (!iommu->domain_ids) {
1160                 printk(KERN_ERR "Allocating domain id array failed\n");
1161                 return -ENOMEM;
1162         }
1163         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1164                         GFP_KERNEL);
1165         if (!iommu->domains) {
1166                 printk(KERN_ERR "Allocating domain array failed\n");
1167                 return -ENOMEM;
1168         }
1169
1170         /*
1171          * if Caching mode is set, then invalid translations are tagged
1172          * with domain id 0. Hence we need to pre-allocate it.
1173          */
1174         if (cap_caching_mode(iommu->cap))
1175                 set_bit(0, iommu->domain_ids);
1176         return 0;
1177 }
1178
1179
1180 static void domain_exit(struct dmar_domain *domain);
1181 static void vm_domain_exit(struct dmar_domain *domain);
1182
1183 void free_dmar_iommu(struct intel_iommu *iommu)
1184 {
1185         struct dmar_domain *domain;
1186         int i;
1187         unsigned long flags;
1188
1189         if ((iommu->domains) && (iommu->domain_ids)) {
1190                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1191                         domain = iommu->domains[i];
1192                         clear_bit(i, iommu->domain_ids);
1193
1194                         spin_lock_irqsave(&domain->iommu_lock, flags);
1195                         if (--domain->iommu_count == 0) {
1196                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1197                                         vm_domain_exit(domain);
1198                                 else
1199                                         domain_exit(domain);
1200                         }
1201                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1202                 }
1203         }
1204
1205         if (iommu->gcmd & DMA_GCMD_TE)
1206                 iommu_disable_translation(iommu);
1207
1208         if (iommu->irq) {
1209                 irq_set_handler_data(iommu->irq, NULL);
1210                 /* This will mask the irq */
1211                 free_irq(iommu->irq, iommu);
1212                 destroy_irq(iommu->irq);
1213         }
1214
1215         kfree(iommu->domains);
1216         kfree(iommu->domain_ids);
1217
1218         g_iommus[iommu->seq_id] = NULL;
1219
1220         /* if all iommus are freed, free g_iommus */
1221         for (i = 0; i < g_num_of_iommus; i++) {
1222                 if (g_iommus[i])
1223                         break;
1224         }
1225
1226         if (i == g_num_of_iommus)
1227                 kfree(g_iommus);
1228
1229         /* free context mapping */
1230         free_context_table(iommu);
1231 }
1232
1233 static struct dmar_domain *alloc_domain(void)
1234 {
1235         struct dmar_domain *domain;
1236
1237         domain = alloc_domain_mem();
1238         if (!domain)
1239                 return NULL;
1240
1241         domain->nid = -1;
1242         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1243         domain->flags = 0;
1244
1245         return domain;
1246 }
1247
1248 static int iommu_attach_domain(struct dmar_domain *domain,
1249                                struct intel_iommu *iommu)
1250 {
1251         int num;
1252         unsigned long ndomains;
1253         unsigned long flags;
1254
1255         ndomains = cap_ndoms(iommu->cap);
1256
1257         spin_lock_irqsave(&iommu->lock, flags);
1258
1259         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1260         if (num >= ndomains) {
1261                 spin_unlock_irqrestore(&iommu->lock, flags);
1262                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1263                 return -ENOMEM;
1264         }
1265
1266         domain->id = num;
1267         set_bit(num, iommu->domain_ids);
1268         set_bit(iommu->seq_id, &domain->iommu_bmp);
1269         iommu->domains[num] = domain;
1270         spin_unlock_irqrestore(&iommu->lock, flags);
1271
1272         return 0;
1273 }
1274
1275 static void iommu_detach_domain(struct dmar_domain *domain,
1276                                 struct intel_iommu *iommu)
1277 {
1278         unsigned long flags;
1279         int num, ndomains;
1280         int found = 0;
1281
1282         spin_lock_irqsave(&iommu->lock, flags);
1283         ndomains = cap_ndoms(iommu->cap);
1284         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1285                 if (iommu->domains[num] == domain) {
1286                         found = 1;
1287                         break;
1288                 }
1289         }
1290
1291         if (found) {
1292                 clear_bit(num, iommu->domain_ids);
1293                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1294                 iommu->domains[num] = NULL;
1295         }
1296         spin_unlock_irqrestore(&iommu->lock, flags);
1297 }
1298
1299 static struct iova_domain reserved_iova_list;
1300 static struct lock_class_key reserved_rbtree_key;
1301
1302 static int dmar_init_reserved_ranges(void)
1303 {
1304         struct pci_dev *pdev = NULL;
1305         struct iova *iova;
1306         int i;
1307
1308         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1309
1310         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1311                 &reserved_rbtree_key);
1312
1313         /* IOAPIC ranges shouldn't be accessed by DMA */
1314         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1315                 IOVA_PFN(IOAPIC_RANGE_END));
1316         if (!iova) {
1317                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1318                 return -ENODEV;
1319         }
1320
1321         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1322         for_each_pci_dev(pdev) {
1323                 struct resource *r;
1324
1325                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1326                         r = &pdev->resource[i];
1327                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1328                                 continue;
1329                         iova = reserve_iova(&reserved_iova_list,
1330                                             IOVA_PFN(r->start),
1331                                             IOVA_PFN(r->end));
1332                         if (!iova) {
1333                                 printk(KERN_ERR "Reserve iova failed\n");
1334                                 return -ENODEV;
1335                         }
1336                 }
1337         }
1338         return 0;
1339 }
1340
1341 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1342 {
1343         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1344 }
1345
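/*
 * Round the guest address width up so that (agaw - 12) is a multiple of
 * the 9 bits resolved per page-table level, capping the result at 64.
 * For example, a 32-bit guest width becomes 39: (32 - 12) % 9 == 2, so
 * agaw = 32 + 9 - 2 = 39.
 */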
1346 static inline int guestwidth_to_adjustwidth(int gaw)
1347 {
1348         int agaw;
1349         int r = (gaw - 12) % 9;
1350
1351         if (r == 0)
1352                 agaw = gaw;
1353         else
1354                 agaw = gaw + 9 - r;
1355         if (agaw > 64)
1356                 agaw = 64;
1357         return agaw;
1358 }
1359
1360 static int domain_init(struct dmar_domain *domain, int guest_width)
1361 {
1362         struct intel_iommu *iommu;
1363         int adjust_width, agaw;
1364         unsigned long sagaw;
1365
1366         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1367         spin_lock_init(&domain->iommu_lock);
1368
1369         domain_reserve_special_ranges(domain);
1370
1371         /* calculate AGAW */
1372         iommu = domain_get_iommu(domain);
1373         if (guest_width > cap_mgaw(iommu->cap))
1374                 guest_width = cap_mgaw(iommu->cap);
1375         domain->gaw = guest_width;
1376         adjust_width = guestwidth_to_adjustwidth(guest_width);
1377         agaw = width_to_agaw(adjust_width);
1378         sagaw = cap_sagaw(iommu->cap);
1379         if (!test_bit(agaw, &sagaw)) {
1380                 /* hardware doesn't support it, choose a bigger one */
1381                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1382                 agaw = find_next_bit(&sagaw, 5, agaw);
1383                 if (agaw >= 5)
1384                         return -ENODEV;
1385         }
1386         domain->agaw = agaw;
1387         INIT_LIST_HEAD(&domain->devices);
1388
1389         if (ecap_coherent(iommu->ecap))
1390                 domain->iommu_coherency = 1;
1391         else
1392                 domain->iommu_coherency = 0;
1393
1394         if (ecap_sc_support(iommu->ecap))
1395                 domain->iommu_snooping = 1;
1396         else
1397                 domain->iommu_snooping = 0;
1398
1399         domain->iommu_count = 1;
1400         domain->nid = iommu->node;
1401
1402         /* always allocate the top pgd */
1403         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1404         if (!domain->pgd)
1405                 return -ENOMEM;
1406         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1407         return 0;
1408 }
1409
1410 static void domain_exit(struct dmar_domain *domain)
1411 {
1412         struct dmar_drhd_unit *drhd;
1413         struct intel_iommu *iommu;
1414
1415         /* Domain 0 is reserved, so don't process it */
1416         if (!domain)
1417                 return;
1418
1419         domain_remove_dev_info(domain);
1420         /* destroy iovas */
1421         put_iova_domain(&domain->iovad);
1422
1423         /* clear ptes */
1424         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1425
1426         /* free page tables */
1427         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1428
1429         for_each_active_iommu(iommu, drhd)
1430                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1431                         iommu_detach_domain(domain, iommu);
1432
1433         free_domain_mem(domain);
1434 }
1435
1436 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1437                                  u8 bus, u8 devfn, int translation)
1438 {
1439         struct context_entry *context;
1440         unsigned long flags;
1441         struct intel_iommu *iommu;
1442         struct dma_pte *pgd;
1443         unsigned long num;
1444         unsigned long ndomains;
1445         int id;
1446         int agaw;
1447         struct device_domain_info *info = NULL;
1448
1449         pr_debug("Set context mapping for %02x:%02x.%d\n",
1450                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1451
1452         BUG_ON(!domain->pgd);
1453         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1454                translation != CONTEXT_TT_MULTI_LEVEL);
1455
1456         iommu = device_to_iommu(segment, bus, devfn);
1457         if (!iommu)
1458                 return -ENODEV;
1459
1460         context = device_to_context_entry(iommu, bus, devfn);
1461         if (!context)
1462                 return -ENOMEM;
1463         spin_lock_irqsave(&iommu->lock, flags);
1464         if (context_present(context)) {
1465                 spin_unlock_irqrestore(&iommu->lock, flags);
1466                 return 0;
1467         }
1468
1469         id = domain->id;
1470         pgd = domain->pgd;
1471
1472         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1473             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1474                 int found = 0;
1475
1476                 /* find an available domain id for this device in iommu */
1477                 ndomains = cap_ndoms(iommu->cap);
1478                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1479                         if (iommu->domains[num] == domain) {
1480                                 id = num;
1481                                 found = 1;
1482                                 break;
1483                         }
1484                 }
1485
1486                 if (found == 0) {
1487                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1488                         if (num >= ndomains) {
1489                                 spin_unlock_irqrestore(&iommu->lock, flags);
1490                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1491                                 return -EFAULT;
1492                         }
1493
1494                         set_bit(num, iommu->domain_ids);
1495                         iommu->domains[num] = domain;
1496                         id = num;
1497                 }
1498
1499                 /* Skip top levels of page tables for
1500                  * iommus which have a smaller agaw than the default.
1501                  * Unnecessary for PT mode.
1502                  */
1503                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1504                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1505                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1506                                 if (!dma_pte_present(pgd)) {
1507                                         spin_unlock_irqrestore(&iommu->lock, flags);
1508                                         return -ENOMEM;
1509                                 }
1510                         }
1511                 }
1512         }
1513
1514         context_set_domain_id(context, id);
1515
1516         if (translation != CONTEXT_TT_PASS_THROUGH) {
1517                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1518                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1519                                      CONTEXT_TT_MULTI_LEVEL;
1520         }
1521         /*
1522          * In pass through mode, AW must be programmed to indicate the largest
1523          * AGAW value supported by hardware. And ASR is ignored by hardware.
1524          */
1525         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1526                 context_set_address_width(context, iommu->msagaw);
1527         else {
1528                 context_set_address_root(context, virt_to_phys(pgd));
1529                 context_set_address_width(context, iommu->agaw);
1530         }
1531
1532         context_set_translation_type(context, translation);
1533         context_set_fault_enable(context);
1534         context_set_present(context);
1535         domain_flush_cache(domain, context, sizeof(*context));
1536
1537         /*
1538          * It's a non-present to present mapping. If hardware doesn't cache
1539          * non-present entries we only need to flush the write-buffer. If it
1540          * _does_ cache non-present entries, then it does so in the special
1541          * domain #0, which we have to flush:
1542          */
1543         if (cap_caching_mode(iommu->cap)) {
1544                 iommu->flush.flush_context(iommu, 0,
1545                                            (((u16)bus) << 8) | devfn,
1546                                            DMA_CCMD_MASK_NOBIT,
1547                                            DMA_CCMD_DEVICE_INVL);
1548                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1549         } else {
1550                 iommu_flush_write_buffer(iommu);
1551         }
1552         iommu_enable_dev_iotlb(info);
1553         spin_unlock_irqrestore(&iommu->lock, flags);
1554
1555         spin_lock_irqsave(&domain->iommu_lock, flags);
1556         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1557                 domain->iommu_count++;
1558                 if (domain->iommu_count == 1)
1559                         domain->nid = iommu->node;
1560                 domain_update_iommu_cap(domain);
1561         }
1562         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1563         return 0;
1564 }
1565
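/*
 * Set up context entries for @pdev and, if the device sits behind a
 * PCIe-to-PCI bridge, for each bridge on the path as well: requests
 * forwarded by a legacy bridge can appear with the bridge's source-id
 * (secondary bus, devfn 0), so those IDs must resolve to the same domain.
 */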
1566 static int
1567 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1568                         int translation)
1569 {
1570         int ret;
1571         struct pci_dev *tmp, *parent;
1572
1573         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1574                                          pdev->bus->number, pdev->devfn,
1575                                          translation);
1576         if (ret)
1577                 return ret;
1578
1579         /* dependent device mapping */
1580         tmp = pci_find_upstream_pcie_bridge(pdev);
1581         if (!tmp)
1582                 return 0;
1583         /* Secondary interface's bus number and devfn 0 */
1584         parent = pdev->bus->self;
1585         while (parent != tmp) {
1586                 ret = domain_context_mapping_one(domain,
1587                                                  pci_domain_nr(parent->bus),
1588                                                  parent->bus->number,
1589                                                  parent->devfn, translation);
1590                 if (ret)
1591                         return ret;
1592                 parent = parent->bus->self;
1593         }
1594         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1595                 return domain_context_mapping_one(domain,
1596                                         pci_domain_nr(tmp->subordinate),
1597                                         tmp->subordinate->number, 0,
1598                                         translation);
1599         else /* this is a legacy PCI bridge */
1600                 return domain_context_mapping_one(domain,
1601                                                   pci_domain_nr(tmp->bus),
1602                                                   tmp->bus->number,
1603                                                   tmp->devfn,
1604                                                   translation);
1605 }
1606
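/*
 * domain_context_mapped - return zero if @pdev, or any bridge between it and
 * its upstream PCIe-to-PCI bridge, still lacks a context entry; non-zero if
 * everything on the path is already mapped.
 */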
1607 static int domain_context_mapped(struct pci_dev *pdev)
1608 {
1609         int ret;
1610         struct pci_dev *tmp, *parent;
1611         struct intel_iommu *iommu;
1612
1613         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1614                                 pdev->devfn);
1615         if (!iommu)
1616                 return -ENODEV;
1617
1618         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1619         if (!ret)
1620                 return ret;
1621         /* dependent device mapping */
1622         tmp = pci_find_upstream_pcie_bridge(pdev);
1623         if (!tmp)
1624                 return ret;
1625         /* Secondary interface's bus number and devfn 0 */
1626         parent = pdev->bus->self;
1627         while (parent != tmp) {
1628                 ret = device_context_mapped(iommu, parent->bus->number,
1629                                             parent->devfn);
1630                 if (!ret)
1631                         return ret;
1632                 parent = parent->bus->self;
1633         }
1634         if (pci_is_pcie(tmp))
1635                 return device_context_mapped(iommu, tmp->subordinate->number,
1636                                              0);
1637         else
1638                 return device_context_mapped(iommu, tmp->bus->number,
1639                                              tmp->devfn);
1640 }
1641
1642 /* Returns a number of VTD pages, but aligned to MM page size */
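/*
 * For example (assuming 4KiB MM and VT-d pages): host_addr = 0x1234 and
 * size = 0x2000 give an in-page offset of 0x234; 0x234 + 0x2000 rounds up
 * to 0x3000, i.e. 3 VT-d pages.
 */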
1643 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1644                                             size_t size)
1645 {
1646         host_addr &= ~PAGE_MASK;
1647         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1648 }
1649
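/*
 * __domain_mapping - install PTEs for @nr_pages VT-d pages starting at
 * @iov_pfn, taking the physical pages either from @sg (when non-NULL) or
 * from the contiguous range starting at @phys_pfn.  Existing PTEs are never
 * silently overwritten; a conflicting entry triggers a warning.
 */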
1650 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1651                             struct scatterlist *sg, unsigned long phys_pfn,
1652                             unsigned long nr_pages, int prot)
1653 {
1654         struct dma_pte *first_pte = NULL, *pte = NULL;
1655         phys_addr_t uninitialized_var(pteval);
1656         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1657         unsigned long sg_res;
1658
1659         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1660
1661         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1662                 return -EINVAL;
1663
1664         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1665
1666         if (sg)
1667                 sg_res = 0;
1668         else {
1669                 sg_res = nr_pages + 1;
1670                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1671         }
1672
1673         while (nr_pages--) {
1674                 uint64_t tmp;
1675
1676                 if (!sg_res) {
1677                         sg_res = aligned_nrpages(sg->offset, sg->length);
1678                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1679                         sg->dma_length = sg->length;
1680                         pteval = page_to_phys(sg_page(sg)) | prot;
1681                 }
1682                 if (!pte) {
1683                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1684                         if (!pte)
1685                                 return -ENOMEM;
1686                 }
1687                 /* We don't need a lock here; nobody else
1688                  * touches the iova range
1689                  */
1690                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1691                 if (tmp) {
1692                         static int dumps = 5;
1693                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1694                                iov_pfn, tmp, (unsigned long long)pteval);
1695                         if (dumps) {
1696                                 dumps--;
1697                                 debug_dma_dump_mappings(NULL);
1698                         }
1699                         WARN_ON(1);
1700                 }
1701                 pte++;
1702                 if (!nr_pages || first_pte_in_page(pte)) {
1703                         domain_flush_cache(domain, first_pte,
1704                                            (void *)pte - (void *)first_pte);
1705                         pte = NULL;
1706                 }
1707                 iov_pfn++;
1708                 pteval += VTD_PAGE_SIZE;
1709                 sg_res--;
1710                 if (!sg_res)
1711                         sg = sg_next(sg);
1712         }
1713         return 0;
1714 }
1715
1716 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1717                                     struct scatterlist *sg, unsigned long nr_pages,
1718                                     int prot)
1719 {
1720         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1721 }
1722
1723 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1724                                      unsigned long phys_pfn, unsigned long nr_pages,
1725                                      int prot)
1726 {
1727         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1728 }
1729
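/*
 * iommu_detach_dev - clear the context entry for (@bus, @devfn) and globally
 * invalidate the context cache and IOTLB so stale translations are dropped.
 */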
1730 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1731 {
1732         if (!iommu)
1733                 return;
1734
1735         clear_context_table(iommu, bus, devfn);
1736         iommu->flush.flush_context(iommu, 0, 0, 0,
1737                                            DMA_CCMD_GLOBAL_INVL);
1738         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1739 }
1740
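/*
 * domain_remove_dev_info - detach every device from @domain: disable its
 * device-IOTLB, tear down its context entry, unlink it from the device
 * lists and free its bookkeeping structure.
 */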
1741 static void domain_remove_dev_info(struct dmar_domain *domain)
1742 {
1743         struct device_domain_info *info;
1744         unsigned long flags;
1745         struct intel_iommu *iommu;
1746
1747         spin_lock_irqsave(&device_domain_lock, flags);
1748         while (!list_empty(&domain->devices)) {
1749                 info = list_entry(domain->devices.next,
1750                         struct device_domain_info, link);
1751                 list_del(&info->link);
1752                 list_del(&info->global);
1753                 if (info->dev)
1754                         info->dev->dev.archdata.iommu = NULL;
1755                 spin_unlock_irqrestore(&device_domain_lock, flags);
1756
1757                 iommu_disable_dev_iotlb(info);
1758                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1759                 iommu_detach_dev(iommu, info->bus, info->devfn);
1760                 free_devinfo_mem(info);
1761
1762                 spin_lock_irqsave(&device_domain_lock, flags);
1763         }
1764         spin_unlock_irqrestore(&device_domain_lock, flags);
1765 }
1766
1767 /*
1768  * find_domain
1769  * Note: struct pci_dev->dev.archdata.iommu stores the device_domain_info
1770  */
1771 static struct dmar_domain *
1772 find_domain(struct pci_dev *pdev)
1773 {
1774         struct device_domain_info *info;
1775
1776         /* No lock here, assumes no domain exit in normal case */
1777         info = pdev->dev.archdata.iommu;
1778         if (info)
1779                 return info->domain;
1780         return NULL;
1781 }
1782
1783 /* Find or allocate the domain for @pdev; the returned domain is initialized. */
1784 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1785 {
1786         struct dmar_domain *domain, *found = NULL;
1787         struct intel_iommu *iommu;
1788         struct dmar_drhd_unit *drhd;
1789         struct device_domain_info *info, *tmp;
1790         struct pci_dev *dev_tmp;
1791         unsigned long flags;
1792         int bus = 0, devfn = 0;
1793         int segment;
1794         int ret;
1795
1796         domain = find_domain(pdev);
1797         if (domain)
1798                 return domain;
1799
1800         segment = pci_domain_nr(pdev->bus);
1801
1802         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1803         if (dev_tmp) {
1804                 if (pci_is_pcie(dev_tmp)) {
1805                         bus = dev_tmp->subordinate->number;
1806                         devfn = 0;
1807                 } else {
1808                         bus = dev_tmp->bus->number;
1809                         devfn = dev_tmp->devfn;
1810                 }
1811                 spin_lock_irqsave(&device_domain_lock, flags);
1812                 list_for_each_entry(info, &device_domain_list, global) {
1813                         if (info->segment == segment &&
1814                             info->bus == bus && info->devfn == devfn) {
1815                                 found = info->domain;
1816                                 break;
1817                         }
1818                 }
1819                 spin_unlock_irqrestore(&device_domain_lock, flags);
1820                 /* pcie-pci bridge already has a domain, use it */
1821                 if (found) {
1822                         domain = found;
1823                         goto found_domain;
1824                 }
1825         }
1826
1827         domain = alloc_domain();
1828         if (!domain)
1829                 goto error;
1830
1831         /* Allocate new domain for the device */
1832         drhd = dmar_find_matched_drhd_unit(pdev);
1833         if (!drhd) {
1834                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1835                         pci_name(pdev));
1836                 return NULL;
1837         }
1838         iommu = drhd->iommu;
1839
1840         ret = iommu_attach_domain(domain, iommu);
1841         if (ret) {
1842                 free_domain_mem(domain);
1843                 goto error;
1844         }
1845
1846         if (domain_init(domain, gaw)) {
1847                 domain_exit(domain);
1848                 goto error;
1849         }
1850
1851         /* register pcie-to-pci device */
1852         if (dev_tmp) {
1853                 info = alloc_devinfo_mem();
1854                 if (!info) {
1855                         domain_exit(domain);
1856                         goto error;
1857                 }
1858                 info->segment = segment;
1859                 info->bus = bus;
1860                 info->devfn = devfn;
1861                 info->dev = NULL;
1862                 info->domain = domain;
1863                 /* This domain is shared by devices under p2p bridge */
1864                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1865
1866                 /* pcie-to-pci bridge already has a domain, use it */
1867                 found = NULL;
1868                 spin_lock_irqsave(&device_domain_lock, flags);
1869                 list_for_each_entry(tmp, &device_domain_list, global) {
1870                         if (tmp->segment == segment &&
1871                             tmp->bus == bus && tmp->devfn == devfn) {
1872                                 found = tmp->domain;
1873                                 break;
1874                         }
1875                 }
1876                 if (found) {
1877                         spin_unlock_irqrestore(&device_domain_lock, flags);
1878                         free_devinfo_mem(info);
1879                         domain_exit(domain);
1880                         domain = found;
1881                 } else {
1882                         list_add(&info->link, &domain->devices);
1883                         list_add(&info->global, &device_domain_list);
1884                         spin_unlock_irqrestore(&device_domain_lock, flags);
1885                 }
1886         }
1887
1888 found_domain:
1889         info = alloc_devinfo_mem();
1890         if (!info)
1891                 goto error;
1892         info->segment = segment;
1893         info->bus = pdev->bus->number;
1894         info->devfn = pdev->devfn;
1895         info->dev = pdev;
1896         info->domain = domain;
1897         spin_lock_irqsave(&device_domain_lock, flags);
1898         /* somebody else beat us to it */
1899         found = find_domain(pdev);
1900         if (found != NULL) {
1901                 spin_unlock_irqrestore(&device_domain_lock, flags);
1902                 if (found != domain) {
1903                         domain_exit(domain);
1904                         domain = found;
1905                 }
1906                 free_devinfo_mem(info);
1907                 return domain;
1908         }
1909         list_add(&info->link, &domain->devices);
1910         list_add(&info->global, &device_domain_list);
1911         pdev->dev.archdata.iommu = info;
1912         spin_unlock_irqrestore(&device_domain_lock, flags);
1913         return domain;
1914 error:
1915         /* recheck it here, maybe others set it */
1916         return find_domain(pdev);
1917 }
1918
1919 static int iommu_identity_mapping;
1920 #define IDENTMAP_ALL            1
1921 #define IDENTMAP_GFX            2
1922 #define IDENTMAP_AZALIA         4
1923
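/*
 * iommu_domain_identity_map - reserve the IOVA range covering [start, end]
 * and install a 1:1 (DMA address == physical address) read/write mapping,
 * clearing any overlap with existing mappings first.
 */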
1924 static int iommu_domain_identity_map(struct dmar_domain *domain,
1925                                      unsigned long long start,
1926                                      unsigned long long end)
1927 {
1928         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1929         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1930
1931         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1932                           dma_to_mm_pfn(last_vpfn))) {
1933                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1934                 return -ENOMEM;
1935         }
1936
1937         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1938                  start, end, domain->id);
1939         /*
1940          * RMRR range might have overlap with physical memory range,
1941          * clear it first
1942          */
1943         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1944
1945         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1946                                   last_vpfn - first_vpfn + 1,
1947                                   DMA_PTE_READ|DMA_PTE_WRITE);
1948 }
1949
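/*
 * iommu_prepare_identity_map - identity-map the region from @start to @end
 * for @pdev and program its context entries, after sanity-checking the
 * BIOS-reported range.
 */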
1950 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1951                                       unsigned long long start,
1952                                       unsigned long long end)
1953 {
1954         struct dmar_domain *domain;
1955         int ret;
1956
1957         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1958         if (!domain)
1959                 return -ENOMEM;
1960
1961         /* For _hardware_ passthrough, don't bother. But for software
1962            passthrough, we do it anyway -- it may indicate a memory
1963            range which is reserved in E820, and so didn't get set
1964            up in si_domain to start with */
1965         if (domain == si_domain && hw_pass_through) {
1966                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1967                        pci_name(pdev), start, end);
1968                 return 0;
1969         }
1970
1971         printk(KERN_INFO
1972                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1973                pci_name(pdev), start, end);
1974
1975         if (end < start) {
1976                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1977                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1978                         dmi_get_system_info(DMI_BIOS_VENDOR),
1979                         dmi_get_system_info(DMI_BIOS_VERSION),
1980                         dmi_get_system_info(DMI_PRODUCT_VERSION));
1981                 ret = -EIO;
1982                 goto error;
1983         }
1984
1985         if (end >> agaw_to_width(domain->agaw)) {
1986                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1987                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1988                      agaw_to_width(domain->agaw),
1989                      dmi_get_system_info(DMI_BIOS_VENDOR),
1990                      dmi_get_system_info(DMI_BIOS_VERSION),
1991                      dmi_get_system_info(DMI_PRODUCT_VERSION));
1992                 ret = -EIO;
1993                 goto error;
1994         }
1995
1996         ret = iommu_domain_identity_map(domain, start, end);
1997         if (ret)
1998                 goto error;
1999
2000         /* context entry init */
2001         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2002         if (ret)
2003                 goto error;
2004
2005         return 0;
2006
2007  error:
2008         domain_exit(domain);
2009         return ret;
2010 }
2011
2012 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2013         struct pci_dev *pdev)
2014 {
2015         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2016                 return 0;
2017         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2018                 rmrr->end_address + 1);
2019 }
2020
2021 #ifdef CONFIG_DMAR_FLOPPY_WA
2022 static inline void iommu_prepare_isa(void)
2023 {
2024         struct pci_dev *pdev;
2025         int ret;
2026
2027         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2028         if (!pdev)
2029                 return;
2030
2031         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2032         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2033
2034         if (ret)
2035                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2036                        "floppy might not work\n");
2037
2038 }
2039 #else
2040 static inline void iommu_prepare_isa(void)
2041 {
2042         return;
2043 }
2044 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2045
2046 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2047
2048 static int __init si_domain_work_fn(unsigned long start_pfn,
2049                                     unsigned long end_pfn, void *datax)
2050 {
2051         int *ret = datax;
2052
2053         *ret = iommu_domain_identity_map(si_domain,
2054                                          (uint64_t)start_pfn << PAGE_SHIFT,
2055                                          (uint64_t)end_pfn << PAGE_SHIFT);
2056         return *ret;
2057
2058 }
2059
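/*
 * si_domain_init - create the static identity (si) domain, attach it to
 * every active IOMMU and, unless hardware pass-through is in use, 1:1-map
 * all usable physical memory into it.
 */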
2060 static int __init si_domain_init(int hw)
2061 {
2062         struct dmar_drhd_unit *drhd;
2063         struct intel_iommu *iommu;
2064         int nid, ret = 0;
2065
2066         si_domain = alloc_domain();
2067         if (!si_domain)
2068                 return -EFAULT;
2069
2070         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2071
2072         for_each_active_iommu(iommu, drhd) {
2073                 ret = iommu_attach_domain(si_domain, iommu);
2074                 if (ret) {
2075                         domain_exit(si_domain);
2076                         return -EFAULT;
2077                 }
2078         }
2079
2080         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2081                 domain_exit(si_domain);
2082                 return -EFAULT;
2083         }
2084
2085         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2086
2087         if (hw)
2088                 return 0;
2089
2090         for_each_online_node(nid) {
2091                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2092                 if (ret)
2093                         return ret;
2094         }
2095
2096         return 0;
2097 }
2098
2099 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2100                                           struct pci_dev *pdev);
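/* Return 1 if @pdev is currently attached to the static identity domain. */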
2101 static int identity_mapping(struct pci_dev *pdev)
2102 {
2103         struct device_domain_info *info;
2104
2105         if (likely(!iommu_identity_mapping))
2106                 return 0;
2107
2108
2109         list_for_each_entry(info, &si_domain->devices, link)
2110                 if (info->dev == pdev)
2111                         return 1;
2112         return 0;
2113 }
2114
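/*
 * domain_add_dev_info - attach @pdev to @domain: program its context entries
 * with the requested translation type and record the association on the
 * domain and global device lists.
 */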
2115 static int domain_add_dev_info(struct dmar_domain *domain,
2116                                struct pci_dev *pdev,
2117                                int translation)
2118 {
2119         struct device_domain_info *info;
2120         unsigned long flags;
2121         int ret;
2122
2123         info = alloc_devinfo_mem();
2124         if (!info)
2125                 return -ENOMEM;
2126
2127         ret = domain_context_mapping(domain, pdev, translation);
2128         if (ret) {
2129                 free_devinfo_mem(info);
2130                 return ret;
2131         }
2132
2133         info->segment = pci_domain_nr(pdev->bus);
2134         info->bus = pdev->bus->number;
2135         info->devfn = pdev->devfn;
2136         info->dev = pdev;
2137         info->domain = domain;
2138
2139         spin_lock_irqsave(&device_domain_lock, flags);
2140         list_add(&info->link, &domain->devices);
2141         list_add(&info->global, &device_domain_list);
2142         pdev->dev.archdata.iommu = info;
2143         spin_unlock_irqrestore(&device_domain_lock, flags);
2144
2145         return 0;
2146 }
2147
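/*
 * iommu_should_identity_map - decide whether @pdev may live in the 1:1
 * domain, based on the identity-mapping policy bits (IDENTMAP_ALL/GFX/AZALIA)
 * and on whether the device can safely be given a 1:1 mapping: not a legacy
 * PCI device behind a bridge, and 64-bit capable once its DMA mask is known.
 */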
2148 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2149 {
2150         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2151                 return 1;
2152
2153         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2154                 return 1;
2155
2156         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2157                 return 0;
2158
2159         /*
2160          * We want to start off with all devices in the 1:1 domain, and
2161          * take them out later if we find they can't access all of memory.
2162          *
2163          * However, we can't do this for PCI devices behind bridges,
2164          * because all PCI devices behind the same bridge will end up
2165          * with the same source-id on their transactions.
2166          *
2167          * Practically speaking, we can't change things around for these
2168          * devices at run-time, because we can't be sure there'll be no
2169          * DMA transactions in flight for any of their siblings.
2170          * 
2171          * So PCI devices (unless they're on the root bus) as well as
2172          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2173          * the 1:1 domain, just in _case_ one of their siblings turns out
2174          * not to be able to map all of memory.
2175          */
2176         if (!pci_is_pcie(pdev)) {
2177                 if (!pci_is_root_bus(pdev->bus))
2178                         return 0;
2179                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2180                         return 0;
2181         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2182                 return 0;
2183
2184         /* 
2185          * At boot time, we don't yet know if devices will be 64-bit capable.
2186          * Assume that they will -- if they turn out not to be, then we can 
2187          * take them out of the 1:1 domain later.
2188          */
2189         if (!startup)
2190                 return pdev->dma_mask > DMA_BIT_MASK(32);
2191
2192         return 1;
2193 }
2194
2195 static int __init iommu_prepare_static_identity_mapping(int hw)
2196 {
2197         struct pci_dev *pdev = NULL;
2198         int ret;
2199
2200         ret = si_domain_init(hw);
2201         if (ret)
2202                 return -EFAULT;
2203
2204         for_each_pci_dev(pdev) {
2205                 if (iommu_should_identity_map(pdev, 1)) {
2206                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2207                                hw ? "hardware" : "software", pci_name(pdev));
2208
2209                         ret = domain_add_dev_info(si_domain, pdev,
2210                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2211                                                      CONTEXT_TT_MULTI_LEVEL);
2212                         if (ret)
2213                                 return ret;
2214                 }
2215         }
2216
2217         return 0;
2218 }
2219
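/*
 * init_dmars - core VT-d initialization: allocate per-IOMMU state and root
 * entries, pick an invalidation mechanism, set up the static identity, RMRR
 * and ISA mappings, then enable translation on every unit.
 */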
2220 static int __init init_dmars(int force_on)
2221 {
2222         struct dmar_drhd_unit *drhd;
2223         struct dmar_rmrr_unit *rmrr;
2224         struct pci_dev *pdev;
2225         struct intel_iommu *iommu;
2226         int i, ret;
2227
2228         /*
2229          * for each drhd
2230          *    allocate root
2231          *    initialize and program root entry to not present
2232          * endfor
2233          */
2234         for_each_drhd_unit(drhd) {
2235                 g_num_of_iommus++;
2236                 /*
2237                  * lock not needed as this is only incremented in the
2238                  * single-threaded kernel __init code path; all other
2239                  * accesses are read-only
2240                  */
2241         }
2242
2243         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2244                         GFP_KERNEL);
2245         if (!g_iommus) {
2246                 printk(KERN_ERR "Allocating global iommu array failed\n");
2247                 ret = -ENOMEM;
2248                 goto error;
2249         }
2250
2251         deferred_flush = kzalloc(g_num_of_iommus *
2252                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2253         if (!deferred_flush) {
2254                 ret = -ENOMEM;
2255                 goto error;
2256         }
2257
2258         for_each_drhd_unit(drhd) {
2259                 if (drhd->ignored)
2260                         continue;
2261
2262                 iommu = drhd->iommu;
2263                 g_iommus[iommu->seq_id] = iommu;
2264
2265                 ret = iommu_init_domains(iommu);
2266                 if (ret)
2267                         goto error;
2268
2269                 /*
2270                  * TBD:
2271                  * we could share the same root & context tables
2272                  * among all IOMMUs; need to split it later.
2273                  */
2274                 ret = iommu_alloc_root_entry(iommu);
2275                 if (ret) {
2276                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2277                         goto error;
2278                 }
2279                 if (!ecap_pass_through(iommu->ecap))
2280                         hw_pass_through = 0;
2281         }
2282
2283         /*
2284          * Start from a sane IOMMU hardware state.
2285          */
2286         for_each_drhd_unit(drhd) {
2287                 if (drhd->ignored)
2288                         continue;
2289
2290                 iommu = drhd->iommu;
2291
2292                 /*
2293                  * If queued invalidation was already initialized by us
2294                  * (for example, while enabling interrupt-remapping), then
2295                  * things are already rolling from a sane state.
2296                  */
2297                 if (iommu->qi)
2298                         continue;
2299
2300                 /*
2301                  * Clear any previous faults.
2302                  */
2303                 dmar_fault(-1, iommu);
2304                 /*
2305                  * Disable queued invalidation if supported and already enabled
2306                  * before OS handover.
2307                  */
2308                 dmar_disable_qi(iommu);
2309         }
2310
2311         for_each_drhd_unit(drhd) {
2312                 if (drhd->ignored)
2313                         continue;
2314
2315                 iommu = drhd->iommu;
2316
2317                 if (dmar_enable_qi(iommu)) {
2318                         /*
2319                          * Queued Invalidate not enabled, use Register Based
2320                          * Invalidate
2321                          */
2322                         iommu->flush.flush_context = __iommu_flush_context;
2323                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2324                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2325                                "invalidation\n",
2326                                 iommu->seq_id,
2327                                (unsigned long long)drhd->reg_base_addr);
2328                 } else {
2329                         iommu->flush.flush_context = qi_flush_context;
2330                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2331                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2332                                "invalidation\n",
2333                                 iommu->seq_id,
2334                                (unsigned long long)drhd->reg_base_addr);
2335                 }
2336         }
2337
2338         if (iommu_pass_through)
2339                 iommu_identity_mapping |= IDENTMAP_ALL;
2340
2341 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2342         iommu_identity_mapping |= IDENTMAP_GFX;
2343 #endif
2344
2345         check_tylersburg_isoch();
2346
2347         /*
2348          * If pass-through is not set or not enabled, set up context entries
2349          * for identity mappings for RMRR, gfx and ISA, and possibly fall back
2350          * to static identity mapping if iommu_identity_mapping is set.
2351          */
2352         if (iommu_identity_mapping) {
2353                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2354                 if (ret) {
2355                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2356                         goto error;
2357                 }
2358         }
2359         /*
2360          * For each rmrr
2361          *   for each dev attached to rmrr
2362          *   do
2363          *     locate drhd for dev, alloc domain for dev
2364          *     allocate free domain
2365          *     allocate page table entries for rmrr
2366          *     if context not allocated for bus
2367          *           allocate and init context
2368          *           set present in root table for this bus
2369          *     init context with domain, translation etc
2370          *    endfor
2371          * endfor
2372          */
2373         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2374         for_each_rmrr_units(rmrr) {
2375                 for (i = 0; i < rmrr->devices_cnt; i++) {
2376                         pdev = rmrr->devices[i];
2377                         /*
2378                          * some BIOSes list non-existent devices in the
2379                          * DMAR table.
2380                          */
2381                         if (!pdev)
2382                                 continue;
2383                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2384                         if (ret)
2385                                 printk(KERN_ERR
2386                                        "IOMMU: mapping reserved region failed\n");
2387                 }
2388         }
2389
2390         iommu_prepare_isa();
2391
2392         /*
2393          * for each drhd
2394          *   enable fault log
2395          *   global invalidate context cache
2396          *   global invalidate iotlb
2397          *   enable translation
2398          */
2399         for_each_drhd_unit(drhd) {
2400                 if (drhd->ignored) {
2401                         /*
2402                          * we always have to disable PMRs or DMA may fail on
2403                          * this device
2404                          */
2405                         if (force_on)
2406                                 iommu_disable_protect_mem_regions(drhd->iommu);
2407                         continue;
2408                 }
2409                 iommu = drhd->iommu;
2410
2411                 iommu_flush_write_buffer(iommu);
2412
2413                 ret = dmar_set_interrupt(iommu);
2414                 if (ret)
2415                         goto error;
2416
2417                 iommu_set_root_entry(iommu);
2418
2419                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2420                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2421
2422                 ret = iommu_enable_translation(iommu);
2423                 if (ret)
2424                         goto error;
2425
2426                 iommu_disable_protect_mem_regions(iommu);
2427         }
2428
2429         return 0;
2430 error:
2431         for_each_drhd_unit(drhd) {
2432                 if (drhd->ignored)
2433                         continue;
2434                 iommu = drhd->iommu;
2435                 free_iommu(iommu);
2436         }
2437         kfree(g_iommus);
2438         return ret;
2439 }
2440
2441 /* This takes a number of _MM_ pages, not VTD pages */
2442 static struct iova *intel_alloc_iova(struct device *dev,
2443                                      struct dmar_domain *domain,
2444                                      unsigned long nrpages, uint64_t dma_mask)
2445 {
2446         struct pci_dev *pdev = to_pci_dev(dev);
2447         struct iova *iova = NULL;
2448
2449         /* Restrict dma_mask to the width that the iommu can handle */
2450         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2451
2452         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2453                 /*
2454                  * First try to allocate an io virtual address in
2455                  * DMA_BIT_MASK(32) and if that fails then try allocating
2456                  * from higher range
2457                  */
2458                 iova = alloc_iova(&domain->iovad, nrpages,
2459                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2460                 if (iova)
2461                         return iova;
2462         }
2463         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2464         if (unlikely(!iova)) {
2465                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2466                        nrpages, pci_name(pdev));
2467                 return NULL;
2468         }
2469
2470         return iova;
2471 }
2472
2473 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2474 {
2475         struct dmar_domain *domain;
2476         int ret;
2477
2478         domain = get_domain_for_dev(pdev,
2479                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2480         if (!domain) {
2481                 printk(KERN_ERR
2482                         "Allocating domain for %s failed", pci_name(pdev));
2483                 return NULL;
2484         }
2485
2486         /* make sure context mapping is ok */
2487         if (unlikely(!domain_context_mapped(pdev))) {
2488                 ret = domain_context_mapping(domain, pdev,
2489                                              CONTEXT_TT_MULTI_LEVEL);
2490                 if (ret) {
2491                         printk(KERN_ERR
2492                                 "Domain context map for %s failed",
2493                                 pci_name(pdev));
2494                         return NULL;
2495                 }
2496         }
2497
2498         return domain;
2499 }
2500
2501 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2502 {
2503         struct device_domain_info *info;
2504
2505         /* No lock here, assumes no domain exit in normal case */
2506         info = dev->dev.archdata.iommu;
2507         if (likely(info))
2508                 return info->domain;
2509
2510         return __get_valid_domain_for_dev(dev);
2511 }
2512
2513 static int iommu_dummy(struct pci_dev *pdev)
2514 {
2515         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2516 }
2517
2518 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2519 static int iommu_no_mapping(struct device *dev)
2520 {
2521         struct pci_dev *pdev;
2522         int found;
2523
2524         if (unlikely(dev->bus != &pci_bus_type))
2525                 return 1;
2526
2527         pdev = to_pci_dev(dev);
2528         if (iommu_dummy(pdev))
2529                 return 1;
2530
2531         if (!iommu_identity_mapping)
2532                 return 0;
2533
2534         found = identity_mapping(pdev);
2535         if (found) {
2536                 if (iommu_should_identity_map(pdev, 0))
2537                         return 1;
2538                 else {
2539                         /*
2540                          * The 32-bit DMA device is removed from si_domain
2541                          * and falls back to non-identity mapping.
2542                          */
2543                         domain_remove_one_dev_info(si_domain, pdev);
2544                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2545                                pci_name(pdev));
2546                         return 0;
2547                 }
2548         } else {
2549                 /*
2550                  * If a 64-bit DMA device has been detached from a VM, the
2551                  * device is put into si_domain for identity mapping.
2552                  */
2553                 if (iommu_should_identity_map(pdev, 0)) {
2554                         int ret;
2555                         ret = domain_add_dev_info(si_domain, pdev,
2556                                                   hw_pass_through ?
2557                                                   CONTEXT_TT_PASS_THROUGH :
2558                                                   CONTEXT_TT_MULTI_LEVEL);
2559                         if (!ret) {
2560                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2561                                        pci_name(pdev));
2562                                 return 1;
2563                         }
2564                 }
2565         }
2566
2567         return 0;
2568 }
2569
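/*
 * __intel_map_single - map @size bytes at physical address @paddr for DMA by
 * @hwdev: allocate an IOVA range that fits @dma_mask, install the PTEs and
 * return the resulting bus address (the physical address itself for devices
 * that need no mapping), or 0 on failure.
 */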
2570 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2571                                      size_t size, int dir, u64 dma_mask)
2572 {
2573         struct pci_dev *pdev = to_pci_dev(hwdev);
2574         struct dmar_domain *domain;
2575         phys_addr_t start_paddr;
2576         struct iova *iova;
2577         int prot = 0;
2578         int ret;
2579         struct intel_iommu *iommu;
2580         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2581
2582         BUG_ON(dir == DMA_NONE);
2583
2584         if (iommu_no_mapping(hwdev))
2585                 return paddr;
2586
2587         domain = get_valid_domain_for_dev(pdev);
2588         if (!domain)
2589                 return 0;
2590
2591         iommu = domain_get_iommu(domain);
2592         size = aligned_nrpages(paddr, size);
2593
2594         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2595                                 pdev->dma_mask);
2596         if (!iova)
2597                 goto error;
2598
2599         /*
2600          * Check if DMAR supports zero-length reads on write-only
2601          * mappings.
2602          */
2603         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2604                         !cap_zlr(iommu->cap))
2605                 prot |= DMA_PTE_READ;
2606         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2607                 prot |= DMA_PTE_WRITE;
2608         /*
2609          * paddr .. (paddr + size) might span a partial page, so map the whole
2610          * page.  Note: if two parts of one page are separately mapped, we
2611          * might have two guest addresses mapping to the same host paddr, but
2612          * this is not a big problem
2613          */
2614         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2615                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2616         if (ret)
2617                 goto error;
2618
2619         /* it's a non-present to present mapping. Only flush if caching mode */
2620         if (cap_caching_mode(iommu->cap))
2621                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2622         else
2623                 iommu_flush_write_buffer(iommu);
2624
2625         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2626         start_paddr += paddr & ~PAGE_MASK;
2627         return start_paddr;
2628
2629 error:
2630         if (iova)
2631                 __free_iova(&domain->iovad, iova);
2632         printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2633                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2634         return 0;
2635 }
2636
2637 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2638                                  unsigned long offset, size_t size,
2639                                  enum dma_data_direction dir,
2640                                  struct dma_attrs *attrs)
2641 {
2642         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2643                                   dir, to_pci_dev(dev)->dma_mask);
2644 }
2645
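/*
 * flush_unmaps - drain the deferred-unmap queue: invalidate the IOTLB for
 * every queued IOVA range and release the IOVAs.  Called with
 * async_umap_flush_lock held.
 */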
2646 static void flush_unmaps(void)
2647 {
2648         int i, j;
2649
2650         timer_on = 0;
2651
2652         /* just flush them all */
2653         for (i = 0; i < g_num_of_iommus; i++) {
2654                 struct intel_iommu *iommu = g_iommus[i];
2655                 if (!iommu)
2656                         continue;
2657
2658                 if (!deferred_flush[i].next)
2659                         continue;
2660
2661                 /* In caching mode, global flushes make emulation expensive */
2662                 if (!cap_caching_mode(iommu->cap))
2663                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2664                                          DMA_TLB_GLOBAL_FLUSH);
2665                 for (j = 0; j < deferred_flush[i].next; j++) {
2666                         unsigned long mask;
2667                         struct iova *iova = deferred_flush[i].iova[j];
2668                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2669
2670                         /* On real hardware multiple invalidations are expensive */
2671                         if (cap_caching_mode(iommu->cap))
2672                                 iommu_flush_iotlb_psi(iommu, domain->id,
2673                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2674                         else {
2675                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2676                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2677                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2678                         }
2679                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2680                 }
2681                 deferred_flush[i].next = 0;
2682         }
2683
2684         list_size = 0;
2685 }
2686
2687 static void flush_unmaps_timeout(unsigned long data)
2688 {
2689         unsigned long flags;
2690
2691         spin_lock_irqsave(&async_umap_flush_lock, flags);
2692         flush_unmaps();
2693         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2694 }
2695
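/*
 * add_unmap - queue @iova for deferred freeing; the batched flush runs from
 * the unmap timer (10ms) or as soon as the queue hits HIGH_WATER_MARK.
 */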
2696 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2697 {
2698         unsigned long flags;
2699         int next, iommu_id;
2700         struct intel_iommu *iommu;
2701
2702         spin_lock_irqsave(&async_umap_flush_lock, flags);
2703         if (list_size == HIGH_WATER_MARK)
2704                 flush_unmaps();
2705
2706         iommu = domain_get_iommu(dom);
2707         iommu_id = iommu->seq_id;
2708
2709         next = deferred_flush[iommu_id].next;
2710         deferred_flush[iommu_id].domain[next] = dom;
2711         deferred_flush[iommu_id].iova[next] = iova;
2712         deferred_flush[iommu_id].next++;
2713
2714         if (!timer_on) {
2715                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2716                 timer_on = 1;
2717         }
2718         list_size++;
2719         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2720 }
2721
2722 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2723                              size_t size, enum dma_data_direction dir,
2724                              struct dma_attrs *attrs)
2725 {
2726         struct pci_dev *pdev = to_pci_dev(dev);
2727         struct dmar_domain *domain;
2728         unsigned long start_pfn, last_pfn;
2729         struct iova *iova;
2730         struct intel_iommu *iommu;
2731
2732         if (iommu_no_mapping(dev))
2733                 return;
2734
2735         domain = find_domain(pdev);
2736         BUG_ON(!domain);
2737
2738         iommu = domain_get_iommu(domain);
2739
2740         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2741         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2742                       (unsigned long long)dev_addr))
2743                 return;
2744
2745         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2746         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2747
2748         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2749                  pci_name(pdev), start_pfn, last_pfn);
2750
2751         /*  clear the whole page */
2752         dma_pte_clear_range(domain, start_pfn, last_pfn);
2753
2754         /* free page tables */
2755         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2756
2757         if (intel_iommu_strict) {
2758                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2759                                       last_pfn - start_pfn + 1, 0);
2760                 /* free iova */
2761                 __free_iova(&domain->iovad, iova);
2762         } else {
2763                 add_unmap(domain, iova);
2764                 /*
2765                  * queue up the release of the unmap to save the roughly 1/6th
2766                  * of the CPU time used up by the iotlb flush operation...
2767                  */
2768         }
2769 }
2770
2771 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2772                                   dma_addr_t *dma_handle, gfp_t flags)
2773 {
2774         void *vaddr;
2775         int order;
2776
2777         size = PAGE_ALIGN(size);
2778         order = get_order(size);
2779
2780         if (!iommu_no_mapping(hwdev))
2781                 flags &= ~(GFP_DMA | GFP_DMA32);
2782         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2783                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2784                         flags |= GFP_DMA;
2785                 else
2786                         flags |= GFP_DMA32;
2787         }
2788
2789         vaddr = (void *)__get_free_pages(flags, order);
2790         if (!vaddr)
2791                 return NULL;
2792         memset(vaddr, 0, size);
2793
2794         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2795                                          DMA_BIDIRECTIONAL,
2796                                          hwdev->coherent_dma_mask);
2797         if (*dma_handle)
2798                 return vaddr;
2799         free_pages((unsigned long)vaddr, order);
2800         return NULL;
2801 }
2802
2803 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2804                                 dma_addr_t dma_handle)
2805 {
2806         int order;
2807
2808         size = PAGE_ALIGN(size);
2809         order = get_order(size);
2810
2811         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2812         free_pages((unsigned long)vaddr, order);
2813 }
2814
2815 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2816                            int nelems, enum dma_data_direction dir,
2817                            struct dma_attrs *attrs)
2818 {
2819         struct pci_dev *pdev = to_pci_dev(hwdev);
2820         struct dmar_domain *domain;
2821         unsigned long start_pfn, last_pfn;
2822         struct iova *iova;
2823         struct intel_iommu *iommu;
2824
2825         if (iommu_no_mapping(hwdev))
2826                 return;
2827
2828         domain = find_domain(pdev);
2829         BUG_ON(!domain);
2830
2831         iommu = domain_get_iommu(domain);
2832
2833         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2834         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2835                       (unsigned long long)sglist[0].dma_address))
2836                 return;
2837
2838         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2839         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2840
2841         /*  clear the whole page */
2842         dma_pte_clear_range(domain, start_pfn, last_pfn);
2843
2844         /* free page tables */
2845         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2846
2847         if (intel_iommu_strict) {
2848                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2849                                       last_pfn - start_pfn + 1, 0);
2850                 /* free iova */
2851                 __free_iova(&domain->iovad, iova);
2852         } else {
2853                 add_unmap(domain, iova);
2854                 /*
2855                  * queue up the release of the unmap to save the roughly 1/6th
2856                  * of the CPU time used up by the iotlb flush operation...
2857                  */
2858         }
2859 }
2860
2861 static int intel_nontranslate_map_sg(struct device *hddev,
2862         struct scatterlist *sglist, int nelems, int dir)
2863 {
2864         int i;
2865         struct scatterlist *sg;
2866
2867         for_each_sg(sglist, sg, nelems, i) {
2868                 BUG_ON(!sg_page(sg));
2869                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2870                 sg->dma_length = sg->length;
2871         }
2872         return nelems;
2873 }
2874
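/*
 * intel_map_sg - map a scatterlist: allocate one IOVA range big enough for
 * all segments, map them contiguously in DMA space and flush the IOTLB or
 * write buffer as required by caching mode.
 */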
2875 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2876                         enum dma_data_direction dir, struct dma_attrs *attrs)
2877 {
2878         int i;
2879         struct pci_dev *pdev = to_pci_dev(hwdev);
2880         struct dmar_domain *domain;
2881         size_t size = 0;
2882         int prot = 0;
2883         struct iova *iova = NULL;
2884         int ret;
2885         struct scatterlist *sg;
2886         unsigned long start_vpfn;
2887         struct intel_iommu *iommu;
2888
2889         BUG_ON(dir == DMA_NONE);
2890         if (iommu_no_mapping(hwdev))
2891                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2892
2893         domain = get_valid_domain_for_dev(pdev);
2894         if (!domain)
2895                 return 0;
2896
2897         iommu = domain_get_iommu(domain);
2898
2899         for_each_sg(sglist, sg, nelems, i)
2900                 size += aligned_nrpages(sg->offset, sg->length);
2901
2902         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2903                                 pdev->dma_mask);
2904         if (!iova) {
2905                 sglist->dma_length = 0;
2906                 return 0;
2907         }
2908
2909         /*
2910          * Check if DMAR supports zero-length reads on write-only
2911          * mappings.
2912          */
2913         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2914                         !cap_zlr(iommu->cap))
2915                 prot |= DMA_PTE_READ;
2916         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2917                 prot |= DMA_PTE_WRITE;
2918
2919         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2920
2921         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2922         if (unlikely(ret)) {
2923                 /*  clear the page */
2924                 dma_pte_clear_range(domain, start_vpfn,
2925                                     start_vpfn + size - 1);
2926                 /* free page tables */
2927                 dma_pte_free_pagetable(domain, start_vpfn,
2928                                        start_vpfn + size - 1);
2929                 /* free iova */
2930                 __free_iova(&domain->iovad, iova);
2931                 return 0;
2932         }
2933
2934         /* it's a non-present to present mapping. Only flush if caching mode */
2935         if (cap_caching_mode(iommu->cap))
2936                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
2937         else
2938                 iommu_flush_write_buffer(iommu);
2939
2940         return nelems;
2941 }
2942
2943 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2944 {
2945         return !dma_addr;
2946 }
2947
2948 struct dma_map_ops intel_dma_ops = {
2949         .alloc_coherent = intel_alloc_coherent,
2950         .free_coherent = intel_free_coherent,
2951         .map_sg = intel_map_sg,
2952         .unmap_sg = intel_unmap_sg,
2953         .map_page = intel_map_page,
2954         .unmap_page = intel_unmap_page,
2955         .mapping_error = intel_mapping_error,
2956 };
2957
2958 static inline int iommu_domain_cache_init(void)
2959 {
2960         int ret = 0;
2961
2962         iommu_domain_cache = kmem_cache_create("iommu_domain",
2963                                          sizeof(struct dmar_domain),
2964                                          0,
2965                                          SLAB_HWCACHE_ALIGN,
2967                                          NULL);
2968         if (!iommu_domain_cache) {
2969                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2970                 ret = -ENOMEM;
2971         }
2972
2973         return ret;
2974 }
2975
2976 static inline int iommu_devinfo_cache_init(void)
2977 {
2978         int ret = 0;
2979
2980         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2981                                          sizeof(struct device_domain_info),
2982                                          0,
2983                                          SLAB_HWCACHE_ALIGN,
2984                                          NULL);
2985         if (!iommu_devinfo_cache) {
2986                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2987                 ret = -ENOMEM;
2988         }
2989
2990         return ret;
2991 }
2992
2993 static inline int iommu_iova_cache_init(void)
2994 {
2995         int ret = 0;
2996
2997         iommu_iova_cache = kmem_cache_create("iommu_iova",
2998                                          sizeof(struct iova),
2999                                          0,
3000                                          SLAB_HWCACHE_ALIGN,
3001                                          NULL);
3002         if (!iommu_iova_cache) {
3003                 printk(KERN_ERR "Couldn't create iova cache\n");
3004                 ret = -ENOMEM;
3005         }
3006
3007         return ret;
3008 }
3009
3010 static int __init iommu_init_mempool(void)
3011 {
3012         int ret;
3013         ret = iommu_iova_cache_init();
3014         if (ret)
3015                 return ret;
3016
3017         ret = iommu_domain_cache_init();
3018         if (ret)
3019                 goto domain_error;
3020
3021         ret = iommu_devinfo_cache_init();
3022         if (!ret)
3023                 return ret;
3024
3025         kmem_cache_destroy(iommu_domain_cache);
3026 domain_error:
3027         kmem_cache_destroy(iommu_iova_cache);
3028
3029         return -ENOMEM;
3030 }
3031
3032 static void __init iommu_exit_mempool(void)
3033 {
3034         kmem_cache_destroy(iommu_devinfo_cache);
3035         kmem_cache_destroy(iommu_domain_cache);
3036         kmem_cache_destroy(iommu_iova_cache);
3038 }
3039
3040 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3041 {
3042         struct dmar_drhd_unit *drhd;
3043         u32 vtbar;
3044         int rc;
3045
3046         /* We know that this device on this chipset has its own IOMMU.
3047          * If we find it under a different IOMMU, then the BIOS is lying
3048          * to us. Hope that the IOMMU for this device is actually
3049          * disabled, and it needs no translation...
3050          */
3051         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3052         if (rc) {
3053                 /* "can't" happen */
3054                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3055                 return;
3056         }
3057         vtbar &= 0xffff0000;
3058
3059         /* we know that this iommu should be at offset 0xa000 from vtbar */
3060         drhd = dmar_find_matched_drhd_unit(pdev);
3061         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3062                             TAINT_FIRMWARE_WORKAROUND,
3063                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3064                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3065 }
3066 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3067
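/*
 * Mark DRHD units that cover no PCI devices as ignored.  Unless
 * dmar_map_gfx is set, also ignore units that exist only for graphics
 * devices, marking those devices with DUMMY_DEVICE_DOMAIN_INFO so they
 * bypass translation.
 */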
3068 static void __init init_no_remapping_devices(void)
3069 {
3070         struct dmar_drhd_unit *drhd;
3071
3072         for_each_drhd_unit(drhd) {
3073                 if (!drhd->include_all) {
3074                         int i;
3075                         for (i = 0; i < drhd->devices_cnt; i++)
3076                                 if (drhd->devices[i] != NULL)
3077                                         break;
3078                         /* ignore DMAR unit if no pci devices exist */
3079                         if (i == drhd->devices_cnt)
3080                                 drhd->ignored = 1;
3081                 }
3082         }
3083
3084         if (dmar_map_gfx)
3085                 return;
3086
3087         for_each_drhd_unit(drhd) {
3088                 int i;
3089                 if (drhd->ignored || drhd->include_all)
3090                         continue;
3091
3092                 for (i = 0; i < drhd->devices_cnt; i++)
3093                         if (drhd->devices[i] &&
3094                                 !IS_GFX_DEVICE(drhd->devices[i]))
3095                                 break;
3096
3097                 if (i < drhd->devices_cnt)
3098                         continue;
3099
3100                 /* bypass IOMMU if it is just for gfx devices */
3101                 drhd->ignored = 1;
3102                 for (i = 0; i < drhd->devices_cnt; i++) {
3103                         if (!drhd->devices[i])
3104                                 continue;
3105                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3106                 }
3107         }
3108 }
3109
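/*
 * Suspend/resume support: only the fault-event registers are saved and
 * restored per iommu; the rest of the hardware state is rebuilt from
 * scratch by init_iommu_hw() on resume.
 */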
3110 #ifdef CONFIG_SUSPEND
3111 static int init_iommu_hw(void)
3112 {
3113         struct dmar_drhd_unit *drhd;
3114         struct intel_iommu *iommu = NULL;
3115
3116         for_each_active_iommu(iommu, drhd)
3117                 if (iommu->qi)
3118                         dmar_reenable_qi(iommu);
3119
3120         for_each_active_iommu(iommu, drhd) {
3121                 iommu_flush_write_buffer(iommu);
3122
3123                 iommu_set_root_entry(iommu);
3124
3125                 iommu->flush.flush_context(iommu, 0, 0, 0,
3126                                            DMA_CCMD_GLOBAL_INVL);
3127                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3128                                          DMA_TLB_GLOBAL_FLUSH);
3129                 iommu_enable_translation(iommu);
3130                 iommu_disable_protect_mem_regions(iommu);
3131         }
3132
3133         return 0;
3134 }
3135
3136 static void iommu_flush_all(void)
3137 {
3138         struct dmar_drhd_unit *drhd;
3139         struct intel_iommu *iommu;
3140
3141         for_each_active_iommu(iommu, drhd) {
3142                 iommu->flush.flush_context(iommu, 0, 0, 0,
3143                                            DMA_CCMD_GLOBAL_INVL);
3144                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3145                                          DMA_TLB_GLOBAL_FLUSH);
3146         }
3147 }
3148
3149 static int iommu_suspend(void)
3150 {
3151         struct dmar_drhd_unit *drhd;
3152         struct intel_iommu *iommu = NULL;
3153         unsigned long flag;
3154
3155         for_each_active_iommu(iommu, drhd) {
3156                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3157                                                  GFP_ATOMIC);
3158                 if (!iommu->iommu_state)
3159                         goto nomem;
3160         }
3161
3162         iommu_flush_all();
3163
3164         for_each_active_iommu(iommu, drhd) {
3165                 iommu_disable_translation(iommu);
3166
3167                 spin_lock_irqsave(&iommu->register_lock, flag);
3168
3169                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3170                         readl(iommu->reg + DMAR_FECTL_REG);
3171                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3172                         readl(iommu->reg + DMAR_FEDATA_REG);
3173                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3174                         readl(iommu->reg + DMAR_FEADDR_REG);
3175                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3176                         readl(iommu->reg + DMAR_FEUADDR_REG);
3177
3178                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3179         }
3180         return 0;
3181
3182 nomem:
3183         for_each_active_iommu(iommu, drhd)
3184                 kfree(iommu->iommu_state);
3185
3186         return -ENOMEM;
3187 }
3188
3189 static void iommu_resume(void)
3190 {
3191         struct dmar_drhd_unit *drhd;
3192         struct intel_iommu *iommu = NULL;
3193         unsigned long flag;
3194
3195         if (init_iommu_hw()) {
3196                 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3197                 return;
3198         }
3199
3200         for_each_active_iommu(iommu, drhd) {
3202                 spin_lock_irqsave(&iommu->register_lock, flag);
3203
3204                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3205                         iommu->reg + DMAR_FECTL_REG);
3206                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3207                         iommu->reg + DMAR_FEDATA_REG);
3208                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3209                         iommu->reg + DMAR_FEADDR_REG);
3210                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3211                         iommu->reg + DMAR_FEUADDR_REG);
3212
3213                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3214         }
3215
3216         for_each_active_iommu(iommu, drhd)
3217                 kfree(iommu->iommu_state);
3218 }
3219
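/*
 * syscore suspend callbacks run late, on a single CPU with interrupts
 * disabled, hence the GFP_ATOMIC allocation in iommu_suspend() above.
 */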
3220 static struct syscore_ops iommu_syscore_ops = {
3221         .resume         = iommu_resume,
3222         .suspend        = iommu_suspend,
3223 };
3224
3225 static void __init init_iommu_pm_ops(void)
3226 {
3227         register_syscore_ops(&iommu_syscore_ops);
3228 }
3229
3230 #else
3231 static inline void init_iommu_pm_ops(void) { }
3232 #endif  /* CONFIG_SUSPEND */
3233
3234 /*
3235  * Here we only respond to a device being unbound from its driver.
3236  *
3237  * A newly added device is not attached to its DMAR domain here yet; that
3238  * happens when the device is first mapped to an iova.
3239  */
3240 static int device_notifier(struct notifier_block *nb,
3241                                   unsigned long action, void *data)
3242 {
3243         struct device *dev = data;
3244         struct pci_dev *pdev = to_pci_dev(dev);
3245         struct dmar_domain *domain;
3246
3247         if (iommu_no_mapping(dev))
3248                 return 0;
3249
3250         domain = find_domain(pdev);
3251         if (!domain)
3252                 return 0;
3253
3254         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3255                 domain_remove_one_dev_info(domain, pdev);
3256
3257                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3258                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3259                     list_empty(&domain->devices))
3260                         domain_exit(domain);
3261         }
3262
3263         return 0;
3264 }
3265
3266 static struct notifier_block device_nb = {
3267         .notifier_call = device_notifier,
3268 };
3269
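/*
 * Main DMA-remapping initialisation: parse the DMAR table, bring the
 * hardware up via init_dmars(), and switch the DMA API over to
 * intel_dma_ops.
 */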
3270 int __init intel_iommu_init(void)
3271 {
3272         int ret = 0;
3273         int force_on = 0;
3274
3275         /* VT-d is required for a TXT/tboot launch, so enforce that */
3276         force_on = tboot_force_iommu();
3277
3278         if (dmar_table_init()) {
3279                 if (force_on)
3280                         panic("tboot: Failed to initialize DMAR table\n");
3281                 return  -ENODEV;
3282         }
3283
3284         if (dmar_dev_scope_init()) {
3285                 if (force_on)
3286                         panic("tboot: Failed to initialize DMAR device scope\n");
3287                 return  -ENODEV;
3288         }
3289
3290         /*
3291          * Check the need for DMA-remapping initialization now.
3292          * Above initialization will also be used by Interrupt-remapping.
3293          */
3294         if (no_iommu || dmar_disabled)
3295                 return -ENODEV;
3296
3297         if (iommu_init_mempool()) {
3298                 if (force_on)
3299                         panic("tboot: Failed to initialize iommu memory\n");
3300                 return  -ENODEV;
3301         }
3302
3303         if (dmar_init_reserved_ranges()) {
3304                 if (force_on)
3305                         panic("tboot: Failed to reserve iommu ranges\n");
3306                 return  -ENODEV;
3307         }
3308
3309         init_no_remapping_devices();
3310
3311         ret = init_dmars(force_on);
3312         if (ret) {
3313                 if (force_on)
3314                         panic("tboot: Failed to initialize DMARs\n");
3315                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3316                 put_iova_domain(&reserved_iova_list);
3317                 iommu_exit_mempool();
3318                 return ret;
3319         }
3320         printk(KERN_INFO
3321         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3322
3323         init_timer(&unmap_timer);
3324 #ifdef CONFIG_SWIOTLB
3325         swiotlb = 0;
3326 #endif
3327         dma_ops = &intel_dma_ops;
3328
3329         init_iommu_pm_ops();
3330
3331         register_iommu(&intel_iommu_ops);
3332
3333         bus_register_notifier(&pci_bus_type, &device_nb);
3334
3335         return 0;
3336 }
3337
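/*
 * If the device sits behind a PCIe-to-PCI bridge, context entries were
 * also set up for the bridge(s) on its behalf; tear those down when the
 * device leaves the domain.
 */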
3338 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3339                                            struct pci_dev *pdev)
3340 {
3341         struct pci_dev *tmp, *parent;
3342
3343         if (!iommu || !pdev)
3344                 return;
3345
3346         /* dependent device detach */
3347         tmp = pci_find_upstream_pcie_bridge(pdev);
3348         /* Secondary interface's bus number and devfn 0 */
3349         if (tmp) {
3350                 parent = pdev->bus->self;
3351                 while (parent != tmp) {
3352                         iommu_detach_dev(iommu, parent->bus->number,
3353                                          parent->devfn);
3354                         parent = parent->bus->self;
3355                 }
3356                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3357                         iommu_detach_dev(iommu,
3358                                 tmp->subordinate->number, 0);
3359                 else /* this is a legacy PCI bridge */
3360                         iommu_detach_dev(iommu, tmp->bus->number,
3361                                          tmp->devfn);
3362         }
3363 }
3364
3365 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3366                                           struct pci_dev *pdev)
3367 {
3368         struct device_domain_info *info;
3369         struct intel_iommu *iommu;
3370         unsigned long flags;
3371         int found = 0;
3372         struct list_head *entry, *tmp;
3373
3374         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3375                                 pdev->devfn);
3376         if (!iommu)
3377                 return;
3378
3379         spin_lock_irqsave(&device_domain_lock, flags);
3380         list_for_each_safe(entry, tmp, &domain->devices) {
3381                 info = list_entry(entry, struct device_domain_info, link);
3382                 /* No need to compare PCI domain; it has to be the same */
3383                 if (info->bus == pdev->bus->number &&
3384                     info->devfn == pdev->devfn) {
3385                         list_del(&info->link);
3386                         list_del(&info->global);
3387                         if (info->dev)
3388                                 info->dev->dev.archdata.iommu = NULL;
3389                         spin_unlock_irqrestore(&device_domain_lock, flags);
3390
3391                         iommu_disable_dev_iotlb(info);
3392                         iommu_detach_dev(iommu, info->bus, info->devfn);
3393                         iommu_detach_dependent_devices(iommu, pdev);
3394                         free_devinfo_mem(info);
3395
3396                         spin_lock_irqsave(&device_domain_lock, flags);
3397
3398                         if (found)
3399                                 break;
3400                         else
3401                                 continue;
3402                 }
3403
3404                 /* If there are no other devices under the same iommu
3405                  * owned by this domain, clear this iommu from iommu_bmp
3406                  * and update the iommu count and coherency.
3407                  */
3408                 if (iommu == device_to_iommu(info->segment, info->bus,
3409                                             info->devfn))
3410                         found = 1;
3411         }
3412
3413         if (found == 0) {
3414                 unsigned long tmp_flags;
3415                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3416                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3417                 domain->iommu_count--;
3418                 domain_update_iommu_cap(domain);
3419                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3420
3421                 spin_lock_irqsave(&iommu->lock, tmp_flags);
3422                 clear_bit(domain->id, iommu->domain_ids);
3423                 iommu->domains[domain->id] = NULL;
3424                 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3425         }
3426
3427         spin_unlock_irqrestore(&device_domain_lock, flags);
3428 }
3429
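/*
 * Detach every remaining device from a VM domain, dropping each hardware
 * iommu from the domain's bitmap once its last device is gone.
 */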
3430 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3431 {
3432         struct device_domain_info *info;
3433         struct intel_iommu *iommu;
3434         unsigned long flags1, flags2;
3435
3436         spin_lock_irqsave(&device_domain_lock, flags1);
3437         while (!list_empty(&domain->devices)) {
3438                 info = list_entry(domain->devices.next,
3439                         struct device_domain_info, link);
3440                 list_del(&info->link);
3441                 list_del(&info->global);
3442                 if (info->dev)
3443                         info->dev->dev.archdata.iommu = NULL;
3444
3445                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3446
3447                 iommu_disable_dev_iotlb(info);
3448                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3449                 iommu_detach_dev(iommu, info->bus, info->devfn);
3450                 iommu_detach_dependent_devices(iommu, info->dev);
3451
3452                 /* clear this iommu in iommu_bmp, update iommu count
3453                  * and capabilities
3454                  */
3455                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3456                 if (test_and_clear_bit(iommu->seq_id,
3457                                        &domain->iommu_bmp)) {
3458                         domain->iommu_count--;
3459                         domain_update_iommu_cap(domain);
3460                 }
3461                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3462
3463                 free_devinfo_mem(info);
3464                 spin_lock_irqsave(&device_domain_lock, flags1);
3465         }
3466         spin_unlock_irqrestore(&device_domain_lock, flags1);
3467 }
3468
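/*
 * The functions below back the generic struct iommu_ops interface
 * (intel_iommu_ops, registered in intel_iommu_init()), used for example
 * when assigning devices to virtual machines.  The "VM" domains they work
 * on may span several hardware IOMMUs, so they carry their own id space.
 */
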
3469 /* domain ids for virtual machine domains; never set in context entries */
3470 static unsigned long vm_domid;
3471
3472 static struct dmar_domain *iommu_alloc_vm_domain(void)
3473 {
3474         struct dmar_domain *domain;
3475
3476         domain = alloc_domain_mem();
3477         if (!domain)
3478                 return NULL;
3479
3480         domain->id = vm_domid++;
3481         domain->nid = -1;
3482         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3483         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3484
3485         return domain;
3486 }
3487
3488 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3489 {
3490         int adjust_width;
3491
3492         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3493         spin_lock_init(&domain->iommu_lock);
3494
3495         domain_reserve_special_ranges(domain);
3496
3497         /* calculate AGAW */
3498         domain->gaw = guest_width;
3499         adjust_width = guestwidth_to_adjustwidth(guest_width);
3500         domain->agaw = width_to_agaw(adjust_width);
3501
3502         INIT_LIST_HEAD(&domain->devices);
3503
3504         domain->iommu_count = 0;
3505         domain->iommu_coherency = 0;
3506         domain->iommu_snooping = 0;
3507         domain->max_addr = 0;
3508         domain->nid = -1;
3509
3510         /* always allocate the top pgd */
3511         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3512         if (!domain->pgd)
3513                 return -ENOMEM;
3514         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3515         return 0;
3516 }
3517
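/*
 * Release the domain id that this VM domain occupies on every hardware
 * iommu it was attached to.
 */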
3518 static void iommu_free_vm_domain(struct dmar_domain *domain)
3519 {
3520         unsigned long flags;
3521         struct dmar_drhd_unit *drhd;
3522         struct intel_iommu *iommu;
3523         unsigned long i;
3524         unsigned long ndomains;
3525
3526         for_each_drhd_unit(drhd) {
3527                 if (drhd->ignored)
3528                         continue;
3529                 iommu = drhd->iommu;
3530
3531                 ndomains = cap_ndoms(iommu->cap);
3532                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3533                         if (iommu->domains[i] == domain) {
3534                                 spin_lock_irqsave(&iommu->lock, flags);
3535                                 clear_bit(i, iommu->domain_ids);
3536                                 iommu->domains[i] = NULL;
3537                                 spin_unlock_irqrestore(&iommu->lock, flags);
3538                                 break;
3539                         }
3540                 }
3541         }
3542 }
3543
3544 static void vm_domain_exit(struct dmar_domain *domain)
3545 {
3546         /* Domain 0 is reserved, so don't process it */
3547         if (!domain)
3548                 return;
3549
3550         vm_domain_remove_all_dev_info(domain);
3551         /* destroy iovas */
3552         put_iova_domain(&domain->iovad);
3553
3554         /* clear ptes */
3555         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3556
3557         /* free page tables */
3558         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3559
3560         iommu_free_vm_domain(domain);
3561         free_domain_mem(domain);
3562 }
3563
3564 static int intel_iommu_domain_init(struct iommu_domain *domain)
3565 {
3566         struct dmar_domain *dmar_domain;
3567
3568         dmar_domain = iommu_alloc_vm_domain();
3569         if (!dmar_domain) {
3570                 printk(KERN_ERR
3571                         "intel_iommu_domain_init: failed to allocate dmar_domain\n");
3572                 return -ENOMEM;
3573         }
3574         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3575                 printk(KERN_ERR
3576                         "intel_iommu_domain_init() failed\n");
3577                 vm_domain_exit(dmar_domain);
3578                 return -ENOMEM;
3579         }
3580         domain->priv = dmar_domain;
3581
3582         return 0;
3583 }
3584
3585 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3586 {
3587         struct dmar_domain *dmar_domain = domain->priv;
3588
3589         domain->priv = NULL;
3590         vm_domain_exit(dmar_domain);
3591 }
3592
3593 static int intel_iommu_attach_device(struct iommu_domain *domain,
3594                                      struct device *dev)
3595 {
3596         struct dmar_domain *dmar_domain = domain->priv;
3597         struct pci_dev *pdev = to_pci_dev(dev);
3598         struct intel_iommu *iommu;
3599         int addr_width;
3600
3601         /* normally pdev is not mapped */
3602         if (unlikely(domain_context_mapped(pdev))) {
3603                 struct dmar_domain *old_domain;
3604
3605                 old_domain = find_domain(pdev);
3606                 if (old_domain) {
3607                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3608                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3609                                 domain_remove_one_dev_info(old_domain, pdev);
3610                         else
3611                                 domain_remove_dev_info(old_domain);
3612                 }
3613         }
3614
3615         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3616                                 pdev->devfn);
3617         if (!iommu)
3618                 return -ENODEV;
3619
3620         /* check if this iommu agaw is sufficient for max mapped address */
3621         addr_width = agaw_to_width(iommu->agaw);
3622         if (addr_width > cap_mgaw(iommu->cap))
3623                 addr_width = cap_mgaw(iommu->cap);
3624
3625         if (dmar_domain->max_addr > (1LL << addr_width)) {
3626                 printk(KERN_ERR "%s: iommu width (%d) is not "
3627                        "sufficient for the mapped address (%llx)\n",
3628                        __func__, addr_width, dmar_domain->max_addr);
3629                 return -EFAULT;
3630         }
3631         dmar_domain->gaw = addr_width;
3632
3633         /*
3634          * Knock out extra levels of page tables if necessary
3635          */
3636         while (iommu->agaw < dmar_domain->agaw) {
3637                 struct dma_pte *pte;
3638
3639                 pte = dmar_domain->pgd;
3640                 if (dma_pte_present(pte)) {
3641                         dmar_domain->pgd = (struct dma_pte *)
3642                                 phys_to_virt(dma_pte_addr(pte));
3643                         free_pgtable_page(pte);
3644                 }
3645                 dmar_domain->agaw--;
3646         }
3647
3648         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3649 }
3650
3651 static void intel_iommu_detach_device(struct iommu_domain *domain,
3652                                       struct device *dev)
3653 {
3654         struct dmar_domain *dmar_domain = domain->priv;
3655         struct pci_dev *pdev = to_pci_dev(dev);
3656
3657         domain_remove_one_dev_info(dmar_domain, pdev);
3658 }
3659
3660 static int intel_iommu_map(struct iommu_domain *domain,
3661                            unsigned long iova, phys_addr_t hpa,
3662                            int gfp_order, int iommu_prot)
3663 {
3664         struct dmar_domain *dmar_domain = domain->priv;
3665         u64 max_addr;
3666         int prot = 0;
3667         size_t size;
3668         int ret;
3669
3670         if (iommu_prot & IOMMU_READ)
3671                 prot |= DMA_PTE_READ;
3672         if (iommu_prot & IOMMU_WRITE)
3673                 prot |= DMA_PTE_WRITE;
3674         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3675                 prot |= DMA_PTE_SNP;
3676
3677         size     = PAGE_SIZE << gfp_order;
3678         max_addr = iova + size;
3679         if (dmar_domain->max_addr < max_addr) {
3680                 u64 end;
3681
3682                 /* check if minimum agaw is sufficient for mapped address */
3683                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3684                 if (end < max_addr) {
3685                         printk(KERN_ERR "%s: iommu width (%d) is not "
3686                                "sufficient for the mapped address (%llx)\n",
3687                                __func__, dmar_domain->gaw, max_addr);
3688                         return -EFAULT;
3689                 }
3690                 dmar_domain->max_addr = max_addr;
3691         }
3692         /* Round up size to next multiple of PAGE_SIZE, if it and
3693            the low bits of hpa would take us onto the next page */
3694         size = aligned_nrpages(hpa, size);
3695         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3696                                  hpa >> VTD_PAGE_SHIFT, size, prot);
3697         return ret;
3698 }
3699
3700 static int intel_iommu_unmap(struct iommu_domain *domain,
3701                              unsigned long iova, int gfp_order)
3702 {
3703         struct dmar_domain *dmar_domain = domain->priv;
3704         size_t size = PAGE_SIZE << gfp_order;
3705
3706         dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3707                             (iova + size - 1) >> VTD_PAGE_SHIFT);
3708
3709         if (dmar_domain->max_addr == iova + size)
3710                 dmar_domain->max_addr = iova;
3711
3712         return gfp_order;
3713 }
3714
3715 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3716                                             unsigned long iova)
3717 {
3718         struct dmar_domain *dmar_domain = domain->priv;
3719         struct dma_pte *pte;
3720         u64 phys = 0;
3721
3722         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3723         if (pte)
3724                 phys = dma_pte_addr(pte);
3725
3726         return phys;
3727 }
3728
3729 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3730                                       unsigned long cap)
3731 {
3732         struct dmar_domain *dmar_domain = domain->priv;
3733
3734         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3735                 return dmar_domain->iommu_snooping;
3736         if (cap == IOMMU_CAP_INTR_REMAP)
3737                 return intr_remapping_enabled;
3738
3739         return 0;
3740 }
3741
3742 static struct iommu_ops intel_iommu_ops = {
3743         .domain_init    = intel_iommu_domain_init,
3744         .domain_destroy = intel_iommu_domain_destroy,
3745         .attach_dev     = intel_iommu_attach_device,
3746         .detach_dev     = intel_iommu_detach_device,
3747         .map            = intel_iommu_map,
3748         .unmap          = intel_iommu_unmap,
3749         .iova_to_phys   = intel_iommu_iova_to_phys,
3750         .domain_has_cap = intel_iommu_domain_has_cap,
3751 };
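
/*
 * The callbacks above are reached through the generic IOMMU API rather
 * than called directly.  A minimal sketch of a consumer, against the API
 * as it stands here (hypothetical code, not part of this file; note that
 * iommu_map()/iommu_unmap() take a page order, so 0 means a single page):
 *
 *      struct iommu_domain *dom = iommu_domain_alloc();
 *      if (!dom)
 *              return -ENOMEM;
 *      if (iommu_attach_device(dom, &pdev->dev))
 *              goto free_dom;
 *      iommu_map(dom, iova, phys, 0, IOMMU_READ | IOMMU_WRITE);
 *      ...
 *      iommu_unmap(dom, iova, 0);
 *      iommu_detach_device(dom, &pdev->dev);
 * free_dom:
 *      iommu_domain_free(dom);
 *
 * iommu_domain_alloc() lands in intel_iommu_domain_init(), iommu_map()
 * in intel_iommu_map(), and so on.
 */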
3752
3753 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3754 {
3755         /*
3756          * Mobile 4 Series Chipset neglects to set RWBF capability,
3757          * but needs it:
3758          */
3759         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3760         rwbf_quirk = 1;
3761
3762         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
3763         if (dev->revision == 0x07) {
3764                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
3765                 dmar_map_gfx = 0;
3766         }
3767 }
3768
3769 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3770
3771 #define GGC 0x52
3772 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
3773 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
3774 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
3775 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
3776 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
3777 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
3778 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
3779 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
3780
3781 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
3782 {
3783         unsigned short ggc;
3784
3785         if (pci_read_config_word(dev, GGC, &ggc))
3786                 return;
3787
3788         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
3789                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
3790                 dmar_map_gfx = 0;
3791         }
3792 }
3793 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
3794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
3795 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
3796 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
3797
3798 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3799    ISOCH DMAR unit for the Azalia sound device, but not give it any
3800    TLB entries, which causes it to deadlock. Check for that.  We do
3801    this in a function called from init_dmars(), instead of in a PCI
3802    quirk, because we don't want to print the obnoxious "BIOS broken"
3803    message if VT-d is actually disabled.
3804 */
3805 static void __init check_tylersburg_isoch(void)
3806 {
3807         struct pci_dev *pdev;
3808         uint32_t vtisochctrl;
3809
3810         /* If there's no Azalia in the system anyway, forget it. */
3811         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3812         if (!pdev)
3813                 return;
3814         pci_dev_put(pdev);
3815
3816         /* System Management Registers. Might be hidden, in which case
3817            we can't do the sanity check. But that's OK, because the
3818            known-broken BIOSes _don't_ actually hide it, so far. */
3819         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3820         if (!pdev)
3821                 return;
3822
3823         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3824                 pci_dev_put(pdev);
3825                 return;
3826         }
3827
3828         pci_dev_put(pdev);
3829
3830         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3831         if (vtisochctrl & 1)
3832                 return;
3833
3834         /* Drop all bits other than the number of TLB entries */
3835         vtisochctrl &= 0x1c;
3836
3837         /* If we have the recommended number of TLB entries (16), fine. */
3838         if (vtisochctrl == 0x10)
3839                 return;
3840
3841         /* Zero TLB entries? You get to ride the short bus to school. */
3842         if (!vtisochctrl) {
3843                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3844                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3845                      dmi_get_system_info(DMI_BIOS_VENDOR),
3846                      dmi_get_system_info(DMI_BIOS_VERSION),
3847                      dmi_get_system_info(DMI_PRODUCT_VERSION));
3848                 iommu_identity_mapping |= IDENTMAP_AZALIA;
3849                 return;
3850         }
3851
3852         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3853                vtisochctrl);
3854 }