intel-iommu: Use correct domain ID when caching mode is enabled
[pandora-kernel.git] drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
45
46 #define ROOT_SIZE               VTD_PAGE_SIZE
47 #define CONTEXT_SIZE            VTD_PAGE_SIZE
48
49 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
50 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
51 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
52
53 #define IOAPIC_RANGE_START      (0xfee00000)
54 #define IOAPIC_RANGE_END        (0xfeefffff)
55 #define IOVA_START_ADDR         (0x1000)
56
57 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
58
59 #define MAX_AGAW_WIDTH 64
60
61 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
62 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
63
64 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
65    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
66 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
67                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
68 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
69
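/*
 * Worked example (illustrative, not part of the original source): for the
 * default 48-bit guest address width, __DOMAIN_MAX_PFN(48) = (1ULL << 36) - 1
 * and DOMAIN_MAX_ADDR(48) is that PFN shifted back up by VTD_PAGE_SHIFT.
 * On a 32-bit kernel DOMAIN_MAX_PFN() is additionally clamped to ULONG_MAX,
 * so PFNs always fit in an unsigned long as the comment above says.
 */
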
70 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
71 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
72 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
73
74
75 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
76    are never going to work. */
77 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
78 {
79         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
80 }
81
82 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
83 {
84         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
85 }
86 static inline unsigned long page_to_dma_pfn(struct page *pg)
87 {
88         return mm_to_dma_pfn(page_to_pfn(pg));
89 }
90 static inline unsigned long virt_to_dma_pfn(void *p)
91 {
92         return page_to_dma_pfn(virt_to_page(p));
93 }
94
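/*
 * Worked example (illustrative, not part of the original source): with 4KiB
 * MM pages PAGE_SHIFT == VTD_PAGE_SHIFT and both conversions are the identity.
 * On a hypothetical 64KiB-page kernel (PAGE_SHIFT = 16) each MM pfn spans
 * 16 VT-d pfns, so mm_to_dma_pfn(3) == 48 and dma_to_mm_pfn(48) == 3.
 */
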
95 /* global iommu list, set NULL for ignored DMAR units */
96 static struct intel_iommu **g_iommus;
97
98 static void __init check_tylersburg_isoch(void);
99 static int rwbf_quirk;
100
101 /*
102  * 0: Present
103  * 1-11: Reserved
104  * 12-63: Context Ptr (12 - (haw-1))
105  * 64-127: Reserved
106  */
107 struct root_entry {
108         u64     val;
109         u64     rsvd1;
110 };
111 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
112 static inline bool root_present(struct root_entry *root)
113 {
114         return (root->val & 1);
115 }
116 static inline void set_root_present(struct root_entry *root)
117 {
118         root->val |= 1;
119 }
120 static inline void set_root_value(struct root_entry *root, unsigned long value)
121 {
122         root->val |= value & VTD_PAGE_MASK;
123 }
124
125 static inline struct context_entry *
126 get_context_addr_from_root(struct root_entry *root)
127 {
128         return (struct context_entry *)
129                 (root_present(root)?phys_to_virt(
130                 root->val & VTD_PAGE_MASK) :
131                 NULL);
132 }
133
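/*
 * Illustrative only (not part of the original driver): a minimal sketch of
 * how the helpers above populate a root entry so that it points at a 4KiB
 * context table.  The helper name and its ctx_table_phys argument are
 * hypothetical; the real driver does this in device_to_context_entry() below.
 */
static inline void example_set_root(struct root_entry *root,
				    unsigned long ctx_table_phys)
{
	set_root_value(root, ctx_table_phys);	/* bits 12-63: context-table address */
	set_root_present(root);			/* bit 0: present */
	/* get_context_addr_from_root(root) now returns the table's virtual address */
}
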
134 /*
135  * low 64 bits:
136  * 0: present
137  * 1: fault processing disable
138  * 2-3: translation type
139  * 12-63: address space root
140  * high 64 bits:
141  * 0-2: address width
142  * 3-6: avail
143  * 8-23: domain id
144  */
145 struct context_entry {
146         u64 lo;
147         u64 hi;
148 };
149
150 static inline bool context_present(struct context_entry *context)
151 {
152         return (context->lo & 1);
153 }
154 static inline void context_set_present(struct context_entry *context)
155 {
156         context->lo |= 1;
157 }
158
159 static inline void context_set_fault_enable(struct context_entry *context)
160 {
161         context->lo &= (((u64)-1) << 2) | 1;
162 }
163
164 static inline void context_set_translation_type(struct context_entry *context,
165                                                 unsigned long value)
166 {
167         context->lo &= (((u64)-1) << 4) | 3;
168         context->lo |= (value & 3) << 2;
169 }
170
171 static inline void context_set_address_root(struct context_entry *context,
172                                             unsigned long value)
173 {
174         context->lo |= value & VTD_PAGE_MASK;
175 }
176
177 static inline void context_set_address_width(struct context_entry *context,
178                                              unsigned long value)
179 {
180         context->hi |= value & 7;
181 }
182
183 static inline void context_set_domain_id(struct context_entry *context,
184                                          unsigned long value)
185 {
186         context->hi |= (value & ((1 << 16) - 1)) << 8;
187 }
188
189 static inline void context_clear_entry(struct context_entry *context)
190 {
191         context->lo = 0;
192         context->hi = 0;
193 }
194
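/*
 * Illustrative only (not part of the original driver): a minimal sketch,
 * using hypothetical arguments, of how the setters above compose a context
 * entry according to the bit layout described in the comment.  The real
 * driver does this in domain_context_mapping_one() further down.
 */
static inline void example_build_context_entry(struct context_entry *ce,
					       unsigned long pgd_phys,
					       u16 did, int agaw)
{
	context_clear_entry(ce);
	context_set_domain_id(ce, did);		/* hi bits 8-23 */
	context_set_address_width(ce, agaw);	/* hi bits 0-2, e.g. 2 = 48-bit, 4-level */
	context_set_address_root(ce, pgd_phys);	/* lo bits 12-63 */
	context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
	context_set_fault_enable(ce);		/* clear lo bit 1: enable fault reporting */
	context_set_present(ce);		/* lo bit 0 */
}
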
195 /*
196  * 0: readable
197  * 1: writable
198  * 2-6: reserved
199  * 7: super page
200  * 8-10: available
201  * 11: snoop behavior
202  * 12-63: Host physical address
203  */
204 struct dma_pte {
205         u64 val;
206 };
207
208 static inline void dma_clear_pte(struct dma_pte *pte)
209 {
210         pte->val = 0;
211 }
212
213 static inline void dma_set_pte_readable(struct dma_pte *pte)
214 {
215         pte->val |= DMA_PTE_READ;
216 }
217
218 static inline void dma_set_pte_writable(struct dma_pte *pte)
219 {
220         pte->val |= DMA_PTE_WRITE;
221 }
222
223 static inline void dma_set_pte_snp(struct dma_pte *pte)
224 {
225         pte->val |= DMA_PTE_SNP;
226 }
227
228 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
229 {
230         pte->val = (pte->val & ~3) | (prot & 3);
231 }
232
233 static inline u64 dma_pte_addr(struct dma_pte *pte)
234 {
235 #ifdef CONFIG_64BIT
236         return pte->val & VTD_PAGE_MASK;
237 #else
238         /* Must have a full atomic 64-bit read */
239         return  __cmpxchg64(pte, 0ULL, 0ULL) & VTD_PAGE_MASK;
240 #endif
241 }
242
243 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
244 {
245         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
246 }
247
248 static inline bool dma_pte_present(struct dma_pte *pte)
249 {
250         return (pte->val & 3) != 0;
251 }
252
253 static inline int first_pte_in_page(struct dma_pte *pte)
254 {
255         return !((unsigned long)pte & ~VTD_PAGE_MASK);
256 }
257
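/*
 * Illustrative only (not part of the original driver): how the dma_pte
 * helpers above would be used to build a present, read/write leaf PTE for a
 * given host pfn.  The helper name is hypothetical.
 */
static inline void example_build_pte(struct dma_pte *pte, unsigned long pfn)
{
	dma_clear_pte(pte);
	dma_set_pte_pfn(pte, pfn);	/* bits 12-63: host physical address */
	dma_set_pte_readable(pte);	/* bit 0 */
	dma_set_pte_writable(pte);	/* bit 1 */
	/* dma_pte_present(pte) is now true; dma_pte_addr() == pfn << VTD_PAGE_SHIFT */
}
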
258 /*
259  * This domain is a statically identity mapping domain.
260  *      1. This domain creates a static 1:1 mapping to all usable memory.
261  *      2. It maps to each iommu if successful.
262  *      3. Each iommu maps to this domain if successful.
263  */
264 static struct dmar_domain *si_domain;
265 static int hw_pass_through = 1;
266
267 /* devices under the same p2p bridge are owned in one domain */
268 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
269
270 /* domain represents a virtual machine, more than one device
271  * across iommus may be owned in one domain, e.g. kvm guest.
272  */
273 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
274
275 /* si_domain contains multiple devices */
276 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
277
278 struct dmar_domain {
279         int     id;                     /* domain id */
280         int     nid;                    /* node id */
281         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
282
283         struct list_head devices;       /* all devices' list */
284         struct iova_domain iovad;       /* iova's that belong to this domain */
285
286         struct dma_pte  *pgd;           /* virtual address */
287         int             gaw;            /* max guest address width */
288
289         /* adjusted guest address width, 0 is level 2 30-bit */
290         int             agaw;
291
292         int             flags;          /* flags to find out type of domain */
293
294         int             iommu_coherency;/* indicate coherency of iommu access */
295         int             iommu_snooping; /* indicate snooping control feature*/
296         int             iommu_count;    /* reference count of iommu */
297         spinlock_t      iommu_lock;     /* protect iommu set in domain */
298         u64             max_addr;       /* maximum mapped address */
299 };
300
301 /* PCI domain-device relationship */
302 struct device_domain_info {
303         struct list_head link;  /* link to domain siblings */
304         struct list_head global; /* link to global list */
305         int segment;            /* PCI domain */
306         u8 bus;                 /* PCI bus number */
307         u8 devfn;               /* PCI devfn number */
308         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
309         struct intel_iommu *iommu; /* IOMMU used by this device */
310         struct dmar_domain *domain; /* pointer to domain */
311 };
312
313 static void flush_unmaps_timeout(unsigned long data);
314
315 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
316
317 #define HIGH_WATER_MARK 250
318 struct deferred_flush_tables {
319         int next;
320         struct iova *iova[HIGH_WATER_MARK];
321         struct dmar_domain *domain[HIGH_WATER_MARK];
322 };
323
324 static struct deferred_flush_tables *deferred_flush;
325
326 /* bitmap for indexing intel_iommus */
327 static int g_num_of_iommus;
328
329 static DEFINE_SPINLOCK(async_umap_flush_lock);
330 static LIST_HEAD(unmaps_to_do);
331
332 static int timer_on;
333 static long list_size;
334
335 static void domain_remove_dev_info(struct dmar_domain *domain);
336
337 #ifdef CONFIG_DMAR_DEFAULT_ON
338 int dmar_disabled = 0;
339 #else
340 int dmar_disabled = 1;
341 #endif /*CONFIG_DMAR_DEFAULT_ON*/
342
343 static int __initdata dmar_map_gfx = 1;
344 static int dmar_forcedac;
345 static int intel_iommu_strict;
346
347 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
348 static DEFINE_SPINLOCK(device_domain_lock);
349 static LIST_HEAD(device_domain_list);
350
351 static struct iommu_ops intel_iommu_ops;
352
353 static int __init intel_iommu_setup(char *str)
354 {
355         if (!str)
356                 return -EINVAL;
357         while (*str) {
358                 if (!strncmp(str, "on", 2)) {
359                         dmar_disabled = 0;
360                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
361                 } else if (!strncmp(str, "off", 3)) {
362                         dmar_disabled = 1;
363                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
364                 } else if (!strncmp(str, "igfx_off", 8)) {
365                         dmar_map_gfx = 0;
366                         printk(KERN_INFO
367                                 "Intel-IOMMU: disable GFX device mapping\n");
368                 } else if (!strncmp(str, "forcedac", 8)) {
369                         printk(KERN_INFO
370                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
371                         dmar_forcedac = 1;
372                 } else if (!strncmp(str, "strict", 6)) {
373                         printk(KERN_INFO
374                                 "Intel-IOMMU: disable batched IOTLB flush\n");
375                         intel_iommu_strict = 1;
376                 }
377
378                 str += strcspn(str, ",");
379                 while (*str == ',')
380                         str++;
381         }
382         return 0;
383 }
384 __setup("intel_iommu=", intel_iommu_setup);
385
386 static struct kmem_cache *iommu_domain_cache;
387 static struct kmem_cache *iommu_devinfo_cache;
388 static struct kmem_cache *iommu_iova_cache;
389
390 static inline void *alloc_pgtable_page(int node)
391 {
392         struct page *page;
393         void *vaddr = NULL;
394
395         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
396         if (page)
397                 vaddr = page_address(page);
398         return vaddr;
399 }
400
401 static inline void free_pgtable_page(void *vaddr)
402 {
403         free_page((unsigned long)vaddr);
404 }
405
406 static inline void *alloc_domain_mem(void)
407 {
408         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
409 }
410
411 static void free_domain_mem(void *vaddr)
412 {
413         kmem_cache_free(iommu_domain_cache, vaddr);
414 }
415
416 static inline void * alloc_devinfo_mem(void)
417 {
418         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
419 }
420
421 static inline void free_devinfo_mem(void *vaddr)
422 {
423         kmem_cache_free(iommu_devinfo_cache, vaddr);
424 }
425
426 struct iova *alloc_iova_mem(void)
427 {
428         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
429 }
430
431 void free_iova_mem(struct iova *iova)
432 {
433         kmem_cache_free(iommu_iova_cache, iova);
434 }
435
436
437 static inline int width_to_agaw(int width);
438
439 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
440 {
441         unsigned long sagaw;
442         int agaw = -1;
443
444         sagaw = cap_sagaw(iommu->cap);
445         for (agaw = width_to_agaw(max_gaw);
446              agaw >= 0; agaw--) {
447                 if (test_bit(agaw, &sagaw))
448                         break;
449         }
450
451         return agaw;
452 }
453
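/*
 * Worked example (illustrative, not part of the original source): cap_sagaw()
 * reports the supported AGAWs as a bitmap (bit 1 = 39-bit/3-level,
 * bit 2 = 48-bit/4-level).  With sagaw = 0x4, __iommu_calculate_agaw(iommu, 48)
 * starts at width_to_agaw(48) = 2, finds bit 2 set and returns 2; if only
 * bit 1 were set it would fall back to agaw 1 (39-bit).
 */
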
454 /*
455  * Calculate max SAGAW for each iommu.
456  */
457 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
458 {
459         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
460 }
461
462 /*
463  * calculate agaw for each iommu.
464  * "SAGAW" may be different across iommus, use a default agaw, and
465  * get a supported smaller agaw for iommus that don't support the default agaw.
466  */
467 int iommu_calculate_agaw(struct intel_iommu *iommu)
468 {
469         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
470 }
471
472 /* This function only returns a single iommu in a domain */
473 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
474 {
475         int iommu_id;
476
477         /* si_domain and vm domain should not get here. */
478         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
479         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
480
481         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
482         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
483                 return NULL;
484
485         return g_iommus[iommu_id];
486 }
487
488 static void domain_update_iommu_coherency(struct dmar_domain *domain)
489 {
490         int i;
491
492         domain->iommu_coherency = 1;
493
494         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
495                 if (!ecap_coherent(g_iommus[i]->ecap)) {
496                         domain->iommu_coherency = 0;
497                         break;
498                 }
499         }
500 }
501
502 static void domain_update_iommu_snooping(struct dmar_domain *domain)
503 {
504         int i;
505
506         domain->iommu_snooping = 1;
507
508         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
509                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
510                         domain->iommu_snooping = 0;
511                         break;
512                 }
513         }
514 }
515
516 /* Some capabilities may be different across iommus */
517 static void domain_update_iommu_cap(struct dmar_domain *domain)
518 {
519         domain_update_iommu_coherency(domain);
520         domain_update_iommu_snooping(domain);
521 }
522
523 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
524 {
525         struct dmar_drhd_unit *drhd = NULL;
526         int i;
527
528         for_each_drhd_unit(drhd) {
529                 if (drhd->ignored)
530                         continue;
531                 if (segment != drhd->segment)
532                         continue;
533
534                 for (i = 0; i < drhd->devices_cnt; i++) {
535                         if (drhd->devices[i] &&
536                             drhd->devices[i]->bus->number == bus &&
537                             drhd->devices[i]->devfn == devfn)
538                                 return drhd->iommu;
539                         if (drhd->devices[i] &&
540                             drhd->devices[i]->subordinate &&
541                             drhd->devices[i]->subordinate->number <= bus &&
542                             drhd->devices[i]->subordinate->subordinate >= bus)
543                                 return drhd->iommu;
544                 }
545
546                 if (drhd->include_all)
547                         return drhd->iommu;
548         }
549
550         return NULL;
551 }
552
553 static void domain_flush_cache(struct dmar_domain *domain,
554                                void *addr, int size)
555 {
556         if (!domain->iommu_coherency)
557                 clflush_cache_range(addr, size);
558 }
559
560 /* Gets context entry for a given bus and devfn */
561 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
562                 u8 bus, u8 devfn)
563 {
564         struct root_entry *root;
565         struct context_entry *context;
566         unsigned long phy_addr;
567         unsigned long flags;
568
569         spin_lock_irqsave(&iommu->lock, flags);
570         root = &iommu->root_entry[bus];
571         context = get_context_addr_from_root(root);
572         if (!context) {
573                 context = (struct context_entry *)
574                                 alloc_pgtable_page(iommu->node);
575                 if (!context) {
576                         spin_unlock_irqrestore(&iommu->lock, flags);
577                         return NULL;
578                 }
579                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
580                 phy_addr = virt_to_phys((void *)context);
581                 set_root_value(root, phy_addr);
582                 set_root_present(root);
583                 __iommu_flush_cache(iommu, root, sizeof(*root));
584         }
585         spin_unlock_irqrestore(&iommu->lock, flags);
586         return &context[devfn];
587 }
588
589 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
590 {
591         struct root_entry *root;
592         struct context_entry *context;
593         int ret;
594         unsigned long flags;
595
596         spin_lock_irqsave(&iommu->lock, flags);
597         root = &iommu->root_entry[bus];
598         context = get_context_addr_from_root(root);
599         if (!context) {
600                 ret = 0;
601                 goto out;
602         }
603         ret = context_present(&context[devfn]);
604 out:
605         spin_unlock_irqrestore(&iommu->lock, flags);
606         return ret;
607 }
608
609 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
610 {
611         struct root_entry *root;
612         struct context_entry *context;
613         unsigned long flags;
614
615         spin_lock_irqsave(&iommu->lock, flags);
616         root = &iommu->root_entry[bus];
617         context = get_context_addr_from_root(root);
618         if (context) {
619                 context_clear_entry(&context[devfn]);
620                 __iommu_flush_cache(iommu, &context[devfn], \
621                         sizeof(*context));
622         }
623         spin_unlock_irqrestore(&iommu->lock, flags);
624 }
625
626 static void free_context_table(struct intel_iommu *iommu)
627 {
628         struct root_entry *root;
629         int i;
630         unsigned long flags;
631         struct context_entry *context;
632
633         spin_lock_irqsave(&iommu->lock, flags);
634         if (!iommu->root_entry) {
635                 goto out;
636         }
637         for (i = 0; i < ROOT_ENTRY_NR; i++) {
638                 root = &iommu->root_entry[i];
639                 context = get_context_addr_from_root(root);
640                 if (context)
641                         free_pgtable_page(context);
642         }
643         free_pgtable_page(iommu->root_entry);
644         iommu->root_entry = NULL;
645 out:
646         spin_unlock_irqrestore(&iommu->lock, flags);
647 }
648
649 /* page table handling */
650 #define LEVEL_STRIDE            (9)
651 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
652
653 static inline int agaw_to_level(int agaw)
654 {
655         return agaw + 2;
656 }
657
658 static inline int agaw_to_width(int agaw)
659 {
660         return 30 + agaw * LEVEL_STRIDE;
661
662 }
663
664 static inline int width_to_agaw(int width)
665 {
666         return (width - 30) / LEVEL_STRIDE;
667 }
668
669 static inline unsigned int level_to_offset_bits(int level)
670 {
671         return (level - 1) * LEVEL_STRIDE;
672 }
673
674 static inline int pfn_level_offset(unsigned long pfn, int level)
675 {
676         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
677 }
678
679 static inline unsigned long level_mask(int level)
680 {
681         return -1UL << level_to_offset_bits(level);
682 }
683
684 static inline unsigned long level_size(int level)
685 {
686         return 1UL << level_to_offset_bits(level);
687 }
688
689 static inline unsigned long align_to_level(unsigned long pfn, int level)
690 {
691         return (pfn + level_size(level) - 1) & level_mask(level);
692 }
693
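/*
 * Worked example (illustrative, not part of the original source): a domain
 * with agaw = 2 uses agaw_to_level(2) = 4 page-table levels covering
 * agaw_to_width(2) = 48 bits.  Each level consumes LEVEL_STRIDE = 9 bits of
 * the DMA pfn:
 *
 *	pfn_level_offset(pfn, 4) = (pfn >> 27) & 0x1ff
 *	pfn_level_offset(pfn, 3) = (pfn >> 18) & 0x1ff
 *	pfn_level_offset(pfn, 2) = (pfn >>  9) & 0x1ff
 *	pfn_level_offset(pfn, 1) =  pfn        & 0x1ff
 *
 * level_size(2) = 512 pfns, and align_to_level(pfn, 2) rounds pfn up to the
 * next 512-pfn (2MiB) boundary.
 */
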
694 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
695                                       unsigned long pfn)
696 {
697         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
698         struct dma_pte *parent, *pte = NULL;
699         int level = agaw_to_level(domain->agaw);
700         int offset;
701
702         BUG_ON(!domain->pgd);
703         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
704         parent = domain->pgd;
705
706         while (level > 0) {
707                 void *tmp_page;
708
709                 offset = pfn_level_offset(pfn, level);
710                 pte = &parent[offset];
711                 if (level == 1)
712                         break;
713
714                 if (!dma_pte_present(pte)) {
715                         uint64_t pteval;
716
717                         tmp_page = alloc_pgtable_page(domain->nid);
718
719                         if (!tmp_page)
720                                 return NULL;
721
722                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
723                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
724                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
725                                 /* Someone else set it while we were thinking; use theirs. */
726                                 free_pgtable_page(tmp_page);
727                         } else {
728                                 dma_pte_addr(pte);
729                                 domain_flush_cache(domain, pte, sizeof(*pte));
730                         }
731                 }
732                 parent = phys_to_virt(dma_pte_addr(pte));
733                 level--;
734         }
735
736         return pte;
737 }
738
739 /* return address's pte at specific level */
740 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
741                                          unsigned long pfn,
742                                          int level)
743 {
744         struct dma_pte *parent, *pte = NULL;
745         int total = agaw_to_level(domain->agaw);
746         int offset;
747
748         parent = domain->pgd;
749         while (level <= total) {
750                 offset = pfn_level_offset(pfn, total);
751                 pte = &parent[offset];
752                 if (level == total)
753                         return pte;
754
755                 if (!dma_pte_present(pte))
756                         break;
757                 parent = phys_to_virt(dma_pte_addr(pte));
758                 total--;
759         }
760         return NULL;
761 }
762
763 /* clear last level pte, a tlb flush should follow */
764 static void dma_pte_clear_range(struct dmar_domain *domain,
765                                 unsigned long start_pfn,
766                                 unsigned long last_pfn)
767 {
768         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
769         struct dma_pte *first_pte, *pte;
770
771         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
772         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
773         BUG_ON(start_pfn > last_pfn);
774
775         /* we don't need lock here; nobody else touches the iova range */
776         do {
777                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
778                 if (!pte) {
779                         start_pfn = align_to_level(start_pfn + 1, 2);
780                         continue;
781                 }
782                 do { 
783                         dma_clear_pte(pte);
784                         start_pfn++;
785                         pte++;
786                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
787
788                 domain_flush_cache(domain, first_pte,
789                                    (void *)pte - (void *)first_pte);
790
791         } while (start_pfn && start_pfn <= last_pfn);
792 }
793
794 /* free page table pages. last level pte should already be cleared */
795 static void dma_pte_free_pagetable(struct dmar_domain *domain,
796                                    unsigned long start_pfn,
797                                    unsigned long last_pfn)
798 {
799         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
800         struct dma_pte *first_pte, *pte;
801         int total = agaw_to_level(domain->agaw);
802         int level;
803         unsigned long tmp;
804
805         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
806         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
807         BUG_ON(start_pfn > last_pfn);
808
809         /* We don't need lock here; nobody else touches the iova range */
810         level = 2;
811         while (level <= total) {
812                 tmp = align_to_level(start_pfn, level);
813
814                 /* If we can't even clear one PTE at this level, we're done */
815                 if (tmp + level_size(level) - 1 > last_pfn)
816                         return;
817
818                 do {
819                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
820                         if (!pte) {
821                                 tmp = align_to_level(tmp + 1, level + 1);
822                                 continue;
823                         }
824                         do {
825                                 if (dma_pte_present(pte)) {
826                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
827                                         dma_clear_pte(pte);
828                                 }
829                                 pte++;
830                                 tmp += level_size(level);
831                         } while (!first_pte_in_page(pte) &&
832                                  tmp + level_size(level) - 1 <= last_pfn);
833
834                         domain_flush_cache(domain, first_pte,
835                                            (void *)pte - (void *)first_pte);
836                         
837                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
838                 level++;
839         }
840         /* free pgd */
841         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
842                 free_pgtable_page(domain->pgd);
843                 domain->pgd = NULL;
844         }
845 }
846
847 /* iommu handling */
848 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
849 {
850         struct root_entry *root;
851         unsigned long flags;
852
853         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
854         if (!root)
855                 return -ENOMEM;
856
857         __iommu_flush_cache(iommu, root, ROOT_SIZE);
858
859         spin_lock_irqsave(&iommu->lock, flags);
860         iommu->root_entry = root;
861         spin_unlock_irqrestore(&iommu->lock, flags);
862
863         return 0;
864 }
865
866 static void iommu_set_root_entry(struct intel_iommu *iommu)
867 {
868         void *addr;
869         u32 sts;
870         unsigned long flag;
871
872         addr = iommu->root_entry;
873
874         spin_lock_irqsave(&iommu->register_lock, flag);
875         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
876
877         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
878
879         /* Make sure hardware completes it */
880         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
881                       readl, (sts & DMA_GSTS_RTPS), sts);
882
883         spin_unlock_irqrestore(&iommu->register_lock, flag);
884 }
885
886 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
887 {
888         u32 val;
889         unsigned long flag;
890
891         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
892                 return;
893
894         spin_lock_irqsave(&iommu->register_lock, flag);
895         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
896
897         /* Make sure hardware completes it */
898         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
899                       readl, (!(val & DMA_GSTS_WBFS)), val);
900
901         spin_unlock_irqrestore(&iommu->register_lock, flag);
902 }
903
904 /* return value determines if we need a write buffer flush */
905 static void __iommu_flush_context(struct intel_iommu *iommu,
906                                   u16 did, u16 source_id, u8 function_mask,
907                                   u64 type)
908 {
909         u64 val = 0;
910         unsigned long flag;
911
912         switch (type) {
913         case DMA_CCMD_GLOBAL_INVL:
914                 val = DMA_CCMD_GLOBAL_INVL;
915                 break;
916         case DMA_CCMD_DOMAIN_INVL:
917                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
918                 break;
919         case DMA_CCMD_DEVICE_INVL:
920                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
921                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
922                 break;
923         default:
924                 BUG();
925         }
926         val |= DMA_CCMD_ICC;
927
928         spin_lock_irqsave(&iommu->register_lock, flag);
929         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
930
931         /* Make sure hardware completes it */
932         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
933                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
934
935         spin_unlock_irqrestore(&iommu->register_lock, flag);
936 }
937
938 /* return value determines if we need a write buffer flush */
939 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
940                                 u64 addr, unsigned int size_order, u64 type)
941 {
942         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
943         u64 val = 0, val_iva = 0;
944         unsigned long flag;
945
946         switch (type) {
947         case DMA_TLB_GLOBAL_FLUSH:
948                 /* global flush doesn't need to set IVA_REG */
949                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
950                 break;
951         case DMA_TLB_DSI_FLUSH:
952                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
953                 break;
954         case DMA_TLB_PSI_FLUSH:
955                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
956                 /* Note: always flush non-leaf currently */
957                 val_iva = size_order | addr;
958                 break;
959         default:
960                 BUG();
961         }
962         /* Note: set drain read/write */
963 #if 0
964         /*
965          * This is probably just to be super secure. Looks like we can
966          * ignore it without any impact.
967          */
968         if (cap_read_drain(iommu->cap))
969                 val |= DMA_TLB_READ_DRAIN;
970 #endif
971         if (cap_write_drain(iommu->cap))
972                 val |= DMA_TLB_WRITE_DRAIN;
973
974         spin_lock_irqsave(&iommu->register_lock, flag);
975         /* Note: Only uses first TLB reg currently */
976         if (val_iva)
977                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
978         dmar_writeq(iommu->reg + tlb_offset + 8, val);
979
980         /* Make sure hardware completes it */
981         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
982                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
983
984         spin_unlock_irqrestore(&iommu->register_lock, flag);
985
986         /* check IOTLB invalidation granularity */
987         if (DMA_TLB_IAIG(val) == 0)
988                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
989         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
990                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
991                         (unsigned long long)DMA_TLB_IIRG(type),
992                         (unsigned long long)DMA_TLB_IAIG(val));
993 }
994
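/*
 * Illustrative usage (not part of the original source; the driver normally
 * reaches this routine through the iommu->flush.flush_iotlb hook).  With a
 * hypothetical domain id of 5, a domain-selective flush and a page-selective
 * flush of 2^3 = 8 pages starting at 'addr' would look like:
 *
 *	__iommu_flush_iotlb(iommu, 5, 0, 0, DMA_TLB_DSI_FLUSH);
 *	__iommu_flush_iotlb(iommu, 5, addr, 3, DMA_TLB_PSI_FLUSH);
 */
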
995 static struct device_domain_info *iommu_support_dev_iotlb(
996         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
997 {
998         int found = 0;
999         unsigned long flags;
1000         struct device_domain_info *info;
1001         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1002
1003         if (!ecap_dev_iotlb_support(iommu->ecap))
1004                 return NULL;
1005
1006         if (!iommu->qi)
1007                 return NULL;
1008
1009         spin_lock_irqsave(&device_domain_lock, flags);
1010         list_for_each_entry(info, &domain->devices, link)
1011                 if (info->bus == bus && info->devfn == devfn) {
1012                         found = 1;
1013                         break;
1014                 }
1015         spin_unlock_irqrestore(&device_domain_lock, flags);
1016
1017         if (!found || !info->dev)
1018                 return NULL;
1019
1020         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1021                 return NULL;
1022
1023         if (!dmar_find_matched_atsr_unit(info->dev))
1024                 return NULL;
1025
1026         info->iommu = iommu;
1027
1028         return info;
1029 }
1030
1031 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1032 {
1033         if (!info)
1034                 return;
1035
1036         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1037 }
1038
1039 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1040 {
1041         if (!info->dev || !pci_ats_enabled(info->dev))
1042                 return;
1043
1044         pci_disable_ats(info->dev);
1045 }
1046
1047 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1048                                   u64 addr, unsigned mask)
1049 {
1050         u16 sid, qdep;
1051         unsigned long flags;
1052         struct device_domain_info *info;
1053
1054         spin_lock_irqsave(&device_domain_lock, flags);
1055         list_for_each_entry(info, &domain->devices, link) {
1056                 if (!info->dev || !pci_ats_enabled(info->dev))
1057                         continue;
1058
1059                 sid = info->bus << 8 | info->devfn;
1060                 qdep = pci_ats_queue_depth(info->dev);
1061                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1062         }
1063         spin_unlock_irqrestore(&device_domain_lock, flags);
1064 }
1065
1066 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1067                                   unsigned long pfn, unsigned int pages, int map)
1068 {
1069         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1070         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1071
1072         BUG_ON(pages == 0);
1073
1074         /*
1075          * Fall back to domain selective flush if no PSI support or the size is
1076          * too big.
1077          * PSI requires page size to be 2 ^ x, and the base address is naturally
1078          * aligned to the size
1079          */
1080         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1081                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1082                                                 DMA_TLB_DSI_FLUSH);
1083         else
1084                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1085                                                 DMA_TLB_PSI_FLUSH);
1086
1087         /*
1088          * In caching mode, changes of pages from non-present to present require
1089          * flush. However, device IOTLB doesn't need to be flushed in this case.
1090          */
1091         if (!cap_caching_mode(iommu->cap) || !map)
1092                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1093 }
1094
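/*
 * Worked example (illustrative, not part of the original source): for
 * pages = 5, __roundup_pow_of_two(5) = 8 and mask = ilog2(8) = 3, so the PSI
 * invalidation covers 2^3 = 8 VT-d pages starting at pfn << VTD_PAGE_SHIFT.
 * If cap_max_amask_val() were smaller than 3, or PSI were not supported, the
 * code above falls back to a domain-selective flush instead.
 */
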
1095 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1096 {
1097         u32 pmen;
1098         unsigned long flags;
1099
1100         spin_lock_irqsave(&iommu->register_lock, flags);
1101         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1102         pmen &= ~DMA_PMEN_EPM;
1103         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1104
1105         /* wait for the protected region status bit to clear */
1106         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1107                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1108
1109         spin_unlock_irqrestore(&iommu->register_lock, flags);
1110 }
1111
1112 static int iommu_enable_translation(struct intel_iommu *iommu)
1113 {
1114         u32 sts;
1115         unsigned long flags;
1116
1117         spin_lock_irqsave(&iommu->register_lock, flags);
1118         iommu->gcmd |= DMA_GCMD_TE;
1119         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1120
1121         /* Make sure hardware completes it */
1122         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1123                       readl, (sts & DMA_GSTS_TES), sts);
1124
1125         spin_unlock_irqrestore(&iommu->register_lock, flags);
1126         return 0;
1127 }
1128
1129 static int iommu_disable_translation(struct intel_iommu *iommu)
1130 {
1131         u32 sts;
1132         unsigned long flag;
1133
1134         spin_lock_irqsave(&iommu->register_lock, flag);
1135         iommu->gcmd &= ~DMA_GCMD_TE;
1136         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1137
1138         /* Make sure hardware completes it */
1139         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1140                       readl, (!(sts & DMA_GSTS_TES)), sts);
1141
1142         spin_unlock_irqrestore(&iommu->register_lock, flag);
1143         return 0;
1144 }
1145
1146
1147 static int iommu_init_domains(struct intel_iommu *iommu)
1148 {
1149         unsigned long ndomains;
1150         unsigned long nlongs;
1151
1152         ndomains = cap_ndoms(iommu->cap);
1153         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1154         nlongs = BITS_TO_LONGS(ndomains);
1155
1156         spin_lock_init(&iommu->lock);
1157
1158         /* TBD: there might be 64K domains,
1159          * consider other allocation for future chip
1160          */
1161         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1162         if (!iommu->domain_ids) {
1163                 printk(KERN_ERR "Allocating domain id array failed\n");
1164                 return -ENOMEM;
1165         }
1166         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1167                         GFP_KERNEL);
1168         if (!iommu->domains) {
1169                 printk(KERN_ERR "Allocating domain array failed\n");
1170                 return -ENOMEM;
1171         }
1172
1173         /*
1174          * if Caching mode is set, then invalid translations are tagged
1175          * with domain id 0. Hence we need to pre-allocate it.
1176          */
1177         if (cap_caching_mode(iommu->cap))
1178                 set_bit(0, iommu->domain_ids);
1179         return 0;
1180 }
1181
1182
1183 static void domain_exit(struct dmar_domain *domain);
1184 static void vm_domain_exit(struct dmar_domain *domain);
1185
1186 void free_dmar_iommu(struct intel_iommu *iommu)
1187 {
1188         struct dmar_domain *domain;
1189         int i;
1190         unsigned long flags;
1191
1192         if ((iommu->domains) && (iommu->domain_ids)) {
1193                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1194                         domain = iommu->domains[i];
1195                         clear_bit(i, iommu->domain_ids);
1196
1197                         spin_lock_irqsave(&domain->iommu_lock, flags);
1198                         if (--domain->iommu_count == 0) {
1199                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1200                                         vm_domain_exit(domain);
1201                                 else
1202                                         domain_exit(domain);
1203                         }
1204                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1205                 }
1206         }
1207
1208         if (iommu->gcmd & DMA_GCMD_TE)
1209                 iommu_disable_translation(iommu);
1210
1211         if (iommu->irq) {
1212                 set_irq_data(iommu->irq, NULL);
1213                 /* This will mask the irq */
1214                 free_irq(iommu->irq, iommu);
1215                 destroy_irq(iommu->irq);
1216         }
1217
1218         kfree(iommu->domains);
1219         kfree(iommu->domain_ids);
1220
1221         g_iommus[iommu->seq_id] = NULL;
1222
1223         /* if all iommus are freed, free g_iommus */
1224         for (i = 0; i < g_num_of_iommus; i++) {
1225                 if (g_iommus[i])
1226                         break;
1227         }
1228
1229         if (i == g_num_of_iommus)
1230                 kfree(g_iommus);
1231
1232         /* free context mapping */
1233         free_context_table(iommu);
1234 }
1235
1236 static struct dmar_domain *alloc_domain(void)
1237 {
1238         struct dmar_domain *domain;
1239
1240         domain = alloc_domain_mem();
1241         if (!domain)
1242                 return NULL;
1243
1244         domain->nid = -1;
1245         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1246         domain->flags = 0;
1247
1248         return domain;
1249 }
1250
1251 static int iommu_attach_domain(struct dmar_domain *domain,
1252                                struct intel_iommu *iommu)
1253 {
1254         int num;
1255         unsigned long ndomains;
1256         unsigned long flags;
1257
1258         ndomains = cap_ndoms(iommu->cap);
1259
1260         spin_lock_irqsave(&iommu->lock, flags);
1261
1262         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1263         if (num >= ndomains) {
1264                 spin_unlock_irqrestore(&iommu->lock, flags);
1265                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1266                 return -ENOMEM;
1267         }
1268
1269         domain->id = num;
1270         set_bit(num, iommu->domain_ids);
1271         set_bit(iommu->seq_id, &domain->iommu_bmp);
1272         iommu->domains[num] = domain;
1273         spin_unlock_irqrestore(&iommu->lock, flags);
1274
1275         return 0;
1276 }
1277
1278 static void iommu_detach_domain(struct dmar_domain *domain,
1279                                 struct intel_iommu *iommu)
1280 {
1281         unsigned long flags;
1282         int num, ndomains;
1283         int found = 0;
1284
1285         spin_lock_irqsave(&iommu->lock, flags);
1286         ndomains = cap_ndoms(iommu->cap);
1287         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1288                 if (iommu->domains[num] == domain) {
1289                         found = 1;
1290                         break;
1291                 }
1292         }
1293
1294         if (found) {
1295                 clear_bit(num, iommu->domain_ids);
1296                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1297                 iommu->domains[num] = NULL;
1298         }
1299         spin_unlock_irqrestore(&iommu->lock, flags);
1300 }
1301
1302 static struct iova_domain reserved_iova_list;
1303 static struct lock_class_key reserved_rbtree_key;
1304
1305 static void dmar_init_reserved_ranges(void)
1306 {
1307         struct pci_dev *pdev = NULL;
1308         struct iova *iova;
1309         int i;
1310
1311         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1312
1313         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1314                 &reserved_rbtree_key);
1315
1316         /* IOAPIC ranges shouldn't be accessed by DMA */
1317         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1318                 IOVA_PFN(IOAPIC_RANGE_END));
1319         if (!iova)
1320                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1321
1322         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1323         for_each_pci_dev(pdev) {
1324                 struct resource *r;
1325
1326                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1327                         r = &pdev->resource[i];
1328                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1329                                 continue;
1330                         iova = reserve_iova(&reserved_iova_list,
1331                                             IOVA_PFN(r->start),
1332                                             IOVA_PFN(r->end));
1333                         if (!iova)
1334                                 printk(KERN_ERR "Reserve iova failed\n");
1335                 }
1336         }
1337
1338 }
1339
1340 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1341 {
1342         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1343 }
1344
1345 static inline int guestwidth_to_adjustwidth(int gaw)
1346 {
1347         int agaw;
1348         int r = (gaw - 12) % 9;
1349
1350         if (r == 0)
1351                 agaw = gaw;
1352         else
1353                 agaw = gaw + 9 - r;
1354         if (agaw > 64)
1355                 agaw = 64;
1356         return agaw;
1357 }
1358
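/*
 * Worked example (illustrative, not part of the original source):
 * guestwidth_to_adjustwidth() rounds a guest address width up to the next
 * value of the form 12 + 9*n that whole page-table levels can express:
 *	gaw = 48 -> r = 0 -> agaw = 48 (4-level)
 *	gaw = 36 -> r = 6 -> agaw = 39 (3-level)
 *	gaw = 66 -> 66, then clamped to 64
 */
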
1359 static int domain_init(struct dmar_domain *domain, int guest_width)
1360 {
1361         struct intel_iommu *iommu;
1362         int adjust_width, agaw;
1363         unsigned long sagaw;
1364
1365         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1366         spin_lock_init(&domain->iommu_lock);
1367
1368         domain_reserve_special_ranges(domain);
1369
1370         /* calculate AGAW */
1371         iommu = domain_get_iommu(domain);
1372         if (guest_width > cap_mgaw(iommu->cap))
1373                 guest_width = cap_mgaw(iommu->cap);
1374         domain->gaw = guest_width;
1375         adjust_width = guestwidth_to_adjustwidth(guest_width);
1376         agaw = width_to_agaw(adjust_width);
1377         sagaw = cap_sagaw(iommu->cap);
1378         if (!test_bit(agaw, &sagaw)) {
1379                 /* hardware doesn't support it, choose a bigger one */
1380                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1381                 agaw = find_next_bit(&sagaw, 5, agaw);
1382                 if (agaw >= 5)
1383                         return -ENODEV;
1384         }
1385         domain->agaw = agaw;
1386         INIT_LIST_HEAD(&domain->devices);
1387
1388         if (ecap_coherent(iommu->ecap))
1389                 domain->iommu_coherency = 1;
1390         else
1391                 domain->iommu_coherency = 0;
1392
1393         if (ecap_sc_support(iommu->ecap))
1394                 domain->iommu_snooping = 1;
1395         else
1396                 domain->iommu_snooping = 0;
1397
1398         domain->iommu_count = 1;
1399         domain->nid = iommu->node;
1400
1401         /* always allocate the top pgd */
1402         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1403         if (!domain->pgd)
1404                 return -ENOMEM;
1405         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1406         return 0;
1407 }
1408
1409 static void domain_exit(struct dmar_domain *domain)
1410 {
1411         struct dmar_drhd_unit *drhd;
1412         struct intel_iommu *iommu;
1413
1414         /* Domain 0 is reserved, so don't process it */
1415         if (!domain)
1416                 return;
1417
1418         domain_remove_dev_info(domain);
1419         /* destroy iovas */
1420         put_iova_domain(&domain->iovad);
1421
1422         /* clear ptes */
1423         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1424
1425         /* free page tables */
1426         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1427
1428         for_each_active_iommu(iommu, drhd)
1429                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1430                         iommu_detach_domain(domain, iommu);
1431
1432         free_domain_mem(domain);
1433 }
1434
1435 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1436                                  u8 bus, u8 devfn, int translation)
1437 {
1438         struct context_entry *context;
1439         unsigned long flags;
1440         struct intel_iommu *iommu;
1441         struct dma_pte *pgd;
1442         unsigned long num;
1443         unsigned long ndomains;
1444         int id;
1445         int agaw;
1446         struct device_domain_info *info = NULL;
1447
1448         pr_debug("Set context mapping for %02x:%02x.%d\n",
1449                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1450
1451         BUG_ON(!domain->pgd);
1452         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1453                translation != CONTEXT_TT_MULTI_LEVEL);
1454
1455         iommu = device_to_iommu(segment, bus, devfn);
1456         if (!iommu)
1457                 return -ENODEV;
1458
1459         context = device_to_context_entry(iommu, bus, devfn);
1460         if (!context)
1461                 return -ENOMEM;
1462         spin_lock_irqsave(&iommu->lock, flags);
1463         if (context_present(context)) {
1464                 spin_unlock_irqrestore(&iommu->lock, flags);
1465                 return 0;
1466         }
1467
1468         id = domain->id;
1469         pgd = domain->pgd;
1470
1471         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1472             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1473                 int found = 0;
1474
1475                 /* find an available domain id for this device in iommu */
1476                 ndomains = cap_ndoms(iommu->cap);
1477                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1478                         if (iommu->domains[num] == domain) {
1479                                 id = num;
1480                                 found = 1;
1481                                 break;
1482                         }
1483                 }
1484
1485                 if (found == 0) {
1486                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1487                         if (num >= ndomains) {
1488                                 spin_unlock_irqrestore(&iommu->lock, flags);
1489                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1490                                 return -EFAULT;
1491                         }
1492
1493                         set_bit(num, iommu->domain_ids);
1494                         iommu->domains[num] = domain;
1495                         id = num;
1496                 }
1497
1498                 /* Skip top levels of page tables for
1499                  * iommus which have a smaller agaw than the default.
1500                  * Unnecessary for PT mode.
1501                  */
1502                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1503                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1504                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1505                                 if (!dma_pte_present(pgd)) {
1506                                         spin_unlock_irqrestore(&iommu->lock, flags);
1507                                         return -ENOMEM;
1508                                 }
1509                         }
1510                 }
1511         }
1512
1513         context_set_domain_id(context, id);
1514
1515         if (translation != CONTEXT_TT_PASS_THROUGH) {
1516                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1517                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1518                                      CONTEXT_TT_MULTI_LEVEL;
1519         }
1520         /*
1521          * In pass through mode, AW must be programmed to indicate the largest
1522          * AGAW value supported by hardware. And ASR is ignored by hardware.
1523          */
1524         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1525                 context_set_address_width(context, iommu->msagaw);
1526         else {
1527                 context_set_address_root(context, virt_to_phys(pgd));
1528                 context_set_address_width(context, iommu->agaw);
1529         }
1530
1531         context_set_translation_type(context, translation);
1532         context_set_fault_enable(context);
1533         context_set_present(context);
1534         domain_flush_cache(domain, context, sizeof(*context));
1535
1536         /*
1537          * It's a non-present to present mapping. If hardware doesn't cache
1538          * non-present entries we only need to flush the write-buffer. If it
1539          * _does_ cache non-present entries, then it does so in the special
1540          * domain #0, which we have to flush:
1541          */
1542         if (cap_caching_mode(iommu->cap)) {
1543                 iommu->flush.flush_context(iommu, 0,
1544                                            (((u16)bus) << 8) | devfn,
1545                                            DMA_CCMD_MASK_NOBIT,
1546                                            DMA_CCMD_DEVICE_INVL);
1547                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1548         } else {
1549                 iommu_flush_write_buffer(iommu);
1550         }
1551         iommu_enable_dev_iotlb(info);
1552         spin_unlock_irqrestore(&iommu->lock, flags);
1553
1554         spin_lock_irqsave(&domain->iommu_lock, flags);
1555         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1556                 domain->iommu_count++;
1557                 if (domain->iommu_count == 1)
1558                         domain->nid = iommu->node;
1559                 domain_update_iommu_cap(domain);
1560         }
1561         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1562         return 0;
1563 }
1564
1565 static int
1566 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1567                         int translation)
1568 {
1569         int ret;
1570         struct pci_dev *tmp, *parent;
1571
1572         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1573                                          pdev->bus->number, pdev->devfn,
1574                                          translation);
1575         if (ret)
1576                 return ret;
1577
1578         /* dependent device mapping */
1579         tmp = pci_find_upstream_pcie_bridge(pdev);
1580         if (!tmp)
1581                 return 0;
1582         /* Secondary interface's bus number and devfn 0 */
1583         parent = pdev->bus->self;
1584         while (parent != tmp) {
1585                 ret = domain_context_mapping_one(domain,
1586                                                  pci_domain_nr(parent->bus),
1587                                                  parent->bus->number,
1588                                                  parent->devfn, translation);
1589                 if (ret)
1590                         return ret;
1591                 parent = parent->bus->self;
1592         }
1593         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1594                 return domain_context_mapping_one(domain,
1595                                         pci_domain_nr(tmp->subordinate),
1596                                         tmp->subordinate->number, 0,
1597                                         translation);
1598         else /* this is a legacy PCI bridge */
1599                 return domain_context_mapping_one(domain,
1600                                                   pci_domain_nr(tmp->bus),
1601                                                   tmp->bus->number,
1602                                                   tmp->devfn,
1603                                                   translation);
1604 }
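/*
 * Editorial note: for a device behind a PCIe-to-PCI(-X) bridge, the walk
 * above programs a context entry for the device itself, for every bridge
 * between it and the upstream PCIe bridge, and finally for the bridge's
 * secondary bus at devfn 0 (or, for a legacy PCI bridge, for the bridge
 * itself), since DMA from any of them may carry the bridge's source-id.
 */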
1605
1606 static int domain_context_mapped(struct pci_dev *pdev)
1607 {
1608         int ret;
1609         struct pci_dev *tmp, *parent;
1610         struct intel_iommu *iommu;
1611
1612         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1613                                 pdev->devfn);
1614         if (!iommu)
1615                 return -ENODEV;
1616
1617         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1618         if (!ret)
1619                 return ret;
1620         /* dependent device mapping */
1621         tmp = pci_find_upstream_pcie_bridge(pdev);
1622         if (!tmp)
1623                 return ret;
1624         /* Secondary interface's bus number and devfn 0 */
1625         parent = pdev->bus->self;
1626         while (parent != tmp) {
1627                 ret = device_context_mapped(iommu, parent->bus->number,
1628                                             parent->devfn);
1629                 if (!ret)
1630                         return ret;
1631                 parent = parent->bus->self;
1632         }
1633         if (pci_is_pcie(tmp))
1634                 return device_context_mapped(iommu, tmp->subordinate->number,
1635                                              0);
1636         else
1637                 return device_context_mapped(iommu, tmp->bus->number,
1638                                              tmp->devfn);
1639 }
1640
1641 /* Returns a number of VTD pages, but aligned to MM page size */
1642 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1643                                             size_t size)
1644 {
1645         host_addr &= ~PAGE_MASK;
1646         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1647 }
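/*
 * Worked example (editorial illustration, not in the original source):
 * with 4KiB MM pages and 4KiB VT-d pages, host_addr = 0x1234 and
 * size = 0x2000 give host_addr & ~PAGE_MASK = 0x234, and
 * PAGE_ALIGN(0x234 + 0x2000) = 0x3000, so aligned_nrpages() returns
 * 3 VT-d pages even though the raw length is only two pages.
 */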
1648
1649 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1650                             struct scatterlist *sg, unsigned long phys_pfn,
1651                             unsigned long nr_pages, int prot)
1652 {
1653         struct dma_pte *first_pte = NULL, *pte = NULL;
1654         phys_addr_t uninitialized_var(pteval);
1655         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1656         unsigned long sg_res;
1657
1658         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1659
1660         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1661                 return -EINVAL;
1662
1663         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1664
1665         if (sg)
1666                 sg_res = 0;
1667         else {
1668                 sg_res = nr_pages + 1;
1669                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1670         }
1671
1672         while (nr_pages--) {
1673                 uint64_t tmp;
1674
1675                 if (!sg_res) {
1676                         sg_res = aligned_nrpages(sg->offset, sg->length);
1677                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1678                         sg->dma_length = sg->length;
1679                         pteval = page_to_phys(sg_page(sg)) | prot;
1680                 }
1681                 if (!pte) {
1682                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1683                         if (!pte)
1684                                 return -ENOMEM;
1685                 }
1686                 /* We don't need a lock here; nobody else
1687                  * touches this iova range.
1688                  */
1689                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1690                 if (tmp) {
1691                         static int dumps = 5;
1692                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1693                                iov_pfn, tmp, (unsigned long long)pteval);
1694                         if (dumps) {
1695                                 dumps--;
1696                                 debug_dma_dump_mappings(NULL);
1697                         }
1698                         WARN_ON(1);
1699                 }
1700                 pte++;
1701                 if (!nr_pages || first_pte_in_page(pte)) {
1702                         domain_flush_cache(domain, first_pte,
1703                                            (void *)pte - (void *)first_pte);
1704                         pte = NULL;
1705                 }
1706                 iov_pfn++;
1707                 pteval += VTD_PAGE_SIZE;
1708                 sg_res--;
1709                 if (!sg_res)
1710                         sg = sg_next(sg);
1711         }
1712         return 0;
1713 }
1714
1715 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1716                                     struct scatterlist *sg, unsigned long nr_pages,
1717                                     int prot)
1718 {
1719         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1720 }
1721
1722 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1723                                      unsigned long phys_pfn, unsigned long nr_pages,
1724                                      int prot)
1725 {
1726         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1727 }
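/*
 * Illustrative sketch only, not part of the original driver: a hypothetical
 * helper showing how a caller could use domain_pfn_mapping() to identity-map
 * one physical page, in the same spirit as iommu_domain_identity_map()
 * further down.  The name example_identity_map_page is invented purely for
 * illustration.
 */
static inline int example_identity_map_page(struct dmar_domain *domain,
					    unsigned long long phys)
{
	unsigned long vpfn = phys >> VTD_PAGE_SHIFT;

	/* one VT-d page, IOVA == physical address, read/write */
	return domain_pfn_mapping(domain, vpfn, vpfn, 1,
				  DMA_PTE_READ | DMA_PTE_WRITE);
}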
1728
1729 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1730 {
1731         if (!iommu)
1732                 return;
1733
1734         clear_context_table(iommu, bus, devfn);
1735         iommu->flush.flush_context(iommu, 0, 0, 0,
1736                                            DMA_CCMD_GLOBAL_INVL);
1737         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1738 }
1739
1740 static void domain_remove_dev_info(struct dmar_domain *domain)
1741 {
1742         struct device_domain_info *info;
1743         unsigned long flags;
1744         struct intel_iommu *iommu;
1745
1746         spin_lock_irqsave(&device_domain_lock, flags);
1747         while (!list_empty(&domain->devices)) {
1748                 info = list_entry(domain->devices.next,
1749                         struct device_domain_info, link);
1750                 list_del(&info->link);
1751                 list_del(&info->global);
1752                 if (info->dev)
1753                         info->dev->dev.archdata.iommu = NULL;
1754                 spin_unlock_irqrestore(&device_domain_lock, flags);
1755
1756                 iommu_disable_dev_iotlb(info);
1757                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1758                 iommu_detach_dev(iommu, info->bus, info->devfn);
1759                 free_devinfo_mem(info);
1760
1761                 spin_lock_irqsave(&device_domain_lock, flags);
1762         }
1763         spin_unlock_irqrestore(&device_domain_lock, flags);
1764 }
1765
1766 /*
1767  * find_domain
1768  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1769  */
1770 static struct dmar_domain *
1771 find_domain(struct pci_dev *pdev)
1772 {
1773         struct device_domain_info *info;
1774
1775         /* No lock here, assumes no domain exit in normal case */
1776         info = pdev->dev.archdata.iommu;
1777         if (info)
1778                 return info->domain;
1779         return NULL;
1780 }
1781
1782 /* domain is initialized */
1783 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1784 {
1785         struct dmar_domain *domain, *found = NULL;
1786         struct intel_iommu *iommu;
1787         struct dmar_drhd_unit *drhd;
1788         struct device_domain_info *info, *tmp;
1789         struct pci_dev *dev_tmp;
1790         unsigned long flags;
1791         int bus = 0, devfn = 0;
1792         int segment;
1793         int ret;
1794
1795         domain = find_domain(pdev);
1796         if (domain)
1797                 return domain;
1798
1799         segment = pci_domain_nr(pdev->bus);
1800
1801         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1802         if (dev_tmp) {
1803                 if (pci_is_pcie(dev_tmp)) {
1804                         bus = dev_tmp->subordinate->number;
1805                         devfn = 0;
1806                 } else {
1807                         bus = dev_tmp->bus->number;
1808                         devfn = dev_tmp->devfn;
1809                 }
1810                 spin_lock_irqsave(&device_domain_lock, flags);
1811                 list_for_each_entry(info, &device_domain_list, global) {
1812                         if (info->segment == segment &&
1813                             info->bus == bus && info->devfn == devfn) {
1814                                 found = info->domain;
1815                                 break;
1816                         }
1817                 }
1818                 spin_unlock_irqrestore(&device_domain_lock, flags);
1819                 /* pcie-pci bridge already has a domain, use it */
1820                 if (found) {
1821                         domain = found;
1822                         goto found_domain;
1823                 }
1824         }
1825
1826         domain = alloc_domain();
1827         if (!domain)
1828                 goto error;
1829
1830         /* Allocate new domain for the device */
1831         drhd = dmar_find_matched_drhd_unit(pdev);
1832         if (!drhd) {
1833                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1834                         pci_name(pdev));
1835                 return NULL;
1836         }
1837         iommu = drhd->iommu;
1838
1839         ret = iommu_attach_domain(domain, iommu);
1840         if (ret) {
1841                 domain_exit(domain);
1842                 goto error;
1843         }
1844
1845         if (domain_init(domain, gaw)) {
1846                 domain_exit(domain);
1847                 goto error;
1848         }
1849
1850         /* register pcie-to-pci device */
1851         if (dev_tmp) {
1852                 info = alloc_devinfo_mem();
1853                 if (!info) {
1854                         domain_exit(domain);
1855                         goto error;
1856                 }
1857                 info->segment = segment;
1858                 info->bus = bus;
1859                 info->devfn = devfn;
1860                 info->dev = NULL;
1861                 info->domain = domain;
1862                 /* This domain is shared by devices under p2p bridge */
1863                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1864
1865                 /* pcie-to-pci bridge already has a domain, use it */
1866                 found = NULL;
1867                 spin_lock_irqsave(&device_domain_lock, flags);
1868                 list_for_each_entry(tmp, &device_domain_list, global) {
1869                         if (tmp->segment == segment &&
1870                             tmp->bus == bus && tmp->devfn == devfn) {
1871                                 found = tmp->domain;
1872                                 break;
1873                         }
1874                 }
1875                 if (found) {
1876                         free_devinfo_mem(info);
1877                         domain_exit(domain);
1878                         domain = found;
1879                 } else {
1880                         list_add(&info->link, &domain->devices);
1881                         list_add(&info->global, &device_domain_list);
1882                 }
1883                 spin_unlock_irqrestore(&device_domain_lock, flags);
1884         }
1885
1886 found_domain:
1887         info = alloc_devinfo_mem();
1888         if (!info)
1889                 goto error;
1890         info->segment = segment;
1891         info->bus = pdev->bus->number;
1892         info->devfn = pdev->devfn;
1893         info->dev = pdev;
1894         info->domain = domain;
1895         spin_lock_irqsave(&device_domain_lock, flags);
1896         /* somebody else was faster and already set it up */
1897         found = find_domain(pdev);
1898         if (found != NULL) {
1899                 spin_unlock_irqrestore(&device_domain_lock, flags);
1900                 if (found != domain) {
1901                         domain_exit(domain);
1902                         domain = found;
1903                 }
1904                 free_devinfo_mem(info);
1905                 return domain;
1906         }
1907         list_add(&info->link, &domain->devices);
1908         list_add(&info->global, &device_domain_list);
1909         pdev->dev.archdata.iommu = info;
1910         spin_unlock_irqrestore(&device_domain_lock, flags);
1911         return domain;
1912 error:
1913         /* recheck it here, maybe others set it */
1914         return find_domain(pdev);
1915 }
1916
1917 static int iommu_identity_mapping;
1918 #define IDENTMAP_ALL            1
1919 #define IDENTMAP_GFX            2
1920 #define IDENTMAP_AZALIA         4
1921
1922 static int iommu_domain_identity_map(struct dmar_domain *domain,
1923                                      unsigned long long start,
1924                                      unsigned long long end)
1925 {
1926         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1927         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1928
1929         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1930                           dma_to_mm_pfn(last_vpfn))) {
1931                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1932                 return -ENOMEM;
1933         }
1934
1935         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1936                  start, end, domain->id);
1937         /*
1938          * RMRR range might have overlap with physical memory range,
1939          * clear it first
1940          */
1941         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1942
1943         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1944                                   last_vpfn - first_vpfn + 1,
1945                                   DMA_PTE_READ|DMA_PTE_WRITE);
1946 }
1947
1948 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1949                                       unsigned long long start,
1950                                       unsigned long long end)
1951 {
1952         struct dmar_domain *domain;
1953         int ret;
1954
1955         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1956         if (!domain)
1957                 return -ENOMEM;
1958
1959         /* For _hardware_ passthrough, don't bother. But for software
1960            passthrough, we do it anyway -- it may indicate a memory
1961            range which is reserved in E820 and so didn't get set
1962            up in si_domain to start with */
1963         if (domain == si_domain && hw_pass_through) {
1964                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1965                        pci_name(pdev), start, end);
1966                 return 0;
1967         }
1968
1969         printk(KERN_INFO
1970                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1971                pci_name(pdev), start, end);
1972         
1973         if (end < start) {
1974                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1975                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1976                         dmi_get_system_info(DMI_BIOS_VENDOR),
1977                         dmi_get_system_info(DMI_BIOS_VERSION),
1978                         dmi_get_system_info(DMI_PRODUCT_VERSION));
1979                 ret = -EIO;
1980                 goto error;
1981         }
1982
1983         if (end >> agaw_to_width(domain->agaw)) {
1984                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1985                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1986                      agaw_to_width(domain->agaw),
1987                      dmi_get_system_info(DMI_BIOS_VENDOR),
1988                      dmi_get_system_info(DMI_BIOS_VERSION),
1989                      dmi_get_system_info(DMI_PRODUCT_VERSION));
1990                 ret = -EIO;
1991                 goto error;
1992         }
1993
1994         ret = iommu_domain_identity_map(domain, start, end);
1995         if (ret)
1996                 goto error;
1997
1998         /* context entry init */
1999         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2000         if (ret)
2001                 goto error;
2002
2003         return 0;
2004
2005  error:
2006         domain_exit(domain);
2007         return ret;
2008 }
2009
2010 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2011         struct pci_dev *pdev)
2012 {
2013         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2014                 return 0;
2015         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2016                 rmrr->end_address + 1);
2017 }
2018
2019 #ifdef CONFIG_DMAR_FLOPPY_WA
2020 static inline void iommu_prepare_isa(void)
2021 {
2022         struct pci_dev *pdev;
2023         int ret;
2024
2025         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2026         if (!pdev)
2027                 return;
2028
2029         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2030         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2031
2032         if (ret)
2033                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2034                        "floppy might not work\n");
2035
2036 }
2037 #else
2038 static inline void iommu_prepare_isa(void)
2039 {
2040         return;
2041 }
2042 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2043
2044 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2045
2046 static int __init si_domain_work_fn(unsigned long start_pfn,
2047                                     unsigned long end_pfn, void *datax)
2048 {
2049         int *ret = datax;
2050
2051         *ret = iommu_domain_identity_map(si_domain,
2052                                          (uint64_t)start_pfn << PAGE_SHIFT,
2053                                          (uint64_t)end_pfn << PAGE_SHIFT);
2054         return *ret;
2055
2056 }
2057
2058 static int __init si_domain_init(int hw)
2059 {
2060         struct dmar_drhd_unit *drhd;
2061         struct intel_iommu *iommu;
2062         int nid, ret = 0;
2063
2064         si_domain = alloc_domain();
2065         if (!si_domain)
2066                 return -EFAULT;
2067
2068         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2069
2070         for_each_active_iommu(iommu, drhd) {
2071                 ret = iommu_attach_domain(si_domain, iommu);
2072                 if (ret) {
2073                         domain_exit(si_domain);
2074                         return -EFAULT;
2075                 }
2076         }
2077
2078         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2079                 domain_exit(si_domain);
2080                 return -EFAULT;
2081         }
2082
2083         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2084
2085         if (hw)
2086                 return 0;
2087
2088         for_each_online_node(nid) {
2089                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2090                 if (ret)
2091                         return ret;
2092         }
2093
2094         return 0;
2095 }
2096
2097 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2098                                           struct pci_dev *pdev);
2099 static int identity_mapping(struct pci_dev *pdev)
2100 {
2101         struct device_domain_info *info;
2102
2103         if (likely(!iommu_identity_mapping))
2104                 return 0;
2105
2106
2107         list_for_each_entry(info, &si_domain->devices, link)
2108                 if (info->dev == pdev)
2109                         return 1;
2110         return 0;
2111 }
2112
2113 static int domain_add_dev_info(struct dmar_domain *domain,
2114                                struct pci_dev *pdev,
2115                                int translation)
2116 {
2117         struct device_domain_info *info;
2118         unsigned long flags;
2119         int ret;
2120
2121         info = alloc_devinfo_mem();
2122         if (!info)
2123                 return -ENOMEM;
2124
2125         ret = domain_context_mapping(domain, pdev, translation);
2126         if (ret) {
2127                 free_devinfo_mem(info);
2128                 return ret;
2129         }
2130
2131         info->segment = pci_domain_nr(pdev->bus);
2132         info->bus = pdev->bus->number;
2133         info->devfn = pdev->devfn;
2134         info->dev = pdev;
2135         info->domain = domain;
2136
2137         spin_lock_irqsave(&device_domain_lock, flags);
2138         list_add(&info->link, &domain->devices);
2139         list_add(&info->global, &device_domain_list);
2140         pdev->dev.archdata.iommu = info;
2141         spin_unlock_irqrestore(&device_domain_lock, flags);
2142
2143         return 0;
2144 }
2145
2146 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2147 {
2148         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2149                 return 1;
2150
2151         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2152                 return 1;
2153
2154         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2155                 return 0;
2156
2157         /*
2158          * We want to start off with all devices in the 1:1 domain, and
2159          * take them out later if we find they can't access all of memory.
2160          *
2161          * However, we can't do this for PCI devices behind bridges,
2162          * because all PCI devices behind the same bridge will end up
2163          * with the same source-id on their transactions.
2164          *
2165          * Practically speaking, we can't change things around for these
2166          * devices at run-time, because we can't be sure there'll be no
2167          * DMA transactions in flight for any of their siblings.
2168          * 
2169          * So PCI devices (unless they're on the root bus) as well as
2170          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2171          * the 1:1 domain, just in _case_ one of their siblings turns out
2172          * not to be able to map all of memory.
2173          */
2174         if (!pci_is_pcie(pdev)) {
2175                 if (!pci_is_root_bus(pdev->bus))
2176                         return 0;
2177                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2178                         return 0;
2179         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2180                 return 0;
2181
2182         /* 
2183          * At boot time, we don't yet know if devices will be 64-bit capable.
2184          * Assume that they will -- if they turn out not to be, then we can 
2185          * take them out of the 1:1 domain later.
2186          */
2187         if (!startup)
2188                 return pdev->dma_mask > DMA_BIT_MASK(32);
2189
2190         return 1;
2191 }
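/*
 * Editorial example of the policy above, assuming IDENTMAP_ALL and no
 * GFX/Azalia quirk applies: a conventional PCI device behind a PCIe-to-PCI
 * bridge is never identity-mapped (it is neither PCIe nor on the root bus),
 * while a PCIe endpoint with a 64-bit dma_mask stays in the 1:1 domain both
 * at startup and at run time.
 */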
2192
2193 static int __init iommu_prepare_static_identity_mapping(int hw)
2194 {
2195         struct pci_dev *pdev = NULL;
2196         int ret;
2197
2198         ret = si_domain_init(hw);
2199         if (ret)
2200                 return -EFAULT;
2201
2202         for_each_pci_dev(pdev) {
2203                 if (iommu_should_identity_map(pdev, 1)) {
2204                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2205                                hw ? "hardware" : "software", pci_name(pdev));
2206
2207                         ret = domain_add_dev_info(si_domain, pdev,
2208                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2209                                                      CONTEXT_TT_MULTI_LEVEL);
2210                         if (ret)
2211                                 return ret;
2212                 }
2213         }
2214
2215         return 0;
2216 }
2217
2218 int __init init_dmars(void)
2219 {
2220         struct dmar_drhd_unit *drhd;
2221         struct dmar_rmrr_unit *rmrr;
2222         struct pci_dev *pdev;
2223         struct intel_iommu *iommu;
2224         int i, ret;
2225
2226         /*
2227          * for each drhd
2228          *    allocate root
2229          *    initialize and program root entry to not present
2230          * endfor
2231          */
2232         for_each_drhd_unit(drhd) {
2233                 g_num_of_iommus++;
2234                 /*
2235                  * lock not needed as this is only incremented in the single-
2236                  * threaded kernel __init code path; all other accesses are
2237                  * read only
2238                  */
2239         }
2240
2241         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2242                         GFP_KERNEL);
2243         if (!g_iommus) {
2244                 printk(KERN_ERR "Allocating global iommu array failed\n");
2245                 ret = -ENOMEM;
2246                 goto error;
2247         }
2248
2249         deferred_flush = kzalloc(g_num_of_iommus *
2250                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2251         if (!deferred_flush) {
2252                 ret = -ENOMEM;
2253                 goto error;
2254         }
2255
2256         for_each_drhd_unit(drhd) {
2257                 if (drhd->ignored)
2258                         continue;
2259
2260                 iommu = drhd->iommu;
2261                 g_iommus[iommu->seq_id] = iommu;
2262
2263                 ret = iommu_init_domains(iommu);
2264                 if (ret)
2265                         goto error;
2266
2267                 /*
2268                  * TBD:
2269                  * we could share the same root & context tables
2270                  * among all IOMMUs. Need to split it later.
2271                  */
2272                 ret = iommu_alloc_root_entry(iommu);
2273                 if (ret) {
2274                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2275                         goto error;
2276                 }
2277                 if (!ecap_pass_through(iommu->ecap))
2278                         hw_pass_through = 0;
2279         }
2280
2281         /*
2282          * Start from a sane iommu hardware state.
2283          */
2284         for_each_drhd_unit(drhd) {
2285                 if (drhd->ignored)
2286                         continue;
2287
2288                 iommu = drhd->iommu;
2289
2290                 /*
2291                  * If queued invalidation was already initialized by us
2292                  * (for example, while enabling interrupt-remapping), then
2293                  * things are already rolling from a sane state.
2294                  */
2295                 if (iommu->qi)
2296                         continue;
2297
2298                 /*
2299                  * Clear any previous faults.
2300                  */
2301                 dmar_fault(-1, iommu);
2302                 /*
2303                  * Disable queued invalidation if supported and already enabled
2304                  * before OS handover.
2305                  */
2306                 dmar_disable_qi(iommu);
2307         }
2308
2309         for_each_drhd_unit(drhd) {
2310                 if (drhd->ignored)
2311                         continue;
2312
2313                 iommu = drhd->iommu;
2314
2315                 if (dmar_enable_qi(iommu)) {
2316                         /*
2317                          * Queued Invalidate not enabled, use Register Based
2318                          * Invalidate
2319                          */
2320                         iommu->flush.flush_context = __iommu_flush_context;
2321                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2322                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2323                                "invalidation\n",
2324                                (unsigned long long)drhd->reg_base_addr);
2325                 } else {
2326                         iommu->flush.flush_context = qi_flush_context;
2327                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2328                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2329                                "invalidation\n",
2330                                (unsigned long long)drhd->reg_base_addr);
2331                 }
2332         }
2333
2334         if (iommu_pass_through)
2335                 iommu_identity_mapping |= IDENTMAP_ALL;
2336
2337 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2338         iommu_identity_mapping |= IDENTMAP_GFX;
2339 #endif
2340
2341         check_tylersburg_isoch();
2342
2343         /*
2344          * If pass through is not set or not enabled, set up context entries for
2345          * identity mappings for rmrr, gfx, and isa, and possibly fall back to
2346          * static identity mapping if iommu_identity_mapping is set.
2347          */
2348         if (iommu_identity_mapping) {
2349                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2350                 if (ret) {
2351                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2352                         goto error;
2353                 }
2354         }
2355         /*
2356          * For each rmrr
2357          *   for each dev attached to rmrr
2358          *   do
2359          *     locate drhd for dev, alloc domain for dev
2360          *     allocate free domain
2361          *     allocate page table entries for rmrr
2362          *     if context not allocated for bus
2363          *           allocate and init context
2364          *           set present in root table for this bus
2365          *     init context with domain, translation etc
2366          *    endfor
2367          * endfor
2368          */
2369         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2370         for_each_rmrr_units(rmrr) {
2371                 for (i = 0; i < rmrr->devices_cnt; i++) {
2372                         pdev = rmrr->devices[i];
2373                         /*
2374                          * some BIOSes list non-existent devices in the
2375                          * DMAR table.
2376                          */
2377                         if (!pdev)
2378                                 continue;
2379                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2380                         if (ret)
2381                                 printk(KERN_ERR
2382                                        "IOMMU: mapping reserved region failed\n");
2383                 }
2384         }
2385
2386         iommu_prepare_isa();
2387
2388         /*
2389          * for each drhd
2390          *   enable fault log
2391          *   global invalidate context cache
2392          *   global invalidate iotlb
2393          *   enable translation
2394          */
2395         for_each_drhd_unit(drhd) {
2396                 if (drhd->ignored)
2397                         continue;
2398                 iommu = drhd->iommu;
2399
2400                 iommu_flush_write_buffer(iommu);
2401
2402                 ret = dmar_set_interrupt(iommu);
2403                 if (ret)
2404                         goto error;
2405
2406                 iommu_set_root_entry(iommu);
2407
2408                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2409                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2410
2411                 ret = iommu_enable_translation(iommu);
2412                 if (ret)
2413                         goto error;
2414
2415                 iommu_disable_protect_mem_regions(iommu);
2416         }
2417
2418         return 0;
2419 error:
2420         for_each_drhd_unit(drhd) {
2421                 if (drhd->ignored)
2422                         continue;
2423                 iommu = drhd->iommu;
2424                 free_iommu(iommu);
2425         }
2426         kfree(g_iommus);
2427         return ret;
2428 }
2429
2430 /* This takes a number of _MM_ pages, not VTD pages */
2431 static struct iova *intel_alloc_iova(struct device *dev,
2432                                      struct dmar_domain *domain,
2433                                      unsigned long nrpages, uint64_t dma_mask)
2434 {
2435         struct pci_dev *pdev = to_pci_dev(dev);
2436         struct iova *iova = NULL;
2437
2438         /* Restrict dma_mask to the width that the iommu can handle */
2439         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2440
2441         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2442                 /*
2443                  * First try to allocate an io virtual address below
2444                  * DMA_BIT_MASK(32); if that fails, then try allocating
2445                  * from the higher range.
2446                  */
2447                 iova = alloc_iova(&domain->iovad, nrpages,
2448                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2449                 if (iova)
2450                         return iova;
2451         }
2452         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2453         if (unlikely(!iova)) {
2454                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2455                        nrpages, pci_name(pdev));
2456                 return NULL;
2457         }
2458
2459         return iova;
2460 }
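/*
 * Worked example (editorial illustration): __intel_map_single() below first
 * computes the request size in VT-d pages with aligned_nrpages() and then
 * converts it with dma_to_mm_pfn() before calling intel_alloc_iova().  With
 * 4KiB MM pages that conversion is a no-op; with 64KiB MM pages, 16 VT-d
 * pages collapse into a single MM page of iova space.
 */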
2461
2462 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2463 {
2464         struct dmar_domain *domain;
2465         int ret;
2466
2467         domain = get_domain_for_dev(pdev,
2468                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2469         if (!domain) {
2470                 printk(KERN_ERR
2471                         "Allocating domain for %s failed\n", pci_name(pdev));
2472                 return NULL;
2473         }
2474
2475         /* make sure context mapping is ok */
2476         if (unlikely(!domain_context_mapped(pdev))) {
2477                 ret = domain_context_mapping(domain, pdev,
2478                                              CONTEXT_TT_MULTI_LEVEL);
2479                 if (ret) {
2480                         printk(KERN_ERR
2481                                 "Domain context map for %s failed\n",
2482                                 pci_name(pdev));
2483                         return NULL;
2484                 }
2485         }
2486
2487         return domain;
2488 }
2489
2490 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2491 {
2492         struct device_domain_info *info;
2493
2494         /* No lock here, assumes no domain exit in normal case */
2495         info = dev->dev.archdata.iommu;
2496         if (likely(info))
2497                 return info->domain;
2498
2499         return __get_valid_domain_for_dev(dev);
2500 }
2501
2502 static int iommu_dummy(struct pci_dev *pdev)
2503 {
2504         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2505 }
2506
2507 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2508 static int iommu_no_mapping(struct device *dev)
2509 {
2510         struct pci_dev *pdev;
2511         int found;
2512
2513         if (unlikely(dev->bus != &pci_bus_type))
2514                 return 1;
2515
2516         pdev = to_pci_dev(dev);
2517         if (iommu_dummy(pdev))
2518                 return 1;
2519
2520         if (!iommu_identity_mapping)
2521                 return 0;
2522
2523         found = identity_mapping(pdev);
2524         if (found) {
2525                 if (iommu_should_identity_map(pdev, 0))
2526                         return 1;
2527                 else {
2528                         /*
2529                          * The device only supports 32 bit DMA; remove it from
2530                          * si_domain and fall back to non-identity mapping.
2531                          */
2532                         domain_remove_one_dev_info(si_domain, pdev);
2533                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2534                                pci_name(pdev));
2535                         return 0;
2536                 }
2537         } else {
2538                 /*
2539                  * A 64 bit DMA capable device that was detached from a VM
2540                  * is put back into si_domain for identity mapping.
2541                  */
2542                 if (iommu_should_identity_map(pdev, 0)) {
2543                         int ret;
2544                         ret = domain_add_dev_info(si_domain, pdev,
2545                                                   hw_pass_through ?
2546                                                   CONTEXT_TT_PASS_THROUGH :
2547                                                   CONTEXT_TT_MULTI_LEVEL);
2548                         if (!ret) {
2549                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2550                                        pci_name(pdev));
2551                                 return 1;
2552                         }
2553                 }
2554         }
2555
2556         return 0;
2557 }
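/*
 * Editorial summary: iommu_no_mapping() returns 1 for devices that bypass
 * DMA remapping (non-PCI devices, dummy devices, and devices that are in,
 * or have just been added to, si_domain) and 0 for devices that must go
 * through the normal map/unmap path below.
 */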
2558
2559 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2560                                      size_t size, int dir, u64 dma_mask)
2561 {
2562         struct pci_dev *pdev = to_pci_dev(hwdev);
2563         struct dmar_domain *domain;
2564         phys_addr_t start_paddr;
2565         struct iova *iova;
2566         int prot = 0;
2567         int ret;
2568         struct intel_iommu *iommu;
2569         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2570
2571         BUG_ON(dir == DMA_NONE);
2572
2573         if (iommu_no_mapping(hwdev))
2574                 return paddr;
2575
2576         domain = get_valid_domain_for_dev(pdev);
2577         if (!domain)
2578                 return 0;
2579
2580         iommu = domain_get_iommu(domain);
2581         size = aligned_nrpages(paddr, size);
2582
2583         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2584                                 pdev->dma_mask);
2585         if (!iova)
2586                 goto error;
2587
2588         /*
2589          * Check if DMAR supports zero-length reads on write-only
2590          * mappings.
2591          */
2592         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2593                         !cap_zlr(iommu->cap))
2594                 prot |= DMA_PTE_READ;
2595         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2596                 prot |= DMA_PTE_WRITE;
2597         /*
2598          * paddr .. (paddr + size) might cover a partial page, so we map the whole
2599          * page.  Note: if two parts of one page are separately mapped, we
2600          * might have two guest addresses mapping to the same host paddr, but this
2601          * is not a big problem.
2602          */
2603         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2604                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2605         if (ret)
2606                 goto error;
2607
2608         /* it's a non-present to present mapping. Only flush if caching mode */
2609         if (cap_caching_mode(iommu->cap))
2610                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2611         else
2612                 iommu_flush_write_buffer(iommu);
2613
2614         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2615         start_paddr += paddr & ~PAGE_MASK;
2616         return start_paddr;
2617
2618 error:
2619         if (iova)
2620                 __free_iova(&domain->iovad, iova);
2621         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2622                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2623         return 0;
2624 }
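/*
 * Worked example (editorial illustration, 4KiB pages): if the allocator
 * returns an iova with pfn_lo == 0x7ffff and the caller passed
 * paddr == 0x12345678, the function returns 0x7ffff000 + 0x678 =
 * 0x7ffff678, i.e. the page offset of the physical address is preserved
 * in the returned bus address.
 */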
2625
2626 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2627                                  unsigned long offset, size_t size,
2628                                  enum dma_data_direction dir,
2629                                  struct dma_attrs *attrs)
2630 {
2631         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2632                                   dir, to_pci_dev(dev)->dma_mask);
2633 }
2634
2635 static void flush_unmaps(void)
2636 {
2637         int i, j;
2638
2639         timer_on = 0;
2640
2641         /* just flush them all */
2642         for (i = 0; i < g_num_of_iommus; i++) {
2643                 struct intel_iommu *iommu = g_iommus[i];
2644                 if (!iommu)
2645                         continue;
2646
2647                 if (!deferred_flush[i].next)
2648                         continue;
2649
2650                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2651                                          DMA_TLB_GLOBAL_FLUSH);
2652                 for (j = 0; j < deferred_flush[i].next; j++) {
2653                         unsigned long mask;
2654                         struct iova *iova = deferred_flush[i].iova[j];
2655
2656                         mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2657                         iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2658                                         (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2659                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2660                 }
2661                 deferred_flush[i].next = 0;
2662         }
2663
2664         list_size = 0;
2665 }
2666
2667 static void flush_unmaps_timeout(unsigned long data)
2668 {
2669         unsigned long flags;
2670
2671         spin_lock_irqsave(&async_umap_flush_lock, flags);
2672         flush_unmaps();
2673         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2674 }
2675
2676 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2677 {
2678         unsigned long flags;
2679         int next, iommu_id;
2680         struct intel_iommu *iommu;
2681
2682         spin_lock_irqsave(&async_umap_flush_lock, flags);
2683         if (list_size == HIGH_WATER_MARK)
2684                 flush_unmaps();
2685
2686         iommu = domain_get_iommu(dom);
2687         iommu_id = iommu->seq_id;
2688
2689         next = deferred_flush[iommu_id].next;
2690         deferred_flush[iommu_id].domain[next] = dom;
2691         deferred_flush[iommu_id].iova[next] = iova;
2692         deferred_flush[iommu_id].next++;
2693
2694         if (!timer_on) {
2695                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2696                 timer_on = 1;
2697         }
2698         list_size++;
2699         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2700 }
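/*
 * Editorial summary of the deferred-unmap flow: in non-strict mode,
 * intel_unmap_page()/intel_unmap_sg() below hand the iova to add_unmap(),
 * which batches it per IOMMU in deferred_flush[].  The batch is drained
 * either when list_size reaches HIGH_WATER_MARK or when the 10ms
 * unmap_timer fires; flush_unmaps() then issues one global IOTLB flush per
 * IOMMU, flushes the device IOTLBs and frees the queued iovas.
 */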
2701
2702 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2703                              size_t size, enum dma_data_direction dir,
2704                              struct dma_attrs *attrs)
2705 {
2706         struct pci_dev *pdev = to_pci_dev(dev);
2707         struct dmar_domain *domain;
2708         unsigned long start_pfn, last_pfn;
2709         struct iova *iova;
2710         struct intel_iommu *iommu;
2711
2712         if (iommu_no_mapping(dev))
2713                 return;
2714
2715         domain = find_domain(pdev);
2716         BUG_ON(!domain);
2717
2718         iommu = domain_get_iommu(domain);
2719
2720         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2721         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2722                       (unsigned long long)dev_addr))
2723                 return;
2724
2725         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2726         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2727
2728         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2729                  pci_name(pdev), start_pfn, last_pfn);
2730
2731         /*  clear the whole page */
2732         dma_pte_clear_range(domain, start_pfn, last_pfn);
2733
2734         /* free page tables */
2735         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2736
2737         if (intel_iommu_strict) {
2738                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2739                                       last_pfn - start_pfn + 1, 0);
2740                 /* free iova */
2741                 __free_iova(&domain->iovad, iova);
2742         } else {
2743                 add_unmap(domain, iova);
2744                 /*
2745                  * queue up the release of the unmap to save roughly 1/6th of the
2746                  * cpu time otherwise used up by the iotlb flush operation...
2747                  */
2748         }
2749 }
2750
2751 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2752                                   dma_addr_t *dma_handle, gfp_t flags)
2753 {
2754         void *vaddr;
2755         int order;
2756
2757         size = PAGE_ALIGN(size);
2758         order = get_order(size);
2759
2760         if (!iommu_no_mapping(hwdev))
2761                 flags &= ~(GFP_DMA | GFP_DMA32);
2762         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2763                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2764                         flags |= GFP_DMA;
2765                 else
2766                         flags |= GFP_DMA32;
2767         }
2768
2769         vaddr = (void *)__get_free_pages(flags, order);
2770         if (!vaddr)
2771                 return NULL;
2772         memset(vaddr, 0, size);
2773
2774         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2775                                          DMA_BIDIRECTIONAL,
2776                                          hwdev->coherent_dma_mask);
2777         if (*dma_handle)
2778                 return vaddr;
2779         free_pages((unsigned long)vaddr, order);
2780         return NULL;
2781 }
2782
2783 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2784                                 dma_addr_t dma_handle)
2785 {
2786         int order;
2787
2788         size = PAGE_ALIGN(size);
2789         order = get_order(size);
2790
2791         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2792         free_pages((unsigned long)vaddr, order);
2793 }
2794
2795 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2796                            int nelems, enum dma_data_direction dir,
2797                            struct dma_attrs *attrs)
2798 {
2799         struct pci_dev *pdev = to_pci_dev(hwdev);
2800         struct dmar_domain *domain;
2801         unsigned long start_pfn, last_pfn;
2802         struct iova *iova;
2803         struct intel_iommu *iommu;
2804
2805         if (iommu_no_mapping(hwdev))
2806                 return;
2807
2808         domain = find_domain(pdev);
2809         BUG_ON(!domain);
2810
2811         iommu = domain_get_iommu(domain);
2812
2813         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2814         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2815                       (unsigned long long)sglist[0].dma_address))
2816                 return;
2817
2818         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2819         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2820
2821         /*  clear the whole page */
2822         dma_pte_clear_range(domain, start_pfn, last_pfn);
2823
2824         /* free page tables */
2825         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2826
2827         if (intel_iommu_strict) {
2828                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2829                                       last_pfn - start_pfn + 1, 0);
2830                 /* free iova */
2831                 __free_iova(&domain->iovad, iova);
2832         } else {
2833                 add_unmap(domain, iova);
2834                 /*
2835                  * queue up the release of the unmap to save roughly 1/6th of the
2836                  * cpu time otherwise used up by the iotlb flush operation...
2837                  */
2838         }
2839 }
2840
2841 static int intel_nontranslate_map_sg(struct device *hddev,
2842         struct scatterlist *sglist, int nelems, int dir)
2843 {
2844         int i;
2845         struct scatterlist *sg;
2846
2847         for_each_sg(sglist, sg, nelems, i) {
2848                 BUG_ON(!sg_page(sg));
2849                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2850                 sg->dma_length = sg->length;
2851         }
2852         return nelems;
2853 }
2854
2855 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2856                         enum dma_data_direction dir, struct dma_attrs *attrs)
2857 {
2858         int i;
2859         struct pci_dev *pdev = to_pci_dev(hwdev);
2860         struct dmar_domain *domain;
2861         size_t size = 0;
2862         int prot = 0;
2863         struct iova *iova = NULL;
2864         int ret;
2865         struct scatterlist *sg;
2866         unsigned long start_vpfn;
2867         struct intel_iommu *iommu;
2868
2869         BUG_ON(dir == DMA_NONE);
2870         if (iommu_no_mapping(hwdev))
2871                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2872
2873         domain = get_valid_domain_for_dev(pdev);
2874         if (!domain)
2875                 return 0;
2876
2877         iommu = domain_get_iommu(domain);
2878
2879         for_each_sg(sglist, sg, nelems, i)
2880                 size += aligned_nrpages(sg->offset, sg->length);
2881
2882         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2883                                 pdev->dma_mask);
2884         if (!iova) {
2885                 sglist->dma_length = 0;
2886                 return 0;
2887         }
2888
2889         /*
2890          * Check if DMAR supports zero-length reads on write-only
2891          * mappings.
2892          */
2893         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2894                         !cap_zlr(iommu->cap))
2895                 prot |= DMA_PTE_READ;
2896         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2897                 prot |= DMA_PTE_WRITE;
2898
2899         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2900
2901         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2902         if (unlikely(ret)) {
2903                 /*  clear the page */
2904                 dma_pte_clear_range(domain, start_vpfn,
2905                                     start_vpfn + size - 1);
2906                 /* free page tables */
2907                 dma_pte_free_pagetable(domain, start_vpfn,
2908                                        start_vpfn + size - 1);
2909                 /* free iova */
2910                 __free_iova(&domain->iovad, iova);
2911                 return 0;
2912         }
2913
2914         /* it's a non-present to present mapping. Only flush if caching mode */
2915         if (cap_caching_mode(iommu->cap))
2916                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
2917         else
2918                 iommu_flush_write_buffer(iommu);
2919
2920         return nelems;
2921 }
2922
2923 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2924 {
2925         return !dma_addr;
2926 }
2927
2928 struct dma_map_ops intel_dma_ops = {
2929         .alloc_coherent = intel_alloc_coherent,
2930         .free_coherent = intel_free_coherent,
2931         .map_sg = intel_map_sg,
2932         .unmap_sg = intel_unmap_sg,
2933         .map_page = intel_map_page,
2934         .unmap_page = intel_unmap_page,
2935         .mapping_error = intel_mapping_error,
2936 };
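/*
 * Illustrative sketch only, not part of the original driver: once
 * intel_dma_ops is installed as the platform's dma_map_ops, an ordinary
 * driver's streaming DMA calls are routed to the functions above.  The
 * name example_streaming_dma is invented; dma_map_page(), dma_mapping_error()
 * and dma_unmap_page() are the generic DMA API.
 */
static inline int example_streaming_dma(struct pci_dev *pdev,
					struct page *page, size_t len)
{
	/* ends up in intel_map_page() -> __intel_map_single() */
	dma_addr_t handle = dma_map_page(&pdev->dev, page, 0, len,
					 DMA_TO_DEVICE);

	/* ends up in intel_mapping_error() */
	if (dma_mapping_error(&pdev->dev, handle))
		return -EIO;

	/* ... program the device to DMA from 'handle' here ... */

	/* ends up in intel_unmap_page() */
	dma_unmap_page(&pdev->dev, handle, len, DMA_TO_DEVICE);
	return 0;
}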
2937
2938 static inline int iommu_domain_cache_init(void)
2939 {
2940         int ret = 0;
2941
2942         iommu_domain_cache = kmem_cache_create("iommu_domain",
2943                                          sizeof(struct dmar_domain),
2944                                          0,
2945                                          SLAB_HWCACHE_ALIGN,
2946
2947                                          NULL);
2948         if (!iommu_domain_cache) {
2949                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2950                 ret = -ENOMEM;
2951         }
2952
2953         return ret;
2954 }
2955
2956 static inline int iommu_devinfo_cache_init(void)
2957 {
2958         int ret = 0;
2959
2960         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2961                                          sizeof(struct device_domain_info),
2962                                          0,
2963                                          SLAB_HWCACHE_ALIGN,
2964                                          NULL);
2965         if (!iommu_devinfo_cache) {
2966                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2967                 ret = -ENOMEM;
2968         }
2969
2970         return ret;
2971 }
2972
2973 static inline int iommu_iova_cache_init(void)
2974 {
2975         int ret = 0;
2976
2977         iommu_iova_cache = kmem_cache_create("iommu_iova",
2978                                          sizeof(struct iova),
2979                                          0,
2980                                          SLAB_HWCACHE_ALIGN,
2981                                          NULL);
2982         if (!iommu_iova_cache) {
2983                 printk(KERN_ERR "Couldn't create iova cache\n");
2984                 ret = -ENOMEM;
2985         }
2986
2987         return ret;
2988 }
2989
2990 static int __init iommu_init_mempool(void)
2991 {
2992         int ret;
2993         ret = iommu_iova_cache_init();
2994         if (ret)
2995                 return ret;
2996
2997         ret = iommu_domain_cache_init();
2998         if (ret)
2999                 goto domain_error;
3000
3001         ret = iommu_devinfo_cache_init();
3002         if (!ret)
3003                 return ret;
3004
3005         kmem_cache_destroy(iommu_domain_cache);
3006 domain_error:
3007         kmem_cache_destroy(iommu_iova_cache);
3008
3009         return -ENOMEM;
3010 }
3011
3012 static void __init iommu_exit_mempool(void)
3013 {
3014         kmem_cache_destroy(iommu_devinfo_cache);
3015         kmem_cache_destroy(iommu_domain_cache);
3016         kmem_cache_destroy(iommu_iova_cache);
3017
3018 }
3019
3020 static void __init init_no_remapping_devices(void)
3021 {
3022         struct dmar_drhd_unit *drhd;
3023
3024         for_each_drhd_unit(drhd) {
3025                 if (!drhd->include_all) {
3026                         int i;
3027                         for (i = 0; i < drhd->devices_cnt; i++)
3028                                 if (drhd->devices[i] != NULL)
3029                                         break;
3030                         /* ignore DMAR unit if no pci devices exist */
3031                         if (i == drhd->devices_cnt)
3032                                 drhd->ignored = 1;
3033                 }
3034         }
3035
3036         if (dmar_map_gfx)
3037                 return;
3038
3039         for_each_drhd_unit(drhd) {
3040                 int i;
3041                 if (drhd->ignored || drhd->include_all)
3042                         continue;
3043
3044                 for (i = 0; i < drhd->devices_cnt; i++)
3045                         if (drhd->devices[i] &&
3046                                 !IS_GFX_DEVICE(drhd->devices[i]))
3047                                 break;
3048
3049                 if (i < drhd->devices_cnt)
3050                         continue;
3051
3052                 /* bypass IOMMU if it is just for gfx devices */
3053                 drhd->ignored = 1;
3054                 for (i = 0; i < drhd->devices_cnt; i++) {
3055                         if (!drhd->devices[i])
3056                                 continue;
3057                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3058                 }
3059         }
3060 }
3061
3062 #ifdef CONFIG_SUSPEND
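/*
 * On resume, bring each active IOMMU back to a working state:
 * re-enable queued invalidation where it was in use, reprogram the
 * root entry, flush the context and IOTLB caches globally, then
 * re-enable translation.
 */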
3063 static int init_iommu_hw(void)
3064 {
3065         struct dmar_drhd_unit *drhd;
3066         struct intel_iommu *iommu = NULL;
3067
3068         for_each_active_iommu(iommu, drhd)
3069                 if (iommu->qi)
3070                         dmar_reenable_qi(iommu);
3071
3072         for_each_active_iommu(iommu, drhd) {
3073                 iommu_flush_write_buffer(iommu);
3074
3075                 iommu_set_root_entry(iommu);
3076
3077                 iommu->flush.flush_context(iommu, 0, 0, 0,
3078                                            DMA_CCMD_GLOBAL_INVL);
3079                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3080                                          DMA_TLB_GLOBAL_FLUSH);
3081                 iommu_enable_translation(iommu);
3082                 iommu_disable_protect_mem_regions(iommu);
3083         }
3084
3085         return 0;
3086 }
3087
3088 static void iommu_flush_all(void)
3089 {
3090         struct dmar_drhd_unit *drhd;
3091         struct intel_iommu *iommu;
3092
3093         for_each_active_iommu(iommu, drhd) {
3094                 iommu->flush.flush_context(iommu, 0, 0, 0,
3095                                            DMA_CCMD_GLOBAL_INVL);
3096                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3097                                          DMA_TLB_GLOBAL_FLUSH);
3098         }
3099 }
3100
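/*
 * Save the fault-event registers of every active IOMMU and disable
 * DMA remapping before the system enters a sleep state.
 */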
3101 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
3102 {
3103         struct dmar_drhd_unit *drhd;
3104         struct intel_iommu *iommu = NULL;
3105         unsigned long flag;
3106
3107         for_each_active_iommu(iommu, drhd) {
3108                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3109                                                  GFP_ATOMIC);
3110                 if (!iommu->iommu_state)
3111                         goto nomem;
3112         }
3113
3114         iommu_flush_all();
3115
3116         for_each_active_iommu(iommu, drhd) {
3117                 iommu_disable_translation(iommu);
3118
3119                 spin_lock_irqsave(&iommu->register_lock, flag);
3120
3121                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3122                         readl(iommu->reg + DMAR_FECTL_REG);
3123                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3124                         readl(iommu->reg + DMAR_FEDATA_REG);
3125                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3126                         readl(iommu->reg + DMAR_FEADDR_REG);
3127                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3128                         readl(iommu->reg + DMAR_FEUADDR_REG);
3129
3130                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3131         }
3132         return 0;
3133
3134 nomem:
3135         for_each_active_iommu(iommu, drhd)
3136                 kfree(iommu->iommu_state);
3137
3138         return -ENOMEM;
3139 }
3140
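/*
 * Re-initialize the hardware via init_iommu_hw() and restore the
 * fault-event registers that iommu_suspend() saved.
 */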
3141 static int iommu_resume(struct sys_device *dev)
3142 {
3143         struct dmar_drhd_unit *drhd;
3144         struct intel_iommu *iommu = NULL;
3145         unsigned long flag;
3146
3147         if (init_iommu_hw()) {
3148                 WARN(1, "IOMMU setup failed, DMAR cannot resume!\n");
3149                 return -EIO;
3150         }
3151
3152         for_each_active_iommu(iommu, drhd) {
3153
3154                 spin_lock_irqsave(&iommu->register_lock, flag);
3155
3156                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3157                         iommu->reg + DMAR_FECTL_REG);
3158                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3159                         iommu->reg + DMAR_FEDATA_REG);
3160                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3161                         iommu->reg + DMAR_FEADDR_REG);
3162                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3163                         iommu->reg + DMAR_FEUADDR_REG);
3164
3165                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3166         }
3167
3168         for_each_active_iommu(iommu, drhd)
3169                 kfree(iommu->iommu_state);
3170
3171         return 0;
3172 }
3173
3174 static struct sysdev_class iommu_sysclass = {
3175         .name           = "iommu",
3176         .resume         = iommu_resume,
3177         .suspend        = iommu_suspend,
3178 };
3179
3180 static struct sys_device device_iommu = {
3181         .cls    = &iommu_sysclass,
3182 };
3183
3184 static int __init init_iommu_sysfs(void)
3185 {
3186         int error;
3187
3188         error = sysdev_class_register(&iommu_sysclass);
3189         if (error)
3190                 return error;
3191
3192         error = sysdev_register(&device_iommu);
3193         if (error)
3194                 sysdev_class_unregister(&iommu_sysclass);
3195
3196         return error;
3197 }
3198
3199 #else
3200 static int __init init_iommu_sysfs(void)
3201 {
3202         return 0;
3203 }
3204 #endif  /* CONFIG_SUSPEND */
3205
3206 /*
3207  * Here we only respond to a device being unbound from its driver.
3208  *
3209  * A newly added device is not attached to its DMAR domain here yet; that
3210  * happens when the device is first mapped to an iova.
3211  */
3212 static int device_notifier(struct notifier_block *nb,
3213                                   unsigned long action, void *data)
3214 {
3215         struct device *dev = data;
3216         struct pci_dev *pdev = to_pci_dev(dev);
3217         struct dmar_domain *domain;
3218
3219         if (iommu_no_mapping(dev))
3220                 return 0;
3221
3222         domain = find_domain(pdev);
3223         if (!domain)
3224                 return 0;
3225
3226         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through)
3227                 domain_remove_one_dev_info(domain, pdev);
3228
3229         return 0;
3230 }
3231
3232 static struct notifier_block device_nb = {
3233         .notifier_call = device_notifier,
3234 };
3235
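/*
 * Main VT-d entry point: parse the DMAR table and device scopes,
 * honour no_iommu/dmar_disabled, set up the slab caches and reserved
 * IOVA ranges, initialize the DMAR units via init_dmars(), and then
 * install intel_dma_ops, the generic IOMMU ops and the PCI bus
 * notifier.
 */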
3236 int __init intel_iommu_init(void)
3237 {
3238         int ret = 0;
3239         int force_on = 0;
3240
3241         /* VT-d is required for a TXT/tboot launch, so enforce that */
3242         force_on = tboot_force_iommu();
3243
3244         if (dmar_table_init()) {
3245                 if (force_on)
3246                         panic("tboot: Failed to initialize DMAR table\n");
3247                 return  -ENODEV;
3248         }
3249
3250         if (dmar_dev_scope_init()) {
3251                 if (force_on)
3252                         panic("tboot: Failed to initialize DMAR device scope\n");
3253                 return  -ENODEV;
3254         }
3255
3256         /*
3257          * Check the need for DMA-remapping initialization now.
3258          * Above initialization will also be used by Interrupt-remapping.
3259          */
3260         if (no_iommu || dmar_disabled)
3261                 return -ENODEV;
3262
3263         iommu_init_mempool();
3264         dmar_init_reserved_ranges();
3265
3266         init_no_remapping_devices();
3267
3268         ret = init_dmars();
3269         if (ret) {
3270                 if (force_on)
3271                         panic("tboot: Failed to initialize DMARs\n");
3272                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3273                 put_iova_domain(&reserved_iova_list);
3274                 iommu_exit_mempool();
3275                 return ret;
3276         }
3277         printk(KERN_INFO
3278         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3279
3280         init_timer(&unmap_timer);
3281 #ifdef CONFIG_SWIOTLB
3282         swiotlb = 0;
3283 #endif
3284         dma_ops = &intel_dma_ops;
3285
3286         init_iommu_sysfs();
3287
3288         register_iommu(&intel_iommu_ops);
3289
3290         bus_register_notifier(&pci_bus_type, &device_nb);
3291
3292         return 0;
3293 }
3294
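/*
 * A device behind a PCIe-to-PCI(-X) bridge is context-mapped through
 * the bridges upstream of it as well, so those context entries must
 * be torn down when the device itself is detached.
 */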
3295 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3296                                            struct pci_dev *pdev)
3297 {
3298         struct pci_dev *tmp, *parent;
3299
3300         if (!iommu || !pdev)
3301                 return;
3302
3303         /* dependent device detach */
3304         tmp = pci_find_upstream_pcie_bridge(pdev);
3305         /* PCIe-to-PCI bridges are mapped with secondary bus number, devfn 0 */
3306         if (tmp) {
3307                 parent = pdev->bus->self;
3308                 while (parent != tmp) {
3309                         iommu_detach_dev(iommu, parent->bus->number,
3310                                          parent->devfn);
3311                         parent = parent->bus->self;
3312                 }
3313                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3314                         iommu_detach_dev(iommu,
3315                                 tmp->subordinate->number, 0);
3316                 else /* this is a legacy PCI bridge */
3317                         iommu_detach_dev(iommu, tmp->bus->number,
3318                                          tmp->devfn);
3319         }
3320 }
3321
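/*
 * Detach a single device from a domain.  If no other device behind
 * the same IOMMU remains in the domain, also clear that IOMMU from
 * iommu_bmp and refresh the domain's iommu count and capabilities.
 */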
3322 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3323                                           struct pci_dev *pdev)
3324 {
3325         struct device_domain_info *info;
3326         struct intel_iommu *iommu;
3327         unsigned long flags;
3328         int found = 0;
3329         struct list_head *entry, *tmp;
3330
3331         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3332                                 pdev->devfn);
3333         if (!iommu)
3334                 return;
3335
3336         spin_lock_irqsave(&device_domain_lock, flags);
3337         list_for_each_safe(entry, tmp, &domain->devices) {
3338                 info = list_entry(entry, struct device_domain_info, link);
3339                 /* No need to compare PCI domain; it has to be the same */
3340                 if (info->bus == pdev->bus->number &&
3341                     info->devfn == pdev->devfn) {
3342                         list_del(&info->link);
3343                         list_del(&info->global);
3344                         if (info->dev)
3345                                 info->dev->dev.archdata.iommu = NULL;
3346                         spin_unlock_irqrestore(&device_domain_lock, flags);
3347
3348                         iommu_disable_dev_iotlb(info);
3349                         iommu_detach_dev(iommu, info->bus, info->devfn);
3350                         iommu_detach_dependent_devices(iommu, pdev);
3351                         free_devinfo_mem(info);
3352
3353                         spin_lock_irqsave(&device_domain_lock, flags);
3354
3355                         if (found)
3356                                 break;
3357                         else
3358                                 continue;
3359                 }
3360
3361                 /* If there are no other devices under the same iommu
3362                  * owned by this domain, clear this iommu in iommu_bmp,
3363                  * and update the iommu count and coherency.
3364                  */
3365                 if (iommu == device_to_iommu(info->segment, info->bus,
3366                                             info->devfn))
3367                         found = 1;
3368         }
3369
3370         if (found == 0) {
3371                 unsigned long tmp_flags;
3372                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3373                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3374                 domain->iommu_count--;
3375                 domain_update_iommu_cap(domain);
3376                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3377         }
3378
3379         spin_unlock_irqrestore(&device_domain_lock, flags);
3380 }
3381
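/*
 * Detach every device from a virtual machine domain, dropping the
 * per-IOMMU accounting as each one goes away.
 */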
3382 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3383 {
3384         struct device_domain_info *info;
3385         struct intel_iommu *iommu;
3386         unsigned long flags1, flags2;
3387
3388         spin_lock_irqsave(&device_domain_lock, flags1);
3389         while (!list_empty(&domain->devices)) {
3390                 info = list_entry(domain->devices.next,
3391                         struct device_domain_info, link);
3392                 list_del(&info->link);
3393                 list_del(&info->global);
3394                 if (info->dev)
3395                         info->dev->dev.archdata.iommu = NULL;
3396
3397                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3398
3399                 iommu_disable_dev_iotlb(info);
3400                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3401                 iommu_detach_dev(iommu, info->bus, info->devfn);
3402                 iommu_detach_dependent_devices(iommu, info->dev);
3403
3404                 /* clear this iommu in iommu_bmp, update iommu count
3405                  * and capabilities
3406                  */
3407                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3408                 if (test_and_clear_bit(iommu->seq_id,
3409                                        &domain->iommu_bmp)) {
3410                         domain->iommu_count--;
3411                         domain_update_iommu_cap(domain);
3412                 }
3413                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3414
3415                 free_devinfo_mem(info);
3416                 spin_lock_irqsave(&device_domain_lock, flags1);
3417         }
3418         spin_unlock_irqrestore(&device_domain_lock, flags1);
3419 }
3420
3421 /* domain ids for virtual machine domains; never written into a context entry */
3422 static unsigned long vm_domid;
3423
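/*
 * Return the smallest adjusted guest address width supported by any
 * IOMMU this domain currently spans; it bounds the highest address
 * the domain is allowed to map.
 */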
3424 static int vm_domain_min_agaw(struct dmar_domain *domain)
3425 {
3426         int i;
3427         int min_agaw = domain->agaw;
3428
3429         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
3430                 if (min_agaw > g_iommus[i]->agaw)
3431                         min_agaw = g_iommus[i]->agaw;
3432         }
3433
3434         return min_agaw;
3435 }
3436
3437 static struct dmar_domain *iommu_alloc_vm_domain(void)
3438 {
3439         struct dmar_domain *domain;
3440
3441         domain = alloc_domain_mem();
3442         if (!domain)
3443                 return NULL;
3444
3445         domain->id = vm_domid++;
3446         domain->nid = -1;
3447         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3448         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3449
3450         return domain;
3451 }
3452
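/*
 * Initialize a domain created through the generic IOMMU API with a
 * caller-supplied guest address width and allocate its top-level
 * page directory.
 */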
3453 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3454 {
3455         int adjust_width;
3456
3457         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3458         spin_lock_init(&domain->iommu_lock);
3459
3460         domain_reserve_special_ranges(domain);
3461
3462         /* calculate AGAW */
3463         domain->gaw = guest_width;
3464         adjust_width = guestwidth_to_adjustwidth(guest_width);
3465         domain->agaw = width_to_agaw(adjust_width);
3466
3467         INIT_LIST_HEAD(&domain->devices);
3468
3469         domain->iommu_count = 0;
3470         domain->iommu_coherency = 0;
3471         domain->iommu_snooping = 0;
3472         domain->max_addr = 0;
3473         domain->nid = -1;
3474
3475         /* always allocate the top pgd */
3476         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3477         if (!domain->pgd)
3478                 return -ENOMEM;
3479         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3480         return 0;
3481 }
3482
3483 static void iommu_free_vm_domain(struct dmar_domain *domain)
3484 {
3485         unsigned long flags;
3486         struct dmar_drhd_unit *drhd;
3487         struct intel_iommu *iommu;
3488         unsigned long i;
3489         unsigned long ndomains;
3490
3491         for_each_drhd_unit(drhd) {
3492                 if (drhd->ignored)
3493                         continue;
3494                 iommu = drhd->iommu;
3495
3496                 ndomains = cap_ndoms(iommu->cap);
3497                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3498                         if (iommu->domains[i] == domain) {
3499                                 spin_lock_irqsave(&iommu->lock, flags);
3500                                 clear_bit(i, iommu->domain_ids);
3501                                 iommu->domains[i] = NULL;
3502                                 spin_unlock_irqrestore(&iommu->lock, flags);
3503                                 break;
3504                         }
3505                 }
3506         }
3507 }
3508
3509 static void vm_domain_exit(struct dmar_domain *domain)
3510 {
3511         /* Domain 0 is reserved, so don't process it */
3512         if (!domain)
3513                 return;
3514
3515         vm_domain_remove_all_dev_info(domain);
3516         /* destroy iovas */
3517         put_iova_domain(&domain->iovad);
3518
3519         /* clear ptes */
3520         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3521
3522         /* free page tables */
3523         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3524
3525         iommu_free_vm_domain(domain);
3526         free_domain_mem(domain);
3527 }
3528
3529 static int intel_iommu_domain_init(struct iommu_domain *domain)
3530 {
3531         struct dmar_domain *dmar_domain;
3532
3533         dmar_domain = iommu_alloc_vm_domain();
3534         if (!dmar_domain) {
3535                 printk(KERN_ERR
3536                         "intel_iommu_domain_init: failed to allocate dmar_domain\n");
3537                 return -ENOMEM;
3538         }
3539         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3540                 printk(KERN_ERR
3541                         "intel_iommu_domain_init() failed\n");
3542                 vm_domain_exit(dmar_domain);
3543                 return -ENOMEM;
3544         }
3545         domain->priv = dmar_domain;
3546
3547         return 0;
3548 }
3549
3550 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3551 {
3552         struct dmar_domain *dmar_domain = domain->priv;
3553
3554         domain->priv = NULL;
3555         vm_domain_exit(dmar_domain);
3556 }
3557
3558 static int intel_iommu_attach_device(struct iommu_domain *domain,
3559                                      struct device *dev)
3560 {
3561         struct dmar_domain *dmar_domain = domain->priv;
3562         struct pci_dev *pdev = to_pci_dev(dev);
3563         struct intel_iommu *iommu;
3564         int addr_width;
3565         u64 end;
3566
3567         /* normally pdev is not mapped */
3568         if (unlikely(domain_context_mapped(pdev))) {
3569                 struct dmar_domain *old_domain;
3570
3571                 old_domain = find_domain(pdev);
3572                 if (old_domain) {
3573                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3574                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3575                                 domain_remove_one_dev_info(old_domain, pdev);
3576                         else
3577                                 domain_remove_dev_info(old_domain);
3578                 }
3579         }
3580
3581         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3582                                 pdev->devfn);
3583         if (!iommu)
3584                 return -ENODEV;
3585
3586         /* check if this iommu agaw is sufficient for max mapped address */
3587         addr_width = agaw_to_width(iommu->agaw);
3588         end = DOMAIN_MAX_ADDR(addr_width);
3589         end = end & VTD_PAGE_MASK;
3590         if (end < dmar_domain->max_addr) {
3591                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3592                        "sufficient for the mapped address (%llx)\n",
3593                        __func__, iommu->agaw, dmar_domain->max_addr);
3594                 return -EFAULT;
3595         }
3596
3597         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3598 }
3599
3600 static void intel_iommu_detach_device(struct iommu_domain *domain,
3601                                       struct device *dev)
3602 {
3603         struct dmar_domain *dmar_domain = domain->priv;
3604         struct pci_dev *pdev = to_pci_dev(dev);
3605
3606         domain_remove_one_dev_info(dmar_domain, pdev);
3607 }
3608
3609 static int intel_iommu_map_range(struct iommu_domain *domain,
3610                                  unsigned long iova, phys_addr_t hpa,
3611                                  size_t size, int iommu_prot)
3612 {
3613         struct dmar_domain *dmar_domain = domain->priv;
3614         u64 max_addr;
3615         int addr_width;
3616         int prot = 0;
3617         int ret;
3618
3619         if (iommu_prot & IOMMU_READ)
3620                 prot |= DMA_PTE_READ;
3621         if (iommu_prot & IOMMU_WRITE)
3622                 prot |= DMA_PTE_WRITE;
3623         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3624                 prot |= DMA_PTE_SNP;
3625
3626         max_addr = iova + size;
3627         if (dmar_domain->max_addr < max_addr) {
3628                 int min_agaw;
3629                 u64 end;
3630
3631                 /* check if minimum agaw is sufficient for mapped address */
3632                 min_agaw = vm_domain_min_agaw(dmar_domain);
3633                 addr_width = agaw_to_width(min_agaw);
3634                 end = DOMAIN_MAX_ADDR(addr_width);
3635                 end = end & VTD_PAGE_MASK;
3636                 if (end < max_addr) {
3637                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3638                                "sufficient for the mapped address (%llx)\n",
3639                                __func__, min_agaw, max_addr);
3640                         return -EFAULT;
3641                 }
3642                 dmar_domain->max_addr = max_addr;
3643         }
3644         /* Round up size to the next multiple of VTD_PAGE_SIZE, if it and
3645            the low bits of hpa would take us onto the next page */
3646         size = aligned_nrpages(hpa, size);
3647         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3648                                  hpa >> VTD_PAGE_SHIFT, size, prot);
3649         return ret;
3650 }
3651
3652 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3653                                     unsigned long iova, size_t size)
3654 {
3655         struct dmar_domain *dmar_domain = domain->priv;
3656
3657         if (!size)
3658                 return;
3659
3660         dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3661                             (iova + size - 1) >> VTD_PAGE_SHIFT);
3662
3663         if (dmar_domain->max_addr == iova + size)
3664                 dmar_domain->max_addr = iova;
3665 }
3666
3667 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3668                                             unsigned long iova)
3669 {
3670         struct dmar_domain *dmar_domain = domain->priv;
3671         struct dma_pte *pte;
3672         u64 phys = 0;
3673
3674         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3675         if (pte)
3676                 phys = dma_pte_addr(pte);
3677
3678         return phys;
3679 }
3680
3681 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3682                                       unsigned long cap)
3683 {
3684         struct dmar_domain *dmar_domain = domain->priv;
3685
3686         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3687                 return dmar_domain->iommu_snooping;
3688
3689         return 0;
3690 }
3691
3692 static struct iommu_ops intel_iommu_ops = {
3693         .domain_init    = intel_iommu_domain_init,
3694         .domain_destroy = intel_iommu_domain_destroy,
3695         .attach_dev     = intel_iommu_attach_device,
3696         .detach_dev     = intel_iommu_detach_device,
3697         .map            = intel_iommu_map_range,
3698         .unmap          = intel_iommu_unmap_range,
3699         .iova_to_phys   = intel_iommu_iova_to_phys,
3700         .domain_has_cap = intel_iommu_domain_has_cap,
3701 };
3702
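/*
 * These callbacks back the generic IOMMU API used for device
 * assignment (e.g. by KVM).  As a rough sketch only -- the wrapper
 * names below are assumed to match include/linux/iommu.h in this
 * tree -- a consumer is expected to drive them roughly like this:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc();
 *
 *	if (dom && !iommu_attach_device(dom, &pdev->dev)) {
 *		iommu_map_range(dom, iova, hpa, size,
 *				IOMMU_READ | IOMMU_WRITE);
 *		...
 *		iommu_unmap_range(dom, iova, size);
 *		iommu_detach_device(dom, &pdev->dev);
 *	}
 *	if (dom)
 *		iommu_domain_free(dom);
 */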
3703 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3704 {
3705         /*
3706          * Mobile 4 Series Chipset neglects to set RWBF capability,
3707          * but needs it:
3708          */
3709         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3710         rwbf_quirk = 1;
3711 }
3712
3713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3714
3715 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3716    ISOCH DMAR unit for the Azalia sound device, but not give it any
3717    TLB entries, which causes it to deadlock. Check for that.  We do
3718    this in a function called from init_dmars(), instead of in a PCI
3719    quirk, because we don't want to print the obnoxious "BIOS broken"
3720    message if VT-d is actually disabled.
3721 */
3722 static void __init check_tylersburg_isoch(void)
3723 {
3724         struct pci_dev *pdev;
3725         uint32_t vtisochctrl;
3726
3727         /* If there's no Azalia in the system anyway, forget it. */
3728         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3729         if (!pdev)
3730                 return;
3731         pci_dev_put(pdev);
3732
3733         /* System Management Registers. Might be hidden, in which case
3734            we can't do the sanity check. But that's OK, because the
3735            known-broken BIOSes _don't_ actually hide it, so far. */
3736         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3737         if (!pdev)
3738                 return;
3739
3740         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3741                 pci_dev_put(pdev);
3742                 return;
3743         }
3744
3745         pci_dev_put(pdev);
3746
3747         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3748         if (vtisochctrl & 1)
3749                 return;
3750
3751         /* Drop all bits other than the number of TLB entries */
3752         vtisochctrl &= 0x1c;
3753
3754         /* If we have the recommended number of TLB entries (16), fine. */
3755         if (vtisochctrl == 0x10)
3756                 return;
3757
3758         /* Zero TLB entries? You get to ride the short bus to school. */
3759         if (!vtisochctrl) {
3760                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3761                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3762                      dmi_get_system_info(DMI_BIOS_VENDOR),
3763                      dmi_get_system_info(DMI_BIOS_VERSION),
3764                      dmi_get_system_info(DMI_PRODUCT_VERSION));
3765                 iommu_identity_mapping |= IDENTMAP_AZALIA;
3766                 return;
3767         }
3768
3769         printk(KERN_WARNING "DMAR: Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
3770                vtisochctrl);
3771 }