[pandora-kernel.git] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/syscore_ops.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
45
46 #define ROOT_SIZE               VTD_PAGE_SIZE
47 #define CONTEXT_SIZE            VTD_PAGE_SIZE
48
49 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
50 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
51 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
52
53 #define IOAPIC_RANGE_START      (0xfee00000)
54 #define IOAPIC_RANGE_END        (0xfeefffff)
55 #define IOVA_START_ADDR         (0x1000)
56
57 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
58
59 #define MAX_AGAW_WIDTH 64
60
61 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
62 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
63
64 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
65    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
66 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
67                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
68 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
69
70 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
71 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
72 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
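/*
 * Illustrative values, assuming 4KiB VT-d pages: with the default 48-bit
 * address width, __DOMAIN_MAX_PFN(48) is 2^36 - 1 and DOMAIN_MAX_ADDR(48)
 * is (2^36 - 1) << 12, the start of the topmost page below 2^48.
 * Likewise DMA_32BIT_PFN is IOVA_PFN(0xffffffff) == 0xfffff.
 */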
73
74 /* page table handling */
75 #define LEVEL_STRIDE            (9)
76 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
77
78 static inline int agaw_to_level(int agaw)
79 {
80         return agaw + 2;
81 }
82
83 static inline int agaw_to_width(int agaw)
84 {
85         return 30 + agaw * LEVEL_STRIDE;
86 }
87
88 static inline int width_to_agaw(int width)
89 {
90         return (width - 30) / LEVEL_STRIDE;
91 }
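/*
 * For reference, derived from the helpers above: agaw 0 is a 2-level,
 * 30-bit table; agaw 1 is 3-level/39-bit; agaw 2 is 4-level/48-bit;
 * agaw 3 is 5-level/57-bit. Each step adds one level and LEVEL_STRIDE
 * (9) bits of address.
 */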
92
93 static inline unsigned int level_to_offset_bits(int level)
94 {
95         return (level - 1) * LEVEL_STRIDE;
96 }
97
98 static inline int pfn_level_offset(unsigned long pfn, int level)
99 {
100         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
101 }
102
103 static inline unsigned long level_mask(int level)
104 {
105         return -1UL << level_to_offset_bits(level);
106 }
107
108 static inline unsigned long level_size(int level)
109 {
110         return 1UL << level_to_offset_bits(level);
111 }
112
113 static inline unsigned long align_to_level(unsigned long pfn, int level)
114 {
115         return (pfn + level_size(level) - 1) & level_mask(level);
116 }
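/*
 * Example: at level 2 a slot covers level_size(2) == 512 pfns, so
 * align_to_level(0x201, 2) rounds pfn 0x201 up to 0x400, the next
 * 512-pfn boundary.
 */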
117
118 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
119    are never going to work. */
120 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
121 {
122         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
123 }
124
125 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
126 {
127         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
128 }
129 static inline unsigned long page_to_dma_pfn(struct page *pg)
130 {
131         return mm_to_dma_pfn(page_to_pfn(pg));
132 }
133 static inline unsigned long virt_to_dma_pfn(void *p)
134 {
135         return page_to_dma_pfn(virt_to_page(p));
136 }
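/*
 * With 4KiB MM pages (PAGE_SHIFT == VTD_PAGE_SHIFT == 12) these
 * conversions are the identity; on a 64KiB-page kernel each mm pfn
 * corresponds to 16 consecutive dma pfns.
 */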
137
138 /* global iommu list, set NULL for ignored DMAR units */
139 static struct intel_iommu **g_iommus;
140
141 static void __init check_tylersburg_isoch(void);
142 static int rwbf_quirk;
143
144 /*
145  * 0: Present
146  * 1-11: Reserved
147  * 12-63: Context Ptr (12 - (haw-1))
148  * 64-127: Reserved
149  */
150 struct root_entry {
151         u64     val;
152         u64     rsvd1;
153 };
154 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
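/* With 4KiB pages and 16-byte entries that is 256 root entries, one per bus. */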
155 static inline bool root_present(struct root_entry *root)
156 {
157         return (root->val & 1);
158 }
159 static inline void set_root_present(struct root_entry *root)
160 {
161         root->val |= 1;
162 }
163 static inline void set_root_value(struct root_entry *root, unsigned long value)
164 {
165         root->val |= value & VTD_PAGE_MASK;
166 }
167
168 static inline struct context_entry *
169 get_context_addr_from_root(struct root_entry *root)
170 {
171         return (struct context_entry *)
172                 (root_present(root) ?
173                  phys_to_virt(root->val & VTD_PAGE_MASK) :
174                  NULL);
175 }
176
177 /*
178  * low 64 bits:
179  * 0: present
180  * 1: fault processing disable
181  * 2-3: translation type
182  * 12-63: address space root
183  * high 64 bits:
184  * 0-2: address width
185  * 3-6: avail
186  * 8-23: domain id
187  */
188 struct context_entry {
189         u64 lo;
190         u64 hi;
191 };
192
193 static inline bool context_present(struct context_entry *context)
194 {
195         return (context->lo & 1);
196 }
197 static inline void context_set_present(struct context_entry *context)
198 {
199         context->lo |= 1;
200 }
201
202 static inline void context_set_fault_enable(struct context_entry *context)
203 {
204         context->lo &= (((u64)-1) << 2) | 1;
205 }
206
207 static inline void context_set_translation_type(struct context_entry *context,
208                                                 unsigned long value)
209 {
210         context->lo &= (((u64)-1) << 4) | 3;
211         context->lo |= (value & 3) << 2;
212 }
213
214 static inline void context_set_address_root(struct context_entry *context,
215                                             unsigned long value)
216 {
217         context->lo |= value & VTD_PAGE_MASK;
218 }
219
220 static inline void context_set_address_width(struct context_entry *context,
221                                              unsigned long value)
222 {
223         context->hi |= value & 7;
224 }
225
226 static inline void context_set_domain_id(struct context_entry *context,
227                                          unsigned long value)
228 {
229         context->hi |= (value & ((1 << 16) - 1)) << 8;
230 }
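/* The domain id field is 16 bits (hi bits 8-23), so at most 64K domain ids. */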
231
232 static inline void context_clear_entry(struct context_entry *context)
233 {
234         context->lo = 0;
235         context->hi = 0;
236 }
237
238 /*
239  * 0: readable
240  * 1: writable
241  * 2-6: reserved
242  * 7: super page
243  * 8-10: available
244  * 11: snoop behavior
245  * 12-63: Host physical address
246  */
247 struct dma_pte {
248         u64 val;
249 };
250
251 static inline void dma_clear_pte(struct dma_pte *pte)
252 {
253         pte->val = 0;
254 }
255
256 static inline void dma_set_pte_readable(struct dma_pte *pte)
257 {
258         pte->val |= DMA_PTE_READ;
259 }
260
261 static inline void dma_set_pte_writable(struct dma_pte *pte)
262 {
263         pte->val |= DMA_PTE_WRITE;
264 }
265
266 static inline void dma_set_pte_snp(struct dma_pte *pte)
267 {
268         pte->val |= DMA_PTE_SNP;
269 }
270
271 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
272 {
273         pte->val = (pte->val & ~3) | (prot & 3);
274 }
275
276 static inline u64 dma_pte_addr(struct dma_pte *pte)
277 {
278 #ifdef CONFIG_64BIT
279         return pte->val & VTD_PAGE_MASK;
280 #else
281         /* Must have a full atomic 64-bit read */
282         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
283 #endif
284 }
285
286 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
287 {
288         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
289 }
290
291 static inline bool dma_pte_present(struct dma_pte *pte)
292 {
293         return (pte->val & 3) != 0;
294 }
295
296 static inline int first_pte_in_page(struct dma_pte *pte)
297 {
298         return !((unsigned long)pte & ~VTD_PAGE_MASK);
299 }
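/*
 * A page-table page holds VTD_PAGE_SIZE / sizeof(struct dma_pte) == 512
 * entries; first_pte_in_page() is true when the pointer is page-aligned,
 * i.e. it points at entry 0 of a table. Callers use it to notice that a
 * linear walk has stepped off the end of the previous table.
 */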
300
301 /*
302  * This domain is a static identity mapping domain.
303  *      1. This domain creates a static 1:1 mapping to all usable memory.
304  *      2. It maps to each iommu if successful.
305  *      3. Each iommu maps to this domain if successful.
306  */
307 static struct dmar_domain *si_domain;
308 static int hw_pass_through = 1;
309
310 /* devices under the same p2p bridge are owned in one domain */
311 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
312
313 /* domain represents a virtual machine; more than one device
314  * across iommus may be owned by one domain, e.g. a kvm guest.
315  */
316 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
317
318 /* si_domain contains multiple devices */
319 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
320
321 struct dmar_domain {
322         int     id;                     /* domain id */
323         int     nid;                    /* node id */
324         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
325
326         struct list_head devices;       /* all devices' list */
327         struct iova_domain iovad;       /* iova's that belong to this domain */
328
329         struct dma_pte  *pgd;           /* virtual address */
330         int             gaw;            /* max guest address width */
331
332         /* adjusted guest address width, 0 is level 2 30-bit */
333         int             agaw;
334
335         int             flags;          /* flags to find out type of domain */
336
337         int             iommu_coherency;/* indicate coherency of iommu access */
338         int             iommu_snooping; /* indicate snooping control feature*/
339         int             iommu_count;    /* reference count of iommu */
340         spinlock_t      iommu_lock;     /* protect iommu set in domain */
341         u64             max_addr;       /* maximum mapped address */
342 };
343
344 /* PCI domain-device relationship */
345 struct device_domain_info {
346         struct list_head link;  /* link to domain siblings */
347         struct list_head global; /* link to global list */
348         int segment;            /* PCI domain */
349         u8 bus;                 /* PCI bus number */
350         u8 devfn;               /* PCI devfn number */
351         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
352         struct intel_iommu *iommu; /* IOMMU used by this device */
353         struct dmar_domain *domain; /* pointer to domain */
354 };
355
356 static void flush_unmaps_timeout(unsigned long data);
357
358 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
359
360 #define HIGH_WATER_MARK 250
361 struct deferred_flush_tables {
362         int next;
363         struct iova *iova[HIGH_WATER_MARK];
364         struct dmar_domain *domain[HIGH_WATER_MARK];
365 };
366
367 static struct deferred_flush_tables *deferred_flush;
368
369 /* bitmap for indexing intel_iommus */
370 static int g_num_of_iommus;
371
372 static DEFINE_SPINLOCK(async_umap_flush_lock);
373 static LIST_HEAD(unmaps_to_do);
374
375 static int timer_on;
376 static long list_size;
377
378 static void domain_remove_dev_info(struct dmar_domain *domain);
379
380 #ifdef CONFIG_DMAR_DEFAULT_ON
381 int dmar_disabled = 0;
382 #else
383 int dmar_disabled = 1;
384 #endif /*CONFIG_DMAR_DEFAULT_ON*/
385
386 static int dmar_map_gfx = 1;
387 static int dmar_forcedac;
388 static int intel_iommu_strict;
389
390 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
391 static DEFINE_SPINLOCK(device_domain_lock);
392 static LIST_HEAD(device_domain_list);
393
394 static struct iommu_ops intel_iommu_ops;
395
396 static int __init intel_iommu_setup(char *str)
397 {
398         if (!str)
399                 return -EINVAL;
400         while (*str) {
401                 if (!strncmp(str, "on", 2)) {
402                         dmar_disabled = 0;
403                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
404                 } else if (!strncmp(str, "off", 3)) {
405                         dmar_disabled = 1;
406                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
407                 } else if (!strncmp(str, "igfx_off", 8)) {
408                         dmar_map_gfx = 0;
409                         printk(KERN_INFO
410                                 "Intel-IOMMU: disable GFX device mapping\n");
411                 } else if (!strncmp(str, "forcedac", 8)) {
412                         printk(KERN_INFO
413                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
414                         dmar_forcedac = 1;
415                 } else if (!strncmp(str, "strict", 6)) {
416                         printk(KERN_INFO
417                                 "Intel-IOMMU: disable batched IOTLB flush\n");
418                         intel_iommu_strict = 1;
419                 }
420
421                 str += strcspn(str, ",");
422                 while (*str == ',')
423                         str++;
424         }
425         return 0;
426 }
427 __setup("intel_iommu=", intel_iommu_setup);
428
429 static struct kmem_cache *iommu_domain_cache;
430 static struct kmem_cache *iommu_devinfo_cache;
431 static struct kmem_cache *iommu_iova_cache;
432
433 static inline void *alloc_pgtable_page(int node)
434 {
435         struct page *page;
436         void *vaddr = NULL;
437
438         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
439         if (page)
440                 vaddr = page_address(page);
441         return vaddr;
442 }
443
444 static inline void free_pgtable_page(void *vaddr)
445 {
446         free_page((unsigned long)vaddr);
447 }
448
449 static inline void *alloc_domain_mem(void)
450 {
451         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
452 }
453
454 static void free_domain_mem(void *vaddr)
455 {
456         kmem_cache_free(iommu_domain_cache, vaddr);
457 }
458
459 static inline void * alloc_devinfo_mem(void)
460 {
461         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
462 }
463
464 static inline void free_devinfo_mem(void *vaddr)
465 {
466         kmem_cache_free(iommu_devinfo_cache, vaddr);
467 }
468
469 struct iova *alloc_iova_mem(void)
470 {
471         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
472 }
473
474 void free_iova_mem(struct iova *iova)
475 {
476         kmem_cache_free(iommu_iova_cache, iova);
477 }
478
479
480 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
481 {
482         unsigned long sagaw;
483         int agaw = -1;
484
485         sagaw = cap_sagaw(iommu->cap);
486         for (agaw = width_to_agaw(max_gaw);
487              agaw >= 0; agaw--) {
488                 if (test_bit(agaw, &sagaw))
489                         break;
490         }
491
492         return agaw;
493 }
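/*
 * Example: if SAGAW advertises only 4-level (48-bit) support, bit 2 is
 * set, so a max_gaw of 48 starts the scan at width_to_agaw(48) == 2 and
 * returns 2; a max_gaw of 64 would start at agaw 3 and scan downwards
 * until it hits a supported bit.
 */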
494
495 /*
496  * Calculate max SAGAW for each iommu.
497  */
498 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
499 {
500         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
501 }
502
503 /*
504  * Calculate agaw for each iommu.
505  * "SAGAW" may be different across iommus; use a default agaw, and
506  * fall back to a smaller supported agaw for iommus that don't support the default.
507  */
508 int iommu_calculate_agaw(struct intel_iommu *iommu)
509 {
510         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
511 }
512
513 /* This function only returns a single iommu in a domain */
514 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
515 {
516         int iommu_id;
517
518         /* si_domain and vm domain should not get here. */
519         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
520         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
521
522         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
523         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
524                 return NULL;
525
526         return g_iommus[iommu_id];
527 }
528
529 static void domain_update_iommu_coherency(struct dmar_domain *domain)
530 {
531         int i;
532
533         domain->iommu_coherency = 1;
534
535         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
536                 if (!ecap_coherent(g_iommus[i]->ecap)) {
537                         domain->iommu_coherency = 0;
538                         break;
539                 }
540         }
541 }
542
543 static void domain_update_iommu_snooping(struct dmar_domain *domain)
544 {
545         int i;
546
547         domain->iommu_snooping = 1;
548
549         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
550                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
551                         domain->iommu_snooping = 0;
552                         break;
553                 }
554         }
555 }
556
557 /* Some capabilities may be different across iommus */
558 static void domain_update_iommu_cap(struct dmar_domain *domain)
559 {
560         domain_update_iommu_coherency(domain);
561         domain_update_iommu_snooping(domain);
562 }
563
564 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
565 {
566         struct dmar_drhd_unit *drhd = NULL;
567         int i;
568
569         for_each_drhd_unit(drhd) {
570                 if (drhd->ignored)
571                         continue;
572                 if (segment != drhd->segment)
573                         continue;
574
575                 for (i = 0; i < drhd->devices_cnt; i++) {
576                         if (drhd->devices[i] &&
577                             drhd->devices[i]->bus->number == bus &&
578                             drhd->devices[i]->devfn == devfn)
579                                 return drhd->iommu;
580                         if (drhd->devices[i] &&
581                             drhd->devices[i]->subordinate &&
582                             drhd->devices[i]->subordinate->number <= bus &&
583                             drhd->devices[i]->subordinate->subordinate >= bus)
584                                 return drhd->iommu;
585                 }
586
587                 if (drhd->include_all)
588                         return drhd->iommu;
589         }
590
591         return NULL;
592 }
593
594 static void domain_flush_cache(struct dmar_domain *domain,
595                                void *addr, int size)
596 {
597         if (!domain->iommu_coherency)
598                 clflush_cache_range(addr, size);
599 }
600
601 /* Gets context entry for a given bus and devfn */
602 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
603                 u8 bus, u8 devfn)
604 {
605         struct root_entry *root;
606         struct context_entry *context;
607         unsigned long phy_addr;
608         unsigned long flags;
609
610         spin_lock_irqsave(&iommu->lock, flags);
611         root = &iommu->root_entry[bus];
612         context = get_context_addr_from_root(root);
613         if (!context) {
614                 context = (struct context_entry *)
615                                 alloc_pgtable_page(iommu->node);
616                 if (!context) {
617                         spin_unlock_irqrestore(&iommu->lock, flags);
618                         return NULL;
619                 }
620                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
621                 phy_addr = virt_to_phys((void *)context);
622                 set_root_value(root, phy_addr);
623                 set_root_present(root);
624                 __iommu_flush_cache(iommu, root, sizeof(*root));
625         }
626         spin_unlock_irqrestore(&iommu->lock, flags);
627         return &context[devfn];
628 }
629
630 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
631 {
632         struct root_entry *root;
633         struct context_entry *context;
634         int ret;
635         unsigned long flags;
636
637         spin_lock_irqsave(&iommu->lock, flags);
638         root = &iommu->root_entry[bus];
639         context = get_context_addr_from_root(root);
640         if (!context) {
641                 ret = 0;
642                 goto out;
643         }
644         ret = context_present(&context[devfn]);
645 out:
646         spin_unlock_irqrestore(&iommu->lock, flags);
647         return ret;
648 }
649
650 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
651 {
652         struct root_entry *root;
653         struct context_entry *context;
654         unsigned long flags;
655
656         spin_lock_irqsave(&iommu->lock, flags);
657         root = &iommu->root_entry[bus];
658         context = get_context_addr_from_root(root);
659         if (context) {
660                 context_clear_entry(&context[devfn]);
661                 __iommu_flush_cache(iommu, &context[devfn],
662                         sizeof(*context));
663         }
664         spin_unlock_irqrestore(&iommu->lock, flags);
665 }
666
667 static void free_context_table(struct intel_iommu *iommu)
668 {
669         struct root_entry *root;
670         int i;
671         unsigned long flags;
672         struct context_entry *context;
673
674         spin_lock_irqsave(&iommu->lock, flags);
675         if (!iommu->root_entry) {
676                 goto out;
677         }
678         for (i = 0; i < ROOT_ENTRY_NR; i++) {
679                 root = &iommu->root_entry[i];
680                 context = get_context_addr_from_root(root);
681                 if (context)
682                         free_pgtable_page(context);
683         }
684         free_pgtable_page(iommu->root_entry);
685         iommu->root_entry = NULL;
686 out:
687         spin_unlock_irqrestore(&iommu->lock, flags);
688 }
689
690 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
691                                       unsigned long pfn)
692 {
693         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
694         struct dma_pte *parent, *pte = NULL;
695         int level = agaw_to_level(domain->agaw);
696         int offset;
697
698         BUG_ON(!domain->pgd);
699         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
700         parent = domain->pgd;
701
702         while (level > 0) {
703                 void *tmp_page;
704
705                 offset = pfn_level_offset(pfn, level);
706                 pte = &parent[offset];
707                 if (level == 1)
708                         break;
709
710                 if (!dma_pte_present(pte)) {
711                         uint64_t pteval;
712
713                         tmp_page = alloc_pgtable_page(domain->nid);
714
715                         if (!tmp_page)
716                                 return NULL;
717
718                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
719                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
720                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
721                                 /* Someone else set it while we were thinking; use theirs. */
722                                 free_pgtable_page(tmp_page);
723                         } else {
724                                 dma_pte_addr(pte);
725                                 domain_flush_cache(domain, pte, sizeof(*pte));
726                         }
727                 }
728                 parent = phys_to_virt(dma_pte_addr(pte));
729                 level--;
730         }
731
732         return pte;
733 }
734
735 /* return address's pte at specific level */
736 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
737                                          unsigned long pfn,
738                                          int level)
739 {
740         struct dma_pte *parent, *pte = NULL;
741         int total = agaw_to_level(domain->agaw);
742         int offset;
743
744         parent = domain->pgd;
745         while (level <= total) {
746                 offset = pfn_level_offset(pfn, total);
747                 pte = &parent[offset];
748                 if (level == total)
749                         return pte;
750
751                 if (!dma_pte_present(pte))
752                         break;
753                 parent = phys_to_virt(dma_pte_addr(pte));
754                 total--;
755         }
756         return NULL;
757 }
758
759 /* clear last level pte; a tlb flush should follow */
760 static void dma_pte_clear_range(struct dmar_domain *domain,
761                                 unsigned long start_pfn,
762                                 unsigned long last_pfn)
763 {
764         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
765         struct dma_pte *first_pte, *pte;
766
767         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
768         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
769         BUG_ON(start_pfn > last_pfn);
770
771         /* we don't need lock here; nobody else touches the iova range */
772         do {
773                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
774                 if (!pte) {
775                         start_pfn = align_to_level(start_pfn + 1, 2);
776                         continue;
777                 }
778                 do { 
779                         dma_clear_pte(pte);
780                         start_pfn++;
781                         pte++;
782                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
783
784                 domain_flush_cache(domain, first_pte,
785                                    (void *)pte - (void *)first_pte);
786
787         } while (start_pfn && start_pfn <= last_pfn);
788 }
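/*
 * Example of the walk above: clearing pfns 510..513 takes two passes;
 * entries 510-511 of one leaf table are cleared and flushed, then
 * first_pte_in_page() fires and the outer loop re-walks to the next
 * leaf table for pfns 512-513.
 */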
789
790 /* free page table pages. last level pte should already be cleared */
791 static void dma_pte_free_pagetable(struct dmar_domain *domain,
792                                    unsigned long start_pfn,
793                                    unsigned long last_pfn)
794 {
795         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
796         struct dma_pte *first_pte, *pte;
797         int total = agaw_to_level(domain->agaw);
798         int level;
799         unsigned long tmp;
800
801         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
802         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
803         BUG_ON(start_pfn > last_pfn);
804
805         /* We don't need lock here; nobody else touches the iova range */
806         level = 2;
807         while (level <= total) {
808                 tmp = align_to_level(start_pfn, level);
809
810                 /* If we can't even clear one PTE at this level, we're done */
811                 if (tmp + level_size(level) - 1 > last_pfn)
812                         return;
813
814                 do {
815                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
816                         if (!pte) {
817                                 tmp = align_to_level(tmp + 1, level + 1);
818                                 continue;
819                         }
820                         do {
821                                 if (dma_pte_present(pte)) {
822                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
823                                         dma_clear_pte(pte);
824                                 }
825                                 pte++;
826                                 tmp += level_size(level);
827                         } while (!first_pte_in_page(pte) &&
828                                  tmp + level_size(level) - 1 <= last_pfn);
829
830                         domain_flush_cache(domain, first_pte,
831                                            (void *)pte - (void *)first_pte);
832                         
833                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
834                 level++;
835         }
836         /* free pgd */
837         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
838                 free_pgtable_page(domain->pgd);
839                 domain->pgd = NULL;
840         }
841 }
842
843 /* iommu handling */
844 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
845 {
846         struct root_entry *root;
847         unsigned long flags;
848
849         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
850         if (!root)
851                 return -ENOMEM;
852
853         __iommu_flush_cache(iommu, root, ROOT_SIZE);
854
855         spin_lock_irqsave(&iommu->lock, flags);
856         iommu->root_entry = root;
857         spin_unlock_irqrestore(&iommu->lock, flags);
858
859         return 0;
860 }
861
862 static void iommu_set_root_entry(struct intel_iommu *iommu)
863 {
864         void *addr;
865         u32 sts;
866         unsigned long flag;
867
868         addr = iommu->root_entry;
869
870         spin_lock_irqsave(&iommu->register_lock, flag);
871         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
872
873         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
874
875         /* Make sure hardware completes it */
876         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
877                       readl, (sts & DMA_GSTS_RTPS), sts);
878
879         spin_unlock_irqrestore(&iommu->register_lock, flag);
880 }
881
882 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
883 {
884         u32 val;
885         unsigned long flag;
886
887         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
888                 return;
889
890         spin_lock_irqsave(&iommu->register_lock, flag);
891         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
892
893         /* Make sure hardware completes it */
894         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
895                       readl, (!(val & DMA_GSTS_WBFS)), val);
896
897         spin_unlock_irqrestore(&iommu->register_lock, flag);
898 }
899
900 /* return value determines if we need a write buffer flush */
901 static void __iommu_flush_context(struct intel_iommu *iommu,
902                                   u16 did, u16 source_id, u8 function_mask,
903                                   u64 type)
904 {
905         u64 val = 0;
906         unsigned long flag;
907
908         switch (type) {
909         case DMA_CCMD_GLOBAL_INVL:
910                 val = DMA_CCMD_GLOBAL_INVL;
911                 break;
912         case DMA_CCMD_DOMAIN_INVL:
913                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
914                 break;
915         case DMA_CCMD_DEVICE_INVL:
916                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
917                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
918                 break;
919         default:
920                 BUG();
921         }
922         val |= DMA_CCMD_ICC;
923
924         spin_lock_irqsave(&iommu->register_lock, flag);
925         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
926
927         /* Make sure hardware completes it */
928         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
929                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
930
931         spin_unlock_irqrestore(&iommu->register_lock, flag);
932 }
933
934 /* return value determines if we need a write buffer flush */
935 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
936                                 u64 addr, unsigned int size_order, u64 type)
937 {
938         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
939         u64 val = 0, val_iva = 0;
940         unsigned long flag;
941
942         switch (type) {
943         case DMA_TLB_GLOBAL_FLUSH:
944                 /* global flush doesn't need to set IVA_REG */
945                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
946                 break;
947         case DMA_TLB_DSI_FLUSH:
948                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
949                 break;
950         case DMA_TLB_PSI_FLUSH:
951                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
952                 /* Note: always flush non-leaf currently */
953                 val_iva = size_order | addr;
954                 break;
955         default:
956                 BUG();
957         }
958         /* Note: set drain read/write */
959 #if 0
960         /*
961          * This is probably meant to be super secure. Looks like we can
962          * ignore it without any impact.
963          */
964         if (cap_read_drain(iommu->cap))
965                 val |= DMA_TLB_READ_DRAIN;
966 #endif
967         if (cap_write_drain(iommu->cap))
968                 val |= DMA_TLB_WRITE_DRAIN;
969
970         spin_lock_irqsave(&iommu->register_lock, flag);
971         /* Note: Only uses first TLB reg currently */
972         if (val_iva)
973                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
974         dmar_writeq(iommu->reg + tlb_offset + 8, val);
975
976         /* Make sure hardware completes it */
977         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
978                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
979
980         spin_unlock_irqrestore(&iommu->register_lock, flag);
981
982         /* check IOTLB invalidation granularity */
983         if (DMA_TLB_IAIG(val) == 0)
984                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
985         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
986                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
987                         (unsigned long long)DMA_TLB_IIRG(type),
988                         (unsigned long long)DMA_TLB_IAIG(val));
989 }
990
991 static struct device_domain_info *iommu_support_dev_iotlb(
992         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
993 {
994         int found = 0;
995         unsigned long flags;
996         struct device_domain_info *info;
997         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
998
999         if (!ecap_dev_iotlb_support(iommu->ecap))
1000                 return NULL;
1001
1002         if (!iommu->qi)
1003                 return NULL;
1004
1005         spin_lock_irqsave(&device_domain_lock, flags);
1006         list_for_each_entry(info, &domain->devices, link)
1007                 if (info->bus == bus && info->devfn == devfn) {
1008                         found = 1;
1009                         break;
1010                 }
1011         spin_unlock_irqrestore(&device_domain_lock, flags);
1012
1013         if (!found || !info->dev)
1014                 return NULL;
1015
1016         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1017                 return NULL;
1018
1019         if (!dmar_find_matched_atsr_unit(info->dev))
1020                 return NULL;
1021
1022         info->iommu = iommu;
1023
1024         return info;
1025 }
1026
1027 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1028 {
1029         if (!info)
1030                 return;
1031
1032         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1033 }
1034
1035 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1036 {
1037         if (!info->dev || !pci_ats_enabled(info->dev))
1038                 return;
1039
1040         pci_disable_ats(info->dev);
1041 }
1042
1043 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1044                                   u64 addr, unsigned mask)
1045 {
1046         u16 sid, qdep;
1047         unsigned long flags;
1048         struct device_domain_info *info;
1049
1050         spin_lock_irqsave(&device_domain_lock, flags);
1051         list_for_each_entry(info, &domain->devices, link) {
1052                 if (!info->dev || !pci_ats_enabled(info->dev))
1053                         continue;
1054
1055                 sid = info->bus << 8 | info->devfn;
1056                 qdep = pci_ats_queue_depth(info->dev);
1057                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1058         }
1059         spin_unlock_irqrestore(&device_domain_lock, flags);
1060 }
1061
1062 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1063                                   unsigned long pfn, unsigned int pages, int map)
1064 {
1065         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1066         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1067
1068         BUG_ON(pages == 0);
1069
1070         /*
1071          * Fallback to domain selective flush if no PSI support or the size is
1072          * too big.
1073          * PSI requires page size to be 2 ^ x, and the base address is naturally
1074          * aligned to the size
1075          */
1076         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1077                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1078                                                 DMA_TLB_DSI_FLUSH);
1079         else
1080                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1081                                                 DMA_TLB_PSI_FLUSH);
1082
1083         /*
1084          * In caching mode, changes of pages from non-present to present require
1085          * flush. However, device IOTLB doesn't need to be flushed in this case.
1086          */
1087         if (!cap_caching_mode(iommu->cap) || !map)
1088                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1089 }
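/*
 * Example: for pages == 9 the mask computed above is ilog2(16) == 4, so
 * the PSI flush covers 16 pages starting at addr; per the comment above,
 * hardware expects that base to be naturally aligned to the flushed size.
 */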
1090
1091 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1092 {
1093         u32 pmen;
1094         unsigned long flags;
1095
1096         spin_lock_irqsave(&iommu->register_lock, flags);
1097         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1098         pmen &= ~DMA_PMEN_EPM;
1099         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1100
1101         /* wait for the protected region status bit to clear */
1102         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1103                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1104
1105         spin_unlock_irqrestore(&iommu->register_lock, flags);
1106 }
1107
1108 static int iommu_enable_translation(struct intel_iommu *iommu)
1109 {
1110         u32 sts;
1111         unsigned long flags;
1112
1113         spin_lock_irqsave(&iommu->register_lock, flags);
1114         iommu->gcmd |= DMA_GCMD_TE;
1115         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1116
1117         /* Make sure hardware completes it */
1118         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1119                       readl, (sts & DMA_GSTS_TES), sts);
1120
1121         spin_unlock_irqrestore(&iommu->register_lock, flags);
1122         return 0;
1123 }
1124
1125 static int iommu_disable_translation(struct intel_iommu *iommu)
1126 {
1127         u32 sts;
1128         unsigned long flag;
1129
1130         spin_lock_irqsave(&iommu->register_lock, flag);
1131         iommu->gcmd &= ~DMA_GCMD_TE;
1132         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1133
1134         /* Make sure hardware completes it */
1135         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1136                       readl, (!(sts & DMA_GSTS_TES)), sts);
1137
1138         spin_unlock_irqrestore(&iommu->register_lock, flag);
1139         return 0;
1140 }
1141
1142
1143 static int iommu_init_domains(struct intel_iommu *iommu)
1144 {
1145         unsigned long ndomains;
1146         unsigned long nlongs;
1147
1148         ndomains = cap_ndoms(iommu->cap);
1149         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1150                         ndomains);
1151         nlongs = BITS_TO_LONGS(ndomains);
1152
1153         spin_lock_init(&iommu->lock);
1154
1155         /* TBD: there might be 64K domains,
1156          * consider other allocation for future chip
1157          */
1158         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1159         if (!iommu->domain_ids) {
1160                 printk(KERN_ERR "Allocating domain id array failed\n");
1161                 return -ENOMEM;
1162         }
1163         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1164                         GFP_KERNEL);
1165         if (!iommu->domains) {
1166                 printk(KERN_ERR "Allocating domain array failed\n");
1167                 return -ENOMEM;
1168         }
1169
1170         /*
1171          * If Caching mode is set, then invalid translations are tagged
1172          * with domain id 0. Hence we need to pre-allocate it.
1173          */
1174         if (cap_caching_mode(iommu->cap))
1175                 set_bit(0, iommu->domain_ids);
1176         return 0;
1177 }
1178
1179
1180 static void domain_exit(struct dmar_domain *domain);
1181 static void vm_domain_exit(struct dmar_domain *domain);
1182
1183 void free_dmar_iommu(struct intel_iommu *iommu)
1184 {
1185         struct dmar_domain *domain;
1186         int i;
1187         unsigned long flags;
1188
1189         if ((iommu->domains) && (iommu->domain_ids)) {
1190                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1191                         domain = iommu->domains[i];
1192                         clear_bit(i, iommu->domain_ids);
1193
1194                         spin_lock_irqsave(&domain->iommu_lock, flags);
1195                         if (--domain->iommu_count == 0) {
1196                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1197                                         vm_domain_exit(domain);
1198                                 else
1199                                         domain_exit(domain);
1200                         }
1201                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1202                 }
1203         }
1204
1205         if (iommu->gcmd & DMA_GCMD_TE)
1206                 iommu_disable_translation(iommu);
1207
1208         if (iommu->irq) {
1209                 irq_set_handler_data(iommu->irq, NULL);
1210                 /* This will mask the irq */
1211                 free_irq(iommu->irq, iommu);
1212                 destroy_irq(iommu->irq);
1213         }
1214
1215         kfree(iommu->domains);
1216         kfree(iommu->domain_ids);
1217
1218         g_iommus[iommu->seq_id] = NULL;
1219
1220         /* if all iommus are freed, free g_iommus */
1221         for (i = 0; i < g_num_of_iommus; i++) {
1222                 if (g_iommus[i])
1223                         break;
1224         }
1225
1226         if (i == g_num_of_iommus)
1227                 kfree(g_iommus);
1228
1229         /* free context mapping */
1230         free_context_table(iommu);
1231 }
1232
1233 static struct dmar_domain *alloc_domain(void)
1234 {
1235         struct dmar_domain *domain;
1236
1237         domain = alloc_domain_mem();
1238         if (!domain)
1239                 return NULL;
1240
1241         domain->nid = -1;
1242         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1243         domain->flags = 0;
1244
1245         return domain;
1246 }
1247
1248 static int iommu_attach_domain(struct dmar_domain *domain,
1249                                struct intel_iommu *iommu)
1250 {
1251         int num;
1252         unsigned long ndomains;
1253         unsigned long flags;
1254
1255         ndomains = cap_ndoms(iommu->cap);
1256
1257         spin_lock_irqsave(&iommu->lock, flags);
1258
1259         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1260         if (num >= ndomains) {
1261                 spin_unlock_irqrestore(&iommu->lock, flags);
1262                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1263                 return -ENOMEM;
1264         }
1265
1266         domain->id = num;
1267         set_bit(num, iommu->domain_ids);
1268         set_bit(iommu->seq_id, &domain->iommu_bmp);
1269         iommu->domains[num] = domain;
1270         spin_unlock_irqrestore(&iommu->lock, flags);
1271
1272         return 0;
1273 }
1274
1275 static void iommu_detach_domain(struct dmar_domain *domain,
1276                                 struct intel_iommu *iommu)
1277 {
1278         unsigned long flags;
1279         int num, ndomains;
1280         int found = 0;
1281
1282         spin_lock_irqsave(&iommu->lock, flags);
1283         ndomains = cap_ndoms(iommu->cap);
1284         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1285                 if (iommu->domains[num] == domain) {
1286                         found = 1;
1287                         break;
1288                 }
1289         }
1290
1291         if (found) {
1292                 clear_bit(num, iommu->domain_ids);
1293                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1294                 iommu->domains[num] = NULL;
1295         }
1296         spin_unlock_irqrestore(&iommu->lock, flags);
1297 }
1298
1299 static struct iova_domain reserved_iova_list;
1300 static struct lock_class_key reserved_rbtree_key;
1301
1302 static void dmar_init_reserved_ranges(void)
1303 {
1304         struct pci_dev *pdev = NULL;
1305         struct iova *iova;
1306         int i;
1307
1308         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1309
1310         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1311                 &reserved_rbtree_key);
1312
1313         /* IOAPIC ranges shouldn't be accessed by DMA */
1314         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1315                 IOVA_PFN(IOAPIC_RANGE_END));
1316         if (!iova)
1317                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1318
1319         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1320         for_each_pci_dev(pdev) {
1321                 struct resource *r;
1322
1323                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1324                         r = &pdev->resource[i];
1325                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1326                                 continue;
1327                         iova = reserve_iova(&reserved_iova_list,
1328                                             IOVA_PFN(r->start),
1329                                             IOVA_PFN(r->end));
1330                         if (!iova)
1331                                 printk(KERN_ERR "Reserve iova failed\n");
1332                 }
1333         }
1334
1335 }
1336
1337 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1338 {
1339         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1340 }
1341
1342 static inline int guestwidth_to_adjustwidth(int gaw)
1343 {
1344         int agaw;
1345         int r = (gaw - 12) % 9;
1346
1347         if (r == 0)
1348                 agaw = gaw;
1349         else
1350                 agaw = gaw + 9 - r;
1351         if (agaw > 64)
1352                 agaw = 64;
1353         return agaw;
1354 }
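/*
 * Examples: a guest width of 48 is 12 plus a multiple of 9 and is
 * returned unchanged; 36 is rounded up to 39, and 40 is rounded up to
 * 48, the next width the page-table format can represent.
 */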
1355
1356 static int domain_init(struct dmar_domain *domain, int guest_width)
1357 {
1358         struct intel_iommu *iommu;
1359         int adjust_width, agaw;
1360         unsigned long sagaw;
1361
1362         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1363         spin_lock_init(&domain->iommu_lock);
1364
1365         domain_reserve_special_ranges(domain);
1366
1367         /* calculate AGAW */
1368         iommu = domain_get_iommu(domain);
1369         if (guest_width > cap_mgaw(iommu->cap))
1370                 guest_width = cap_mgaw(iommu->cap);
1371         domain->gaw = guest_width;
1372         adjust_width = guestwidth_to_adjustwidth(guest_width);
1373         agaw = width_to_agaw(adjust_width);
1374         sagaw = cap_sagaw(iommu->cap);
1375         if (!test_bit(agaw, &sagaw)) {
1376                 /* hardware doesn't support it, choose a bigger one */
1377                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1378                 agaw = find_next_bit(&sagaw, 5, agaw);
1379                 if (agaw >= 5)
1380                         return -ENODEV;
1381         }
1382         domain->agaw = agaw;
1383         INIT_LIST_HEAD(&domain->devices);
1384
1385         if (ecap_coherent(iommu->ecap))
1386                 domain->iommu_coherency = 1;
1387         else
1388                 domain->iommu_coherency = 0;
1389
1390         if (ecap_sc_support(iommu->ecap))
1391                 domain->iommu_snooping = 1;
1392         else
1393                 domain->iommu_snooping = 0;
1394
1395         domain->iommu_count = 1;
1396         domain->nid = iommu->node;
1397
1398         /* always allocate the top pgd */
1399         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1400         if (!domain->pgd)
1401                 return -ENOMEM;
1402         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1403         return 0;
1404 }
1405
1406 static void domain_exit(struct dmar_domain *domain)
1407 {
1408         struct dmar_drhd_unit *drhd;
1409         struct intel_iommu *iommu;
1410
1411         /* Domain 0 is reserved, so don't process it */
1412         if (!domain)
1413                 return;
1414
1415         domain_remove_dev_info(domain);
1416         /* destroy iovas */
1417         put_iova_domain(&domain->iovad);
1418
1419         /* clear ptes */
1420         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1421
1422         /* free page tables */
1423         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1424
1425         for_each_active_iommu(iommu, drhd)
1426                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1427                         iommu_detach_domain(domain, iommu);
1428
1429         free_domain_mem(domain);
1430 }
1431
1432 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1433                                  u8 bus, u8 devfn, int translation)
1434 {
1435         struct context_entry *context;
1436         unsigned long flags;
1437         struct intel_iommu *iommu;
1438         struct dma_pte *pgd;
1439         unsigned long num;
1440         unsigned long ndomains;
1441         int id;
1442         int agaw;
1443         struct device_domain_info *info = NULL;
1444
1445         pr_debug("Set context mapping for %02x:%02x.%d\n",
1446                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1447
1448         BUG_ON(!domain->pgd);
1449         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1450                translation != CONTEXT_TT_MULTI_LEVEL);
1451
1452         iommu = device_to_iommu(segment, bus, devfn);
1453         if (!iommu)
1454                 return -ENODEV;
1455
1456         context = device_to_context_entry(iommu, bus, devfn);
1457         if (!context)
1458                 return -ENOMEM;
1459         spin_lock_irqsave(&iommu->lock, flags);
1460         if (context_present(context)) {
1461                 spin_unlock_irqrestore(&iommu->lock, flags);
1462                 return 0;
1463         }
1464
1465         id = domain->id;
1466         pgd = domain->pgd;
1467
1468         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1469             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1470                 int found = 0;
1471
1472                 /* find an available domain id for this device in iommu */
1473                 ndomains = cap_ndoms(iommu->cap);
1474                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1475                         if (iommu->domains[num] == domain) {
1476                                 id = num;
1477                                 found = 1;
1478                                 break;
1479                         }
1480                 }
1481
1482                 if (found == 0) {
1483                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1484                         if (num >= ndomains) {
1485                                 spin_unlock_irqrestore(&iommu->lock, flags);
1486                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1487                                 return -EFAULT;
1488                         }
1489
1490                         set_bit(num, iommu->domain_ids);
1491                         iommu->domains[num] = domain;
1492                         id = num;
1493                 }
1494
1495                 /* Skip top levels of page tables for an
1496                  * iommu which has a smaller agaw than the default.
1497                  * Unnecessary for PT mode.
1498                  */
1499                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1500                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1501                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1502                                 if (!dma_pte_present(pgd)) {
1503                                         spin_unlock_irqrestore(&iommu->lock, flags);
1504                                         return -ENOMEM;
1505                                 }
1506                         }
1507                 }
1508         }
1509
1510         context_set_domain_id(context, id);
1511
1512         if (translation != CONTEXT_TT_PASS_THROUGH) {
1513                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1514                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1515                                      CONTEXT_TT_MULTI_LEVEL;
1516         }
1517         /*
1518          * In pass through mode, AW must be programmed to indicate the largest
1519          * AGAW value supported by hardware. And ASR is ignored by hardware.
1520          */
1521         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1522                 context_set_address_width(context, iommu->msagaw);
1523         else {
1524                 context_set_address_root(context, virt_to_phys(pgd));
1525                 context_set_address_width(context, iommu->agaw);
1526         }
1527
1528         context_set_translation_type(context, translation);
1529         context_set_fault_enable(context);
1530         context_set_present(context);
1531         domain_flush_cache(domain, context, sizeof(*context));
1532
1533         /*
1534          * It's a non-present to present mapping. If hardware doesn't cache
1535          * non-present entries, we only need to flush the write-buffer. If it
1536          * _does_ cache non-present entries, then it does so in the special
1537          * domain #0, which we have to flush:
1538          */
1539         if (cap_caching_mode(iommu->cap)) {
1540                 iommu->flush.flush_context(iommu, 0,
1541                                            (((u16)bus) << 8) | devfn,
1542                                            DMA_CCMD_MASK_NOBIT,
1543                                            DMA_CCMD_DEVICE_INVL);
1544                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1545         } else {
1546                 iommu_flush_write_buffer(iommu);
1547         }
1548         iommu_enable_dev_iotlb(info);
1549         spin_unlock_irqrestore(&iommu->lock, flags);
1550
1551         spin_lock_irqsave(&domain->iommu_lock, flags);
1552         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1553                 domain->iommu_count++;
1554                 if (domain->iommu_count == 1)
1555                         domain->nid = iommu->node;
1556                 domain_update_iommu_cap(domain);
1557         }
1558         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1559         return 0;
1560 }
1561
1562 static int
1563 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1564                         int translation)
1565 {
1566         int ret;
1567         struct pci_dev *tmp, *parent;
1568
1569         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1570                                          pdev->bus->number, pdev->devfn,
1571                                          translation);
1572         if (ret)
1573                 return ret;
1574
1575         /* dependent device mapping */
1576         tmp = pci_find_upstream_pcie_bridge(pdev);
1577         if (!tmp)
1578                 return 0;
1579         /* Secondary interface's bus number and devfn 0 */
1580         parent = pdev->bus->self;
1581         while (parent != tmp) {
1582                 ret = domain_context_mapping_one(domain,
1583                                                  pci_domain_nr(parent->bus),
1584                                                  parent->bus->number,
1585                                                  parent->devfn, translation);
1586                 if (ret)
1587                         return ret;
1588                 parent = parent->bus->self;
1589         }
1590         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1591                 return domain_context_mapping_one(domain,
1592                                         pci_domain_nr(tmp->subordinate),
1593                                         tmp->subordinate->number, 0,
1594                                         translation);
1595         else /* this is a legacy PCI bridge */
1596                 return domain_context_mapping_one(domain,
1597                                                   pci_domain_nr(tmp->bus),
1598                                                   tmp->bus->number,
1599                                                   tmp->devfn,
1600                                                   translation);
1601 }
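/*
 * Topology sketch for domain_context_mapping() above, using hypothetical
 * bus numbers: for an endpoint 0000:05:00.0 sitting behind a PCIe-to-PCI
 * bridge at 0000:00:1e.0, context entries are programmed for the device
 * itself, for any intermediate bridges on the way up to 00:1e.0, and
 * finally for (secondary bus 05, devfn 0), so that DMA requests carrying
 * any of those source-ids resolve to the same domain.
 */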
1602
1603 static int domain_context_mapped(struct pci_dev *pdev)
1604 {
1605         int ret;
1606         struct pci_dev *tmp, *parent;
1607         struct intel_iommu *iommu;
1608
1609         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1610                                 pdev->devfn);
1611         if (!iommu)
1612                 return -ENODEV;
1613
1614         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1615         if (!ret)
1616                 return ret;
1617         /* dependent device mapping */
1618         tmp = pci_find_upstream_pcie_bridge(pdev);
1619         if (!tmp)
1620                 return ret;
1621         /* Secondary interface's bus number and devfn 0 */
1622         parent = pdev->bus->self;
1623         while (parent != tmp) {
1624                 ret = device_context_mapped(iommu, parent->bus->number,
1625                                             parent->devfn);
1626                 if (!ret)
1627                         return ret;
1628                 parent = parent->bus->self;
1629         }
1630         if (pci_is_pcie(tmp))
1631                 return device_context_mapped(iommu, tmp->subordinate->number,
1632                                              0);
1633         else
1634                 return device_context_mapped(iommu, tmp->bus->number,
1635                                              tmp->devfn);
1636 }
1637
1638 /* Returns a number of VTD pages, but aligned to MM page size */
1639 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1640                                             size_t size)
1641 {
1642         host_addr &= ~PAGE_MASK;
1643         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1644 }
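/*
 * Worked example for aligned_nrpages(), assuming 4KiB pages: with
 * host_addr == 0x1234 and size == 0x2000, the in-page offset is 0x234,
 * PAGE_ALIGN(0x234 + 0x2000) == 0x3000, so the mapping needs
 * 0x3000 >> VTD_PAGE_SHIFT == 3 VT-d pages.
 */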
1645
1646 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1647                             struct scatterlist *sg, unsigned long phys_pfn,
1648                             unsigned long nr_pages, int prot)
1649 {
1650         struct dma_pte *first_pte = NULL, *pte = NULL;
1651         phys_addr_t uninitialized_var(pteval);
1652         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1653         unsigned long sg_res;
1654
1655         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1656
1657         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1658                 return -EINVAL;
1659
1660         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1661
1662         if (sg)
1663                 sg_res = 0;
1664         else {
1665                 sg_res = nr_pages + 1;
1666                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1667         }
1668
1669         while (nr_pages--) {
1670                 uint64_t tmp;
1671
1672                 if (!sg_res) {
1673                         sg_res = aligned_nrpages(sg->offset, sg->length);
1674                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1675                         sg->dma_length = sg->length;
1676                         pteval = page_to_phys(sg_page(sg)) | prot;
1677                 }
1678                 if (!pte) {
1679                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1680                         if (!pte)
1681                                 return -ENOMEM;
1682                 }
1683                 /* We don't need a lock here; nobody else
1684                  * touches the iova range
1685                  */
1686                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1687                 if (tmp) {
1688                         static int dumps = 5;
1689                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1690                                iov_pfn, tmp, (unsigned long long)pteval);
1691                         if (dumps) {
1692                                 dumps--;
1693                                 debug_dma_dump_mappings(NULL);
1694                         }
1695                         WARN_ON(1);
1696                 }
1697                 pte++;
1698                 if (!nr_pages || first_pte_in_page(pte)) {
1699                         domain_flush_cache(domain, first_pte,
1700                                            (void *)pte - (void *)first_pte);
1701                         pte = NULL;
1702                 }
1703                 iov_pfn++;
1704                 pteval += VTD_PAGE_SIZE;
1705                 sg_res--;
1706                 if (!sg_res)
1707                         sg = sg_next(sg);
1708         }
1709         return 0;
1710 }
1711
1712 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1713                                     struct scatterlist *sg, unsigned long nr_pages,
1714                                     int prot)
1715 {
1716         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1717 }
1718
1719 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1720                                      unsigned long phys_pfn, unsigned long nr_pages,
1721                                      int prot)
1722 {
1723         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1724 }
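/*
 * Minimal usage sketch (illustrative only, error handling elided):
 * identity-mapping a single page frame "pfn" into a domain "dom":
 *
 *      ret = domain_pfn_mapping(dom, pfn, pfn, 1,
 *                               DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * iommu_domain_identity_map() below does exactly this for whole pfn
 * ranges when setting up RMRRs and the static identity domain.
 */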
1725
1726 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1727 {
1728         if (!iommu)
1729                 return;
1730
1731         clear_context_table(iommu, bus, devfn);
1732         iommu->flush.flush_context(iommu, 0, 0, 0,
1733                                            DMA_CCMD_GLOBAL_INVL);
1734         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1735 }
1736
1737 static void domain_remove_dev_info(struct dmar_domain *domain)
1738 {
1739         struct device_domain_info *info;
1740         unsigned long flags;
1741         struct intel_iommu *iommu;
1742
1743         spin_lock_irqsave(&device_domain_lock, flags);
1744         while (!list_empty(&domain->devices)) {
1745                 info = list_entry(domain->devices.next,
1746                         struct device_domain_info, link);
1747                 list_del(&info->link);
1748                 list_del(&info->global);
1749                 if (info->dev)
1750                         info->dev->dev.archdata.iommu = NULL;
1751                 spin_unlock_irqrestore(&device_domain_lock, flags);
1752
1753                 iommu_disable_dev_iotlb(info);
1754                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1755                 iommu_detach_dev(iommu, info->bus, info->devfn);
1756                 free_devinfo_mem(info);
1757
1758                 spin_lock_irqsave(&device_domain_lock, flags);
1759         }
1760         spin_unlock_irqrestore(&device_domain_lock, flags);
1761 }
1762
1763 /*
1764  * find_domain
1765  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1766  */
1767 static struct dmar_domain *
1768 find_domain(struct pci_dev *pdev)
1769 {
1770         struct device_domain_info *info;
1771
1772         /* No lock here, assumes no domain exit in normal case */
1773         info = pdev->dev.archdata.iommu;
1774         if (info)
1775                 return info->domain;
1776         return NULL;
1777 }
1778
1779 /* domain is initialized */
1780 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1781 {
1782         struct dmar_domain *domain, *found = NULL;
1783         struct intel_iommu *iommu;
1784         struct dmar_drhd_unit *drhd;
1785         struct device_domain_info *info, *tmp;
1786         struct pci_dev *dev_tmp;
1787         unsigned long flags;
1788         int bus = 0, devfn = 0;
1789         int segment;
1790         int ret;
1791
1792         domain = find_domain(pdev);
1793         if (domain)
1794                 return domain;
1795
1796         segment = pci_domain_nr(pdev->bus);
1797
1798         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1799         if (dev_tmp) {
1800                 if (pci_is_pcie(dev_tmp)) {
1801                         bus = dev_tmp->subordinate->number;
1802                         devfn = 0;
1803                 } else {
1804                         bus = dev_tmp->bus->number;
1805                         devfn = dev_tmp->devfn;
1806                 }
1807                 spin_lock_irqsave(&device_domain_lock, flags);
1808                 list_for_each_entry(info, &device_domain_list, global) {
1809                         if (info->segment == segment &&
1810                             info->bus == bus && info->devfn == devfn) {
1811                                 found = info->domain;
1812                                 break;
1813                         }
1814                 }
1815                 spin_unlock_irqrestore(&device_domain_lock, flags);
1816                 /* the pcie-pci bridge already has a domain; use it */
1817                 if (found) {
1818                         domain = found;
1819                         goto found_domain;
1820                 }
1821         }
1822
1823         domain = alloc_domain();
1824         if (!domain)
1825                 goto error;
1826
1827         /* Allocate new domain for the device */
1828         drhd = dmar_find_matched_drhd_unit(pdev);
1829         if (!drhd) {
1830                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1831                         pci_name(pdev));
1832                 return NULL;
1833         }
1834         iommu = drhd->iommu;
1835
1836         ret = iommu_attach_domain(domain, iommu);
1837         if (ret) {
1838                 domain_exit(domain);
1839                 goto error;
1840         }
1841
1842         if (domain_init(domain, gaw)) {
1843                 domain_exit(domain);
1844                 goto error;
1845         }
1846
1847         /* register pcie-to-pci device */
1848         if (dev_tmp) {
1849                 info = alloc_devinfo_mem();
1850                 if (!info) {
1851                         domain_exit(domain);
1852                         goto error;
1853                 }
1854                 info->segment = segment;
1855                 info->bus = bus;
1856                 info->devfn = devfn;
1857                 info->dev = NULL;
1858                 info->domain = domain;
1859                 /* This domain is shared by devices under p2p bridge */
1860                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1861
1862                 /* the pcie-to-pci bridge already has a domain; use it */
1863                 found = NULL;
1864                 spin_lock_irqsave(&device_domain_lock, flags);
1865                 list_for_each_entry(tmp, &device_domain_list, global) {
1866                         if (tmp->segment == segment &&
1867                             tmp->bus == bus && tmp->devfn == devfn) {
1868                                 found = tmp->domain;
1869                                 break;
1870                         }
1871                 }
1872                 if (found) {
1873                         spin_unlock_irqrestore(&device_domain_lock, flags);
1874                         free_devinfo_mem(info);
1875                         domain_exit(domain);
1876                         domain = found;
1877                 } else {
1878                         list_add(&info->link, &domain->devices);
1879                         list_add(&info->global, &device_domain_list);
1880                         spin_unlock_irqrestore(&device_domain_lock, flags);
1881                 }
1882         }
1883
1884 found_domain:
1885         info = alloc_devinfo_mem();
1886         if (!info)
1887                 goto error;
1888         info->segment = segment;
1889         info->bus = pdev->bus->number;
1890         info->devfn = pdev->devfn;
1891         info->dev = pdev;
1892         info->domain = domain;
1893         spin_lock_irqsave(&device_domain_lock, flags);
1894         /* another CPU may have installed the domain already */
1895         found = find_domain(pdev);
1896         if (found != NULL) {
1897                 spin_unlock_irqrestore(&device_domain_lock, flags);
1898                 if (found != domain) {
1899                         domain_exit(domain);
1900                         domain = found;
1901                 }
1902                 free_devinfo_mem(info);
1903                 return domain;
1904         }
1905         list_add(&info->link, &domain->devices);
1906         list_add(&info->global, &device_domain_list);
1907         pdev->dev.archdata.iommu = info;
1908         spin_unlock_irqrestore(&device_domain_lock, flags);
1909         return domain;
1910 error:
1911         /* recheck it here, maybe others set it */
1912         return find_domain(pdev);
1913 }
1914
1915 static int iommu_identity_mapping;
1916 #define IDENTMAP_ALL            1
1917 #define IDENTMAP_GFX            2
1918 #define IDENTMAP_AZALIA         4
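/*
 * iommu_identity_mapping is a bitmask: init_dmars() below sets
 * IDENTMAP_ALL when pass-through is requested and IDENTMAP_GFX under
 * CONFIG_DMAR_BROKEN_GFX_WA; IDENTMAP_AZALIA comes from the Tylersburg
 * isochronous work-around. For example,
 * (iommu_identity_mapping & IDENTMAP_GFX) tests whether graphics
 * devices should get the 1:1 mapping.
 */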
1919
1920 static int iommu_domain_identity_map(struct dmar_domain *domain,
1921                                      unsigned long long start,
1922                                      unsigned long long end)
1923 {
1924         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1925         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1926
1927         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1928                           dma_to_mm_pfn(last_vpfn))) {
1929                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1930                 return -ENOMEM;
1931         }
1932
1933         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1934                  start, end, domain->id);
1935         /*
1936          * RMRR range might have overlap with physical memory range,
1937          * clear it first
1938          */
1939         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1940
1941         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1942                                   last_vpfn - first_vpfn + 1,
1943                                   DMA_PTE_READ|DMA_PTE_WRITE);
1944 }
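/*
 * Address arithmetic example for iommu_domain_identity_map() above
 * (illustrative only): for start == 0x100000 and end == 0x1fffff,
 * first_vpfn == 0x100 and last_vpfn == 0x1ff, so 0x100 (256) pages,
 * i.e. 1MiB, are reserved in the iova allocator and identity mapped.
 */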
1945
1946 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1947                                       unsigned long long start,
1948                                       unsigned long long end)
1949 {
1950         struct dmar_domain *domain;
1951         int ret;
1952
1953         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1954         if (!domain)
1955                 return -ENOMEM;
1956
1957         /* For _hardware_ passthrough, don't bother. But for software
1958            passthrough, we do it anyway -- it may indicate a memory
1959            range which is reserved in E820 and so didn't get set
1960            up to start with in si_domain */
1961         if (domain == si_domain && hw_pass_through) {
1962                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1963                        pci_name(pdev), start, end);
1964                 return 0;
1965         }
1966
1967         printk(KERN_INFO
1968                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1969                pci_name(pdev), start, end);
1970
1971         if (end < start) {
1972                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1973                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1974                         dmi_get_system_info(DMI_BIOS_VENDOR),
1975                         dmi_get_system_info(DMI_BIOS_VERSION),
1976                         dmi_get_system_info(DMI_PRODUCT_VERSION));
1977                 ret = -EIO;
1978                 goto error;
1979         }
1980
1981         if (end >> agaw_to_width(domain->agaw)) {
1982                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1983                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1984                      agaw_to_width(domain->agaw),
1985                      dmi_get_system_info(DMI_BIOS_VENDOR),
1986                      dmi_get_system_info(DMI_BIOS_VERSION),
1987                      dmi_get_system_info(DMI_PRODUCT_VERSION));
1988                 ret = -EIO;
1989                 goto error;
1990         }
1991
1992         ret = iommu_domain_identity_map(domain, start, end);
1993         if (ret)
1994                 goto error;
1995
1996         /* context entry init */
1997         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
1998         if (ret)
1999                 goto error;
2000
2001         return 0;
2002
2003  error:
2004         domain_exit(domain);
2005         return ret;
2006 }
2007
2008 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2009         struct pci_dev *pdev)
2010 {
2011         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2012                 return 0;
2013         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2014                 rmrr->end_address + 1);
2015 }
2016
2017 #ifdef CONFIG_DMAR_FLOPPY_WA
2018 static inline void iommu_prepare_isa(void)
2019 {
2020         struct pci_dev *pdev;
2021         int ret;
2022
2023         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2024         if (!pdev)
2025                 return;
2026
2027         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2028         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2029
2030         if (ret)
2031                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2032                        "floppy might not work\n");
2033
2034 }
2035 #else
2036 static inline void iommu_prepare_isa(void)
2037 {
2038         return;
2039 }
2040 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2041
2042 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2043
2044 static int __init si_domain_work_fn(unsigned long start_pfn,
2045                                     unsigned long end_pfn, void *datax)
2046 {
2047         int *ret = datax;
2048
2049         *ret = iommu_domain_identity_map(si_domain,
2050                                          (uint64_t)start_pfn << PAGE_SHIFT,
2051                                          (uint64_t)end_pfn << PAGE_SHIFT);
2052         return *ret;
2053
2054 }
2055
2056 static int __init si_domain_init(int hw)
2057 {
2058         struct dmar_drhd_unit *drhd;
2059         struct intel_iommu *iommu;
2060         int nid, ret = 0;
2061
2062         si_domain = alloc_domain();
2063         if (!si_domain)
2064                 return -EFAULT;
2065
2066         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2067
2068         for_each_active_iommu(iommu, drhd) {
2069                 ret = iommu_attach_domain(si_domain, iommu);
2070                 if (ret) {
2071                         domain_exit(si_domain);
2072                         return -EFAULT;
2073                 }
2074         }
2075
2076         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2077                 domain_exit(si_domain);
2078                 return -EFAULT;
2079         }
2080
2081         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2082
2083         if (hw)
2084                 return 0;
2085
2086         for_each_online_node(nid) {
2087                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2088                 if (ret)
2089                         return ret;
2090         }
2091
2092         return 0;
2093 }
2094
2095 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2096                                           struct pci_dev *pdev);
2097 static int identity_mapping(struct pci_dev *pdev)
2098 {
2099         struct device_domain_info *info;
2100
2101         if (likely(!iommu_identity_mapping))
2102                 return 0;
2103
2105         list_for_each_entry(info, &si_domain->devices, link)
2106                 if (info->dev == pdev)
2107                         return 1;
2108         return 0;
2109 }
2110
2111 static int domain_add_dev_info(struct dmar_domain *domain,
2112                                struct pci_dev *pdev,
2113                                int translation)
2114 {
2115         struct device_domain_info *info;
2116         unsigned long flags;
2117         int ret;
2118
2119         info = alloc_devinfo_mem();
2120         if (!info)
2121                 return -ENOMEM;
2122
2123         ret = domain_context_mapping(domain, pdev, translation);
2124         if (ret) {
2125                 free_devinfo_mem(info);
2126                 return ret;
2127         }
2128
2129         info->segment = pci_domain_nr(pdev->bus);
2130         info->bus = pdev->bus->number;
2131         info->devfn = pdev->devfn;
2132         info->dev = pdev;
2133         info->domain = domain;
2134
2135         spin_lock_irqsave(&device_domain_lock, flags);
2136         list_add(&info->link, &domain->devices);
2137         list_add(&info->global, &device_domain_list);
2138         pdev->dev.archdata.iommu = info;
2139         spin_unlock_irqrestore(&device_domain_lock, flags);
2140
2141         return 0;
2142 }
2143
2144 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2145 {
2146         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2147                 return 1;
2148
2149         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2150                 return 1;
2151
2152         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2153                 return 0;
2154
2155         /*
2156          * We want to start off with all devices in the 1:1 domain, and
2157          * take them out later if we find they can't access all of memory.
2158          *
2159          * However, we can't do this for PCI devices behind bridges,
2160          * because all PCI devices behind the same bridge will end up
2161          * with the same source-id on their transactions.
2162          *
2163          * Practically speaking, we can't change things around for these
2164          * devices at run-time, because we can't be sure there'll be no
2165          * DMA transactions in flight for any of their siblings.
2166          * 
2167          * So PCI devices (unless they're on the root bus) as well as
2168          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2169          * the 1:1 domain, just in _case_ one of their siblings turns out
2170          * not to be able to map all of memory.
2171          */
2172         if (!pci_is_pcie(pdev)) {
2173                 if (!pci_is_root_bus(pdev->bus))
2174                         return 0;
2175                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2176                         return 0;
2177         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2178                 return 0;
2179
2180         /* 
2181          * At boot time, we don't yet know if devices will be 64-bit capable.
2182          * Assume that they will -- if they turn out not to be, then we can 
2183          * take them out of the 1:1 domain later.
2184          */
2185         if (!startup)
2186                 return pdev->dma_mask > DMA_BIT_MASK(32);
2187
2188         return 1;
2189 }
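/*
 * Rough decision summary for iommu_should_identity_map() above, derived
 * from the checks in the function (not an authoritative table):
 *
 *   - azalia/gfx quirk bit set and the device matches:  map 1:1
 *   - IDENTMAP_ALL not set:                             don't
 *   - conventional PCI device off the root bus, or any
 *     PCI/PCIe bridge:                                  don't
 *   - at run time (startup == 0):                       only if
 *     pdev->dma_mask is wider than 32 bits
 *   - otherwise (boot time):                            map 1:1
 */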
2190
2191 static int __init iommu_prepare_static_identity_mapping(int hw)
2192 {
2193         struct pci_dev *pdev = NULL;
2194         int ret;
2195
2196         ret = si_domain_init(hw);
2197         if (ret)
2198                 return -EFAULT;
2199
2200         for_each_pci_dev(pdev) {
2201                 if (iommu_should_identity_map(pdev, 1)) {
2202                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2203                                hw ? "hardware" : "software", pci_name(pdev));
2204
2205                         ret = domain_add_dev_info(si_domain, pdev,
2206                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2207                                                      CONTEXT_TT_MULTI_LEVEL);
2208                         if (ret)
2209                                 return ret;
2210                 }
2211         }
2212
2213         return 0;
2214 }
2215
2216 int __init init_dmars(void)
2217 {
2218         struct dmar_drhd_unit *drhd;
2219         struct dmar_rmrr_unit *rmrr;
2220         struct pci_dev *pdev;
2221         struct intel_iommu *iommu;
2222         int i, ret;
2223
2224         /*
2225          * for each drhd
2226          *    allocate root
2227          *    initialize and program root entry to not present
2228          * endfor
2229          */
2230         for_each_drhd_unit(drhd) {
2231                 g_num_of_iommus++;
2232                 /*
2233                  * lock not needed as this is only incremented in the single
2234                  * lock not needed as this is only incremented in the single
2235                  * threaded kernel __init code path; all other accesses are
2236                  * read only
2237         }
2238
2239         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2240                         GFP_KERNEL);
2241         if (!g_iommus) {
2242                 printk(KERN_ERR "Allocating global iommu array failed\n");
2243                 ret = -ENOMEM;
2244                 goto error;
2245         }
2246
2247         deferred_flush = kzalloc(g_num_of_iommus *
2248                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2249         if (!deferred_flush) {
2250                 ret = -ENOMEM;
2251                 goto error;
2252         }
2253
2254         for_each_drhd_unit(drhd) {
2255                 if (drhd->ignored)
2256                         continue;
2257
2258                 iommu = drhd->iommu;
2259                 g_iommus[iommu->seq_id] = iommu;
2260
2261                 ret = iommu_init_domains(iommu);
2262                 if (ret)
2263                         goto error;
2264
2265                 /*
2266                  * TBD:
2267                  * we could share the same root & context tables
2268                  * among all IOMMUs. Needs to be split later.
2269                  */
2270                 ret = iommu_alloc_root_entry(iommu);
2271                 if (ret) {
2272                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2273                         goto error;
2274                 }
2275                 if (!ecap_pass_through(iommu->ecap))
2276                         hw_pass_through = 0;
2277         }
2278
2279         /*
2280          * Start from a sane iommu hardware state.
2281          */
2282         for_each_drhd_unit(drhd) {
2283                 if (drhd->ignored)
2284                         continue;
2285
2286                 iommu = drhd->iommu;
2287
2288                 /*
2289                  * If the queued invalidation is already initialized by us
2290                  * (for example, while enabling interrupt-remapping) then
2291                  * we got the things already rolling from a sane state.
2292                  */
2293                 if (iommu->qi)
2294                         continue;
2295
2296                 /*
2297                  * Clear any previous faults.
2298                  */
2299                 dmar_fault(-1, iommu);
2300                 /*
2301                  * Disable queued invalidation if supported and already enabled
2302                  * before OS handover.
2303                  */
2304                 dmar_disable_qi(iommu);
2305         }
2306
2307         for_each_drhd_unit(drhd) {
2308                 if (drhd->ignored)
2309                         continue;
2310
2311                 iommu = drhd->iommu;
2312
2313                 if (dmar_enable_qi(iommu)) {
2314                         /*
2315                          * Queued Invalidate not enabled, use Register Based
2316                          * Invalidate
2317                          */
2318                         iommu->flush.flush_context = __iommu_flush_context;
2319                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2320                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2321                                "invalidation\n",
2322                                 iommu->seq_id,
2323                                (unsigned long long)drhd->reg_base_addr);
2324                 } else {
2325                         iommu->flush.flush_context = qi_flush_context;
2326                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2327                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2328                                "invalidation\n",
2329                                 iommu->seq_id,
2330                                (unsigned long long)drhd->reg_base_addr);
2331                 }
2332         }
2333
2334         if (iommu_pass_through)
2335                 iommu_identity_mapping |= IDENTMAP_ALL;
2336
2337 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2338         iommu_identity_mapping |= IDENTMAP_GFX;
2339 #endif
2340
2341         check_tylersburg_isoch();
2342
2343         /*
2344          * If pass through is not set or not enabled, set up context entries
2345          * for identity mappings for rmrr, gfx, and isa, and possibly fall
2346          * back to a static identity mapping if iommu_identity_mapping is set.
2347          */
2348         if (iommu_identity_mapping) {
2349                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2350                 if (ret) {
2351                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2352                         goto error;
2353                 }
2354         }
2355         /*
2356          * For each rmrr
2357          *   for each dev attached to rmrr
2358          *   do
2359          *     locate drhd for dev, alloc domain for dev
2360          *     allocate free domain
2361          *     allocate page table entries for rmrr
2362          *     if context not allocated for bus
2363          *           allocate and init context
2364          *           set present in root table for this bus
2365          *     init context with domain, translation etc
2366          *    endfor
2367          * endfor
2368          */
2369         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2370         for_each_rmrr_units(rmrr) {
2371                 for (i = 0; i < rmrr->devices_cnt; i++) {
2372                         pdev = rmrr->devices[i];
2373                         /*
2374                          * some BIOSes list non-existent devices in the
2375                          * DMAR table.
2376                          */
2377                         if (!pdev)
2378                                 continue;
2379                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2380                         if (ret)
2381                                 printk(KERN_ERR
2382                                        "IOMMU: mapping reserved region failed\n");
2383                 }
2384         }
2385
2386         iommu_prepare_isa();
2387
2388         /*
2389          * for each drhd
2390          *   enable fault log
2391          *   global invalidate context cache
2392          *   global invalidate iotlb
2393          *   enable translation
2394          */
2395         for_each_drhd_unit(drhd) {
2396                 if (drhd->ignored)
2397                         continue;
2398                 iommu = drhd->iommu;
2399
2400                 iommu_flush_write_buffer(iommu);
2401
2402                 ret = dmar_set_interrupt(iommu);
2403                 if (ret)
2404                         goto error;
2405
2406                 iommu_set_root_entry(iommu);
2407
2408                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2409                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2410
2411                 ret = iommu_enable_translation(iommu);
2412                 if (ret)
2413                         goto error;
2414
2415                 iommu_disable_protect_mem_regions(iommu);
2416         }
2417
2418         return 0;
2419 error:
2420         for_each_drhd_unit(drhd) {
2421                 if (drhd->ignored)
2422                         continue;
2423                 iommu = drhd->iommu;
2424                 free_iommu(iommu);
2425         }
2426         kfree(g_iommus);
2427         return ret;
2428 }
2429
2430 /* This takes a number of _MM_ pages, not VTD pages */
2431 static struct iova *intel_alloc_iova(struct device *dev,
2432                                      struct dmar_domain *domain,
2433                                      unsigned long nrpages, uint64_t dma_mask)
2434 {
2435         struct pci_dev *pdev = to_pci_dev(dev);
2436         struct iova *iova = NULL;
2437
2438         /* Restrict dma_mask to the width that the iommu can handle */
2439         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2440
2441         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2442                 /*
2443                  * First try to allocate an io virtual address in
2444                  * DMA_BIT_MASK(32) and if that fails then try allocating
2445                  * from higher range
2446                  */
2447                 iova = alloc_iova(&domain->iovad, nrpages,
2448                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2449                 if (iova)
2450                         return iova;
2451         }
2452         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2453         if (unlikely(!iova)) {
2454                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2455                        nrpages, pci_name(pdev));
2456                 return NULL;
2457         }
2458
2459         return iova;
2460 }
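/*
 * Allocation strategy example for intel_alloc_iova() above (illustrative
 * only): for a device with a 64-bit dma_mask and dmar_forcedac unset,
 * the first alloc_iova() attempt is limited to IOVA_PFN(DMA_BIT_MASK(32)),
 * i.e. addresses below 4GiB; only if that range is exhausted does the
 * second attempt use the full IOVA_PFN(dma_mask) limit.
 */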
2461
2462 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2463 {
2464         struct dmar_domain *domain;
2465         int ret;
2466
2467         domain = get_domain_for_dev(pdev,
2468                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2469         if (!domain) {
2470                 printk(KERN_ERR
2471                         "Allocating domain for %s failed\n", pci_name(pdev));
2472                 return NULL;
2473         }
2474
2475         /* make sure context mapping is ok */
2476         if (unlikely(!domain_context_mapped(pdev))) {
2477                 ret = domain_context_mapping(domain, pdev,
2478                                              CONTEXT_TT_MULTI_LEVEL);
2479                 if (ret) {
2480                         printk(KERN_ERR
2481                                 "Domain context map for %s failed\n",
2482                                 pci_name(pdev));
2483                         return NULL;
2484                 }
2485         }
2486
2487         return domain;
2488 }
2489
2490 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2491 {
2492         struct device_domain_info *info;
2493
2494         /* No lock here, assumes no domain exit in normal case */
2495         info = dev->dev.archdata.iommu;
2496         if (likely(info))
2497                 return info->domain;
2498
2499         return __get_valid_domain_for_dev(dev);
2500 }
2501
2502 static int iommu_dummy(struct pci_dev *pdev)
2503 {
2504         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2505 }
2506
2507 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2508 static int iommu_no_mapping(struct device *dev)
2509 {
2510         struct pci_dev *pdev;
2511         int found;
2512
2513         if (unlikely(dev->bus != &pci_bus_type))
2514                 return 1;
2515
2516         pdev = to_pci_dev(dev);
2517         if (iommu_dummy(pdev))
2518                 return 1;
2519
2520         if (!iommu_identity_mapping)
2521                 return 0;
2522
2523         found = identity_mapping(pdev);
2524         if (found) {
2525                 if (iommu_should_identity_map(pdev, 0))
2526                         return 1;
2527                 else {
2528                         /*
2529                          * The 32 bit DMA device is removed from si_domain
2530                          * and falls back to non-identity mapping.
2531                          */
2532                         domain_remove_one_dev_info(si_domain, pdev);
2533                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2534                                pci_name(pdev));
2535                         return 0;
2536                 }
2537         } else {
2538                 /*
2539                  * In case a 64 bit DMA device was detached from a VM, the
2540                  * device is put back into si_domain for identity mapping.
2541                  */
2542                 if (iommu_should_identity_map(pdev, 0)) {
2543                         int ret;
2544                         ret = domain_add_dev_info(si_domain, pdev,
2545                                                   hw_pass_through ?
2546                                                   CONTEXT_TT_PASS_THROUGH :
2547                                                   CONTEXT_TT_MULTI_LEVEL);
2548                         if (!ret) {
2549                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2550                                        pci_name(pdev));
2551                                 return 1;
2552                         }
2553                 }
2554         }
2555
2556         return 0;
2557 }
2558
2559 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2560                                      size_t size, int dir, u64 dma_mask)
2561 {
2562         struct pci_dev *pdev = to_pci_dev(hwdev);
2563         struct dmar_domain *domain;
2564         phys_addr_t start_paddr;
2565         struct iova *iova;
2566         int prot = 0;
2567         int ret;
2568         struct intel_iommu *iommu;
2569         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2570
2571         BUG_ON(dir == DMA_NONE);
2572
2573         if (iommu_no_mapping(hwdev))
2574                 return paddr;
2575
2576         domain = get_valid_domain_for_dev(pdev);
2577         if (!domain)
2578                 return 0;
2579
2580         iommu = domain_get_iommu(domain);
2581         size = aligned_nrpages(paddr, size);
2582
2583         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2584                                 pdev->dma_mask);
2585         if (!iova)
2586                 goto error;
2587
2588         /*
2589          * Check if DMAR supports zero-length reads on write only
2590          * mappings.
2591          */
2592         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2593                         !cap_zlr(iommu->cap))
2594                 prot |= DMA_PTE_READ;
2595         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2596                 prot |= DMA_PTE_WRITE;
2597         /*
2598          * The range paddr .. (paddr + size) might cover only part of a page,
2599          * so we should map the whole page.  Note: if two parts of one page
2600          * are mapped separately, we might have two guest addresses mapping
2601          * to the same host paddr, but this is not a big problem
2602          */
2603         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2604                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2605         if (ret)
2606                 goto error;
2607
2608         /* it's a non-present to present mapping. Only flush if caching mode */
2609         if (cap_caching_mode(iommu->cap))
2610                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2611         else
2612                 iommu_flush_write_buffer(iommu);
2613
2614         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2615         start_paddr += paddr & ~PAGE_MASK;
2616         return start_paddr;
2617
2618 error:
2619         if (iova)
2620                 __free_iova(&domain->iovad, iova);
2621         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2622                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2623         return 0;
2624 }
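/*
 * Return value arithmetic for __intel_map_single() above (illustrative
 * only, assuming 4KiB pages): mapping paddr == 0x12345678 into an iova
 * whose pfn_lo is 0xfffe0 yields the bus address
 * (0xfffe0 << PAGE_SHIFT) + (0x12345678 & ~PAGE_MASK) == 0xfffe0678,
 * i.e. the caller keeps the original in-page offset.
 */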
2625
2626 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2627                                  unsigned long offset, size_t size,
2628                                  enum dma_data_direction dir,
2629                                  struct dma_attrs *attrs)
2630 {
2631         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2632                                   dir, to_pci_dev(dev)->dma_mask);
2633 }
2634
2635 static void flush_unmaps(void)
2636 {
2637         int i, j;
2638
2639         timer_on = 0;
2640
2641         /* just flush them all */
2642         for (i = 0; i < g_num_of_iommus; i++) {
2643                 struct intel_iommu *iommu = g_iommus[i];
2644                 if (!iommu)
2645                         continue;
2646
2647                 if (!deferred_flush[i].next)
2648                         continue;
2649
2650                 /* In caching mode, global flushes make emulation expensive */
2651                 if (!cap_caching_mode(iommu->cap))
2652                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2653                                          DMA_TLB_GLOBAL_FLUSH);
2654                 for (j = 0; j < deferred_flush[i].next; j++) {
2655                         unsigned long mask;
2656                         struct iova *iova = deferred_flush[i].iova[j];
2657                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2658
2659                         /* On real hardware multiple invalidations are expensive */
2660                         if (cap_caching_mode(iommu->cap))
2661                                 iommu_flush_iotlb_psi(iommu, domain->id,
2662                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2663                         else {
2664                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2665                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2666                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2667                         }
2668                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2669                 }
2670                 deferred_flush[i].next = 0;
2671         }
2672
2673         list_size = 0;
2674 }
2675
2676 static void flush_unmaps_timeout(unsigned long data)
2677 {
2678         unsigned long flags;
2679
2680         spin_lock_irqsave(&async_umap_flush_lock, flags);
2681         flush_unmaps();
2682         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2683 }
2684
2685 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2686 {
2687         unsigned long flags;
2688         int next, iommu_id;
2689         struct intel_iommu *iommu;
2690
2691         spin_lock_irqsave(&async_umap_flush_lock, flags);
2692         if (list_size == HIGH_WATER_MARK)
2693                 flush_unmaps();
2694
2695         iommu = domain_get_iommu(dom);
2696         iommu_id = iommu->seq_id;
2697
2698         next = deferred_flush[iommu_id].next;
2699         deferred_flush[iommu_id].domain[next] = dom;
2700         deferred_flush[iommu_id].iova[next] = iova;
2701         deferred_flush[iommu_id].next++;
2702
2703         if (!timer_on) {
2704                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2705                 timer_on = 1;
2706         }
2707         list_size++;
2708         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2709 }
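/*
 * Batching sketch for add_unmap()/flush_unmaps() above: freed iovas are
 * parked per-iommu in deferred_flush[] and released in bulk either when
 * the 10ms unmap_timer fires or when list_size reaches HIGH_WATER_MARK,
 * trading a short window of stale IOTLB entries for far fewer flushes
 * in the non-strict unmap path.
 */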
2710
2711 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2712                              size_t size, enum dma_data_direction dir,
2713                              struct dma_attrs *attrs)
2714 {
2715         struct pci_dev *pdev = to_pci_dev(dev);
2716         struct dmar_domain *domain;
2717         unsigned long start_pfn, last_pfn;
2718         struct iova *iova;
2719         struct intel_iommu *iommu;
2720
2721         if (iommu_no_mapping(dev))
2722                 return;
2723
2724         domain = find_domain(pdev);
2725         BUG_ON(!domain);
2726
2727         iommu = domain_get_iommu(domain);
2728
2729         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2730         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2731                       (unsigned long long)dev_addr))
2732                 return;
2733
2734         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2735         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2736
2737         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2738                  pci_name(pdev), start_pfn, last_pfn);
2739
2740         /*  clear the whole page */
2741         dma_pte_clear_range(domain, start_pfn, last_pfn);
2742
2743         /* free page tables */
2744         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2745
2746         if (intel_iommu_strict) {
2747                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2748                                       last_pfn - start_pfn + 1, 0);
2749                 /* free iova */
2750                 __free_iova(&domain->iovad, iova);
2751         } else {
2752                 add_unmap(domain, iova);
2753                 /*
2754                  * queue up the release of the unmap to save the 1/6th of the
2755                  * cpu time used up by the iotlb flush operation...
2756                  */
2757         }
2758 }
2759
2760 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2761                                   dma_addr_t *dma_handle, gfp_t flags)
2762 {
2763         void *vaddr;
2764         int order;
2765
2766         size = PAGE_ALIGN(size);
2767         order = get_order(size);
2768
2769         if (!iommu_no_mapping(hwdev))
2770                 flags &= ~(GFP_DMA | GFP_DMA32);
2771         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2772                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2773                         flags |= GFP_DMA;
2774                 else
2775                         flags |= GFP_DMA32;
2776         }
2777
2778         vaddr = (void *)__get_free_pages(flags, order);
2779         if (!vaddr)
2780                 return NULL;
2781         memset(vaddr, 0, size);
2782
2783         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2784                                          DMA_BIDIRECTIONAL,
2785                                          hwdev->coherent_dma_mask);
2786         if (*dma_handle)
2787                 return vaddr;
2788         free_pages((unsigned long)vaddr, order);
2789         return NULL;
2790 }
2791
2792 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2793                                 dma_addr_t dma_handle)
2794 {
2795         int order;
2796
2797         size = PAGE_ALIGN(size);
2798         order = get_order(size);
2799
2800         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2801         free_pages((unsigned long)vaddr, order);
2802 }
2803
2804 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2805                            int nelems, enum dma_data_direction dir,
2806                            struct dma_attrs *attrs)
2807 {
2808         struct pci_dev *pdev = to_pci_dev(hwdev);
2809         struct dmar_domain *domain;
2810         unsigned long start_pfn, last_pfn;
2811         struct iova *iova;
2812         struct intel_iommu *iommu;
2813
2814         if (iommu_no_mapping(hwdev))
2815                 return;
2816
2817         domain = find_domain(pdev);
2818         BUG_ON(!domain);
2819
2820         iommu = domain_get_iommu(domain);
2821
2822         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2823         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2824                       (unsigned long long)sglist[0].dma_address))
2825                 return;
2826
2827         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2828         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2829
2830         /*  clear the whole page */
2831         dma_pte_clear_range(domain, start_pfn, last_pfn);
2832
2833         /* free page tables */
2834         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2835
2836         if (intel_iommu_strict) {
2837                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2838                                       last_pfn - start_pfn + 1, 0);
2839                 /* free iova */
2840                 __free_iova(&domain->iovad, iova);
2841         } else {
2842                 add_unmap(domain, iova);
2843                 /*
2844                  * queue up the release of the unmap to save the 1/6th of the
2845                  * cpu time used up by the iotlb flush operation...
2846                  */
2847         }
2848 }
2849
2850 static int intel_nontranslate_map_sg(struct device *hddev,
2851         struct scatterlist *sglist, int nelems, int dir)
2852 {
2853         int i;
2854         struct scatterlist *sg;
2855
2856         for_each_sg(sglist, sg, nelems, i) {
2857                 BUG_ON(!sg_page(sg));
2858                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2859                 sg->dma_length = sg->length;
2860         }
2861         return nelems;
2862 }
2863
2864 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2865                         enum dma_data_direction dir, struct dma_attrs *attrs)
2866 {
2867         int i;
2868         struct pci_dev *pdev = to_pci_dev(hwdev);
2869         struct dmar_domain *domain;
2870         size_t size = 0;
2871         int prot = 0;
2872         struct iova *iova = NULL;
2873         int ret;
2874         struct scatterlist *sg;
2875         unsigned long start_vpfn;
2876         struct intel_iommu *iommu;
2877
2878         BUG_ON(dir == DMA_NONE);
2879         if (iommu_no_mapping(hwdev))
2880                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2881
2882         domain = get_valid_domain_for_dev(pdev);
2883         if (!domain)
2884                 return 0;
2885
2886         iommu = domain_get_iommu(domain);
2887
2888         for_each_sg(sglist, sg, nelems, i)
2889                 size += aligned_nrpages(sg->offset, sg->length);
2890
2891         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2892                                 pdev->dma_mask);
2893         if (!iova) {
2894                 sglist->dma_length = 0;
2895                 return 0;
2896         }
2897
2898         /*
2899          * Check if DMAR supports zero-length reads on write only
2900          * mappings.
2901          */
2902         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2903                         !cap_zlr(iommu->cap))
2904                 prot |= DMA_PTE_READ;
2905         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2906                 prot |= DMA_PTE_WRITE;
2907
2908         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2909
2910         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2911         if (unlikely(ret)) {
2912                 /*  clear the page */
2913                 dma_pte_clear_range(domain, start_vpfn,
2914                                     start_vpfn + size - 1);
2915                 /* free page tables */
2916                 dma_pte_free_pagetable(domain, start_vpfn,
2917                                        start_vpfn + size - 1);
2918                 /* free iova */
2919                 __free_iova(&domain->iovad, iova);
2920                 return 0;
2921         }
2922
2923         /* it's a non-present to present mapping. Only flush if caching mode */
2924         if (cap_caching_mode(iommu->cap))
2925                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
2926         else
2927                 iommu_flush_write_buffer(iommu);
2928
2929         return nelems;
2930 }
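/*
 * Size accounting example for intel_map_sg() above (illustrative only,
 * assuming 4KiB pages): a two-entry sglist of {offset 0x200, length
 * 0x1000} and {offset 0x0, length 0x2000} needs aligned_nrpages() of
 * 2 + 2 == 4 VT-d pages of iova space, even though only 0x3000 bytes
 * are mapped.
 */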
2931
2932 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2933 {
2934         return !dma_addr;
2935 }
2936
2937 struct dma_map_ops intel_dma_ops = {
2938         .alloc_coherent = intel_alloc_coherent,
2939         .free_coherent = intel_free_coherent,
2940         .map_sg = intel_map_sg,
2941         .unmap_sg = intel_unmap_sg,
2942         .map_page = intel_map_page,
2943         .unmap_page = intel_unmap_page,
2944         .mapping_error = intel_mapping_error,
2945 };
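/*
 * Illustrative only: drivers never call these ops directly; they reach
 * them through the generic DMA API once this dma_map_ops structure is
 * installed, e.g.:
 *
 *      dma_addr_t handle = dma_map_page(&pdev->dev, page, 0, PAGE_SIZE,
 *                                       DMA_TO_DEVICE);
 *      ...
 *      dma_unmap_page(&pdev->dev, handle, PAGE_SIZE, DMA_TO_DEVICE);
 *
 * which end up in intel_map_page()/intel_unmap_page() above.
 */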
2946
2947 static inline int iommu_domain_cache_init(void)
2948 {
2949         int ret = 0;
2950
2951         iommu_domain_cache = kmem_cache_create("iommu_domain",
2952                                          sizeof(struct dmar_domain),
2953                                          0,
2954                                          SLAB_HWCACHE_ALIGN,
2956                                          NULL);
2957         if (!iommu_domain_cache) {
2958                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2959                 ret = -ENOMEM;
2960         }
2961
2962         return ret;
2963 }
2964
2965 static inline int iommu_devinfo_cache_init(void)
2966 {
2967         int ret = 0;
2968
2969         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2970                                          sizeof(struct device_domain_info),
2971                                          0,
2972                                          SLAB_HWCACHE_ALIGN,
2973                                          NULL);
2974         if (!iommu_devinfo_cache) {
2975                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2976                 ret = -ENOMEM;
2977         }
2978
2979         return ret;
2980 }
2981
2982 static inline int iommu_iova_cache_init(void)
2983 {
2984         int ret = 0;
2985
2986         iommu_iova_cache = kmem_cache_create("iommu_iova",
2987                                          sizeof(struct iova),
2988                                          0,
2989                                          SLAB_HWCACHE_ALIGN,
2990                                          NULL);
2991         if (!iommu_iova_cache) {
2992                 printk(KERN_ERR "Couldn't create iova cache\n");
2993                 ret = -ENOMEM;
2994         }
2995
2996         return ret;
2997 }
2998
2999 static int __init iommu_init_mempool(void)
3000 {
3001         int ret;
3002         ret = iommu_iova_cache_init();
3003         if (ret)
3004                 return ret;
3005
3006         ret = iommu_domain_cache_init();
3007         if (ret)
3008                 goto domain_error;
3009
3010         ret = iommu_devinfo_cache_init();
3011         if (!ret)
3012                 return ret;
3013
3014         kmem_cache_destroy(iommu_domain_cache);
3015 domain_error:
3016         kmem_cache_destroy(iommu_iova_cache);
3017
3018         return -ENOMEM;
3019 }
3020
3021 static void __init iommu_exit_mempool(void)
3022 {
3023         kmem_cache_destroy(iommu_devinfo_cache);
3024         kmem_cache_destroy(iommu_domain_cache);
3025         kmem_cache_destroy(iommu_iova_cache);
3026
3027 }
3028
3029 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3030 {
3031         struct dmar_drhd_unit *drhd;
3032         u32 vtbar;
3033         int rc;
3034
3035         /* We know that this device on this chipset has its own IOMMU.
3036          * If we find it under a different IOMMU, then the BIOS is lying
3037          * to us. Hope that the IOMMU for this device is actually
3038          * disabled, and it needs no translation...
3039          */
3040         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3041         if (rc) {
3042                 /* "can't" happen */
3043                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3044                 return;
3045         }
3046         vtbar &= 0xffff0000;
3047
3048         /* we know that this iommu should be at offset 0xa000 from vtbar */
3049         drhd = dmar_find_matched_drhd_unit(pdev);
3050         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3051                             TAINT_FIRMWARE_WORKAROUND,
3052                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3053                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3054 }
3055 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3056
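/*
 * Mark DMAR units that will never need to translate anything as ignored:
 * units whose device scope contains no present PCI device, and, when
 * dmar_map_gfx is clear, units that cover only graphics devices.  Devices
 * behind a gfx-only unit get the dummy archdata cookie so DMA mapping
 * bypasses the IOMMU for them.
 */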
3057 static void __init init_no_remapping_devices(void)
3058 {
3059         struct dmar_drhd_unit *drhd;
3060
3061         for_each_drhd_unit(drhd) {
3062                 if (!drhd->include_all) {
3063                         int i;
3064                         for (i = 0; i < drhd->devices_cnt; i++)
3065                                 if (drhd->devices[i] != NULL)
3066                                         break;
3067                         /* ignore DMAR unit if no pci devices exist */
3068                         if (i == drhd->devices_cnt)
3069                                 drhd->ignored = 1;
3070                 }
3071         }
3072
3073         if (dmar_map_gfx)
3074                 return;
3075
3076         for_each_drhd_unit(drhd) {
3077                 int i;
3078                 if (drhd->ignored || drhd->include_all)
3079                         continue;
3080
3081                 for (i = 0; i < drhd->devices_cnt; i++)
3082                         if (drhd->devices[i] &&
3083                                 !IS_GFX_DEVICE(drhd->devices[i]))
3084                                 break;
3085
3086                 if (i < drhd->devices_cnt)
3087                         continue;
3088
3089                 /* bypass IOMMU if it is just for gfx devices */
3090                 drhd->ignored = 1;
3091                 for (i = 0; i < drhd->devices_cnt; i++) {
3092                         if (!drhd->devices[i])
3093                                 continue;
3094                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3095                 }
3096         }
3097 }
3098
3099 #ifdef CONFIG_SUSPEND
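/*
 * Bring the hardware back up on resume: restart queued invalidation where
 * it was in use, reprogram the root entry, issue global context and IOTLB
 * flushes and re-enable translation on every active IOMMU.
 */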
3100 static int init_iommu_hw(void)
3101 {
3102         struct dmar_drhd_unit *drhd;
3103         struct intel_iommu *iommu = NULL;
3104
3105         for_each_active_iommu(iommu, drhd)
3106                 if (iommu->qi)
3107                         dmar_reenable_qi(iommu);
3108
3109         for_each_active_iommu(iommu, drhd) {
3110                 iommu_flush_write_buffer(iommu);
3111
3112                 iommu_set_root_entry(iommu);
3113
3114                 iommu->flush.flush_context(iommu, 0, 0, 0,
3115                                            DMA_CCMD_GLOBAL_INVL);
3116                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3117                                          DMA_TLB_GLOBAL_FLUSH);
3118                 iommu_enable_translation(iommu);
3119                 iommu_disable_protect_mem_regions(iommu);
3120         }
3121
3122         return 0;
3123 }
3124
3125 static void iommu_flush_all(void)
3126 {
3127         struct dmar_drhd_unit *drhd;
3128         struct intel_iommu *iommu;
3129
3130         for_each_active_iommu(iommu, drhd) {
3131                 iommu->flush.flush_context(iommu, 0, 0, 0,
3132                                            DMA_CCMD_GLOBAL_INVL);
3133                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3134                                          DMA_TLB_GLOBAL_FLUSH);
3135         }
3136 }
3137
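/*
 * Save the fault-event registers of every active IOMMU and turn
 * translation off before the system suspends; iommu_resume() restores
 * this state.
 */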
3138 static int iommu_suspend(void)
3139 {
3140         struct dmar_drhd_unit *drhd;
3141         struct intel_iommu *iommu = NULL;
3142         unsigned long flag;
3143
3144         for_each_active_iommu(iommu, drhd) {
3145                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3146                                                  GFP_ATOMIC);
3147                 if (!iommu->iommu_state)
3148                         goto nomem;
3149         }
3150
3151         iommu_flush_all();
3152
3153         for_each_active_iommu(iommu, drhd) {
3154                 iommu_disable_translation(iommu);
3155
3156                 spin_lock_irqsave(&iommu->register_lock, flag);
3157
3158                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3159                         readl(iommu->reg + DMAR_FECTL_REG);
3160                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3161                         readl(iommu->reg + DMAR_FEDATA_REG);
3162                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3163                         readl(iommu->reg + DMAR_FEADDR_REG);
3164                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3165                         readl(iommu->reg + DMAR_FEUADDR_REG);
3166
3167                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3168         }
3169         return 0;
3170
3171 nomem:
3172         for_each_active_iommu(iommu, drhd)
3173                 kfree(iommu->iommu_state);
3174
3175         return -ENOMEM;
3176 }
3177
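/*
 * Counterpart of iommu_suspend(): reinitialise the hardware via
 * init_iommu_hw() and write back the saved fault-event registers.
 */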
3178 static void iommu_resume(void)
3179 {
3180         struct dmar_drhd_unit *drhd;
3181         struct intel_iommu *iommu = NULL;
3182         unsigned long flag;
3183
3184         if (init_iommu_hw()) {
3185                 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3186                 return;
3187         }
3188
3189         for_each_active_iommu(iommu, drhd) {
3190
3191                 spin_lock_irqsave(&iommu->register_lock, flag);
3192
3193                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3194                         iommu->reg + DMAR_FECTL_REG);
3195                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3196                         iommu->reg + DMAR_FEDATA_REG);
3197                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3198                         iommu->reg + DMAR_FEADDR_REG);
3199                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3200                         iommu->reg + DMAR_FEUADDR_REG);
3201
3202                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3203         }
3204
3205         for_each_active_iommu(iommu, drhd)
3206                 kfree(iommu->iommu_state);
3207 }
3208
3209 static struct syscore_ops iommu_syscore_ops = {
3210         .resume         = iommu_resume,
3211         .suspend        = iommu_suspend,
3212 };
3213
3214 static void __init init_iommu_pm_ops(void)
3215 {
3216         register_syscore_ops(&iommu_syscore_ops);
3217 }
3218
3219 #else
3220 static inline void init_iommu_pm_ops(void) {}
3221 #endif  /* CONFIG_SUSPEND */
3222
3223 /*
3224  * Here we only respond to the driver-unbind action for a device.
3225  *
3226  * A newly added device is not attached to its DMAR domain here yet; that
3227  * happens when the device is first mapped to an iova.
3228  */
3229 static int device_notifier(struct notifier_block *nb,
3230                                   unsigned long action, void *data)
3231 {
3232         struct device *dev = data;
3233         struct pci_dev *pdev = to_pci_dev(dev);
3234         struct dmar_domain *domain;
3235
3236         if (iommu_no_mapping(dev))
3237                 return 0;
3238
3239         domain = find_domain(pdev);
3240         if (!domain)
3241                 return 0;
3242
3243         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through)
3244                 domain_remove_one_dev_info(domain, pdev);
3245
3246         return 0;
3247 }
3248
3249 static struct notifier_block device_nb = {
3250         .notifier_call = device_notifier,
3251 };
3252
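/*
 * Main VT-d initialisation: parse the DMAR table and device scopes, set up
 * the memory pools and per-IOMMU state, install intel_dma_ops as the DMA
 * API backend and register the driver with the generic IOMMU layer.
 */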
3253 int __init intel_iommu_init(void)
3254 {
3255         int ret = 0;
3256         int force_on = 0;
3257
3258         /* VT-d is required for a TXT/tboot launch, so enforce that */
3259         force_on = tboot_force_iommu();
3260
3261         if (dmar_table_init()) {
3262                 if (force_on)
3263                         panic("tboot: Failed to initialize DMAR table\n");
3264                 return  -ENODEV;
3265         }
3266
3267         if (dmar_dev_scope_init()) {
3268                 if (force_on)
3269                         panic("tboot: Failed to initialize DMAR device scope\n");
3270                 return  -ENODEV;
3271         }
3272
3273         /*
3274          * Check the need for DMA-remapping initialization now.
3275          * Above initialization will also be used by Interrupt-remapping.
3276          */
3277         if (no_iommu || dmar_disabled)
3278                 return -ENODEV;
3279
3280         iommu_init_mempool();
3281         dmar_init_reserved_ranges();
3282
3283         init_no_remapping_devices();
3284
3285         ret = init_dmars();
3286         if (ret) {
3287                 if (force_on)
3288                         panic("tboot: Failed to initialize DMARs\n");
3289                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3290                 put_iova_domain(&reserved_iova_list);
3291                 iommu_exit_mempool();
3292                 return ret;
3293         }
3294         printk(KERN_INFO
3295         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3296
3297         init_timer(&unmap_timer);
3298 #ifdef CONFIG_SWIOTLB
3299         swiotlb = 0;
3300 #endif
3301         dma_ops = &intel_dma_ops;
3302
3303         init_iommu_pm_ops();
3304
3305         register_iommu(&intel_iommu_ops);
3306
3307         bus_register_notifier(&pci_bus_type, &device_nb);
3308
3309         return 0;
3310 }
3311
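/*
 * A device behind a PCIe-to-PCI bridge is context-mapped through the whole
 * bridge chain; when it is detached, also clear the context entries of the
 * intermediate bridges and of the bridge itself.
 */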
3312 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3313                                            struct pci_dev *pdev)
3314 {
3315         struct pci_dev *tmp, *parent;
3316
3317         if (!iommu || !pdev)
3318                 return;
3319
3320         /* dependent device detach */
3321         tmp = pci_find_upstream_pcie_bridge(pdev);
3322         /* Secondary interface's bus number and devfn 0 */
3323         if (tmp) {
3324                 parent = pdev->bus->self;
3325                 while (parent != tmp) {
3326                         iommu_detach_dev(iommu, parent->bus->number,
3327                                          parent->devfn);
3328                         parent = parent->bus->self;
3329                 }
3330                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3331                         iommu_detach_dev(iommu,
3332                                 tmp->subordinate->number, 0);
3333                 else /* this is a legacy PCI bridge */
3334                         iommu_detach_dev(iommu, tmp->bus->number,
3335                                          tmp->devfn);
3336         }
3337 }
3338
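/*
 * Detach a single device from a domain: drop its device_domain_info,
 * clear its context entry (and those of any dependent bridges) and, if it
 * was the last device in the domain behind this IOMMU, release the IOMMU
 * from the domain's iommu_bmp and recompute the domain capabilities.
 */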
3339 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3340                                           struct pci_dev *pdev)
3341 {
3342         struct device_domain_info *info;
3343         struct intel_iommu *iommu;
3344         unsigned long flags;
3345         int found = 0;
3346         struct list_head *entry, *tmp;
3347
3348         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3349                                 pdev->devfn);
3350         if (!iommu)
3351                 return;
3352
3353         spin_lock_irqsave(&device_domain_lock, flags);
3354         list_for_each_safe(entry, tmp, &domain->devices) {
3355                 info = list_entry(entry, struct device_domain_info, link);
3356                 /* No need to compare PCI domain; it has to be the same */
3357                 if (info->bus == pdev->bus->number &&
3358                     info->devfn == pdev->devfn) {
3359                         list_del(&info->link);
3360                         list_del(&info->global);
3361                         if (info->dev)
3362                                 info->dev->dev.archdata.iommu = NULL;
3363                         spin_unlock_irqrestore(&device_domain_lock, flags);
3364
3365                         iommu_disable_dev_iotlb(info);
3366                         iommu_detach_dev(iommu, info->bus, info->devfn);
3367                         iommu_detach_dependent_devices(iommu, pdev);
3368                         free_devinfo_mem(info);
3369
3370                         spin_lock_irqsave(&device_domain_lock, flags);
3371
3372                         if (found)
3373                                 break;
3374                         else
3375                                 continue;
3376                 }
3377
3378                 /* if there are no other devices under the same iommu
3379                  * owned by this domain, clear this iommu in iommu_bmp,
3380                  * update the iommu count and coherency
3381                  */
3382                 if (iommu == device_to_iommu(info->segment, info->bus,
3383                                             info->devfn))
3384                         found = 1;
3385         }
3386
3387         if (found == 0) {
3388                 unsigned long tmp_flags;
3389                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3390                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3391                 domain->iommu_count--;
3392                 domain_update_iommu_cap(domain);
3393                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3394         }
3395
3396         spin_unlock_irqrestore(&device_domain_lock, flags);
3397 }
3398
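/*
 * Detach every device from a virtual machine domain, releasing each IOMMU
 * from the domain's iommu_bmp as the last device behind it goes away.
 */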
3399 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3400 {
3401         struct device_domain_info *info;
3402         struct intel_iommu *iommu;
3403         unsigned long flags1, flags2;
3404
3405         spin_lock_irqsave(&device_domain_lock, flags1);
3406         while (!list_empty(&domain->devices)) {
3407                 info = list_entry(domain->devices.next,
3408                         struct device_domain_info, link);
3409                 list_del(&info->link);
3410                 list_del(&info->global);
3411                 if (info->dev)
3412                         info->dev->dev.archdata.iommu = NULL;
3413
3414                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3415
3416                 iommu_disable_dev_iotlb(info);
3417                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3418                 iommu_detach_dev(iommu, info->bus, info->devfn);
3419                 iommu_detach_dependent_devices(iommu, info->dev);
3420
3421                 /* clear this iommu in iommu_bmp, update iommu count
3422                  * and capabilities
3423                  */
3424                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3425                 if (test_and_clear_bit(iommu->seq_id,
3426                                        &domain->iommu_bmp)) {
3427                         domain->iommu_count--;
3428                         domain_update_iommu_cap(domain);
3429                 }
3430                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3431
3432                 free_devinfo_mem(info);
3433                 spin_lock_irqsave(&device_domain_lock, flags1);
3434         }
3435         spin_unlock_irqrestore(&device_domain_lock, flags1);
3436 }
3437
3438 /* domain id for virtual machine, it won't be set in context */
3439 static unsigned long vm_domid;
3440
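/*
 * Allocate a domain for the generic IOMMU API.  It is identified by
 * vm_domid rather than by a hardware domain id and is not yet bound to
 * any IOMMU.
 */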
3441 static struct dmar_domain *iommu_alloc_vm_domain(void)
3442 {
3443         struct dmar_domain *domain;
3444
3445         domain = alloc_domain_mem();
3446         if (!domain)
3447                 return NULL;
3448
3449         domain->id = vm_domid++;
3450         domain->nid = -1;
3451         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3452         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3453
3454         return domain;
3455 }
3456
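/*
 * Initialise a virtual machine domain: set up its iova allocator, reserve
 * the special ranges, derive the adjusted address width from the requested
 * guest width and allocate the top-level page directory.
 */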
3457 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3458 {
3459         int adjust_width;
3460
3461         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3462         spin_lock_init(&domain->iommu_lock);
3463
3464         domain_reserve_special_ranges(domain);
3465
3466         /* calculate AGAW */
3467         domain->gaw = guest_width;
3468         adjust_width = guestwidth_to_adjustwidth(guest_width);
3469         domain->agaw = width_to_agaw(adjust_width);
3470
3471         INIT_LIST_HEAD(&domain->devices);
3472
3473         domain->iommu_count = 0;
3474         domain->iommu_coherency = 0;
3475         domain->iommu_snooping = 0;
3476         domain->max_addr = 0;
3477         domain->nid = -1;
3478
3479         /* always allocate the top pgd */
3480         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3481         if (!domain->pgd)
3482                 return -ENOMEM;
3483         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3484         return 0;
3485 }
3486
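/*
 * Release the hardware domain ids this VM domain occupies on every IOMMU
 * it was attached to.
 */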
3487 static void iommu_free_vm_domain(struct dmar_domain *domain)
3488 {
3489         unsigned long flags;
3490         struct dmar_drhd_unit *drhd;
3491         struct intel_iommu *iommu;
3492         unsigned long i;
3493         unsigned long ndomains;
3494
3495         for_each_drhd_unit(drhd) {
3496                 if (drhd->ignored)
3497                         continue;
3498                 iommu = drhd->iommu;
3499
3500                 ndomains = cap_ndoms(iommu->cap);
3501                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3502                         if (iommu->domains[i] == domain) {
3503                                 spin_lock_irqsave(&iommu->lock, flags);
3504                                 clear_bit(i, iommu->domain_ids);
3505                                 iommu->domains[i] = NULL;
3506                                 spin_unlock_irqrestore(&iommu->lock, flags);
3507                                 break;
3508                         }
3509                 }
3510         }
3511 }
3512
3513 static void vm_domain_exit(struct dmar_domain *domain)
3514 {
3515         /* Domain 0 is reserved, so don't process it */
3516         if (!domain)
3517                 return;
3518
3519         vm_domain_remove_all_dev_info(domain);
3520         /* destroy iovas */
3521         put_iova_domain(&domain->iovad);
3522
3523         /* clear ptes */
3524         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3525
3526         /* free page tables */
3527         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3528
3529         iommu_free_vm_domain(domain);
3530         free_domain_mem(domain);
3531 }
3532
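/*
 * Generic IOMMU API (struct iommu_ops) callbacks, registered through
 * register_iommu() in intel_iommu_init().
 */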
3533 static int intel_iommu_domain_init(struct iommu_domain *domain)
3534 {
3535         struct dmar_domain *dmar_domain;
3536
3537         dmar_domain = iommu_alloc_vm_domain();
3538         if (!dmar_domain) {
3539                 printk(KERN_ERR
3540                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3541                 return -ENOMEM;
3542         }
3543         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3544                 printk(KERN_ERR
3545                         "intel_iommu_domain_init() failed\n");
3546                 vm_domain_exit(dmar_domain);
3547                 return -ENOMEM;
3548         }
3549         domain->priv = dmar_domain;
3550
3551         return 0;
3552 }
3553
3554 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3555 {
3556         struct dmar_domain *dmar_domain = domain->priv;
3557
3558         domain->priv = NULL;
3559         vm_domain_exit(dmar_domain);
3560 }
3561
3562 static int intel_iommu_attach_device(struct iommu_domain *domain,
3563                                      struct device *dev)
3564 {
3565         struct dmar_domain *dmar_domain = domain->priv;
3566         struct pci_dev *pdev = to_pci_dev(dev);
3567         struct intel_iommu *iommu;
3568         int addr_width;
3569
3570         /* normally pdev is not mapped */
3571         if (unlikely(domain_context_mapped(pdev))) {
3572                 struct dmar_domain *old_domain;
3573
3574                 old_domain = find_domain(pdev);
3575                 if (old_domain) {
3576                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3577                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3578                                 domain_remove_one_dev_info(old_domain, pdev);
3579                         else
3580                                 domain_remove_dev_info(old_domain);
3581                 }
3582         }
3583
3584         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3585                                 pdev->devfn);
3586         if (!iommu)
3587                 return -ENODEV;
3588
3589         /* check if this iommu agaw is sufficient for max mapped address */
3590         addr_width = agaw_to_width(iommu->agaw);
3591         if (addr_width > cap_mgaw(iommu->cap))
3592                 addr_width = cap_mgaw(iommu->cap);
3593
3594         if (dmar_domain->max_addr > (1LL << addr_width)) {
3595                 printk(KERN_ERR "%s: iommu width (%d) is not "
3596                        "sufficient for the mapped address (%llx)\n",
3597                        __func__, addr_width, dmar_domain->max_addr);
3598                 return -EFAULT;
3599         }
3600         dmar_domain->gaw = addr_width;
3601
3602         /*
3603          * Knock out extra levels of page tables if necessary
3604          */
3605         while (iommu->agaw < dmar_domain->agaw) {
3606                 struct dma_pte *pte;
3607
3608                 pte = dmar_domain->pgd;
3609                 if (dma_pte_present(pte)) {
3610                         free_pgtable_page(dmar_domain->pgd);
3611                         dmar_domain->pgd = (struct dma_pte *)
3612                                 phys_to_virt(dma_pte_addr(pte));
3613                 }
3614                 dmar_domain->agaw--;
3615         }
3616
3617         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3618 }
3619
3620 static void intel_iommu_detach_device(struct iommu_domain *domain,
3621                                       struct device *dev)
3622 {
3623         struct dmar_domain *dmar_domain = domain->priv;
3624         struct pci_dev *pdev = to_pci_dev(dev);
3625
3626         domain_remove_one_dev_info(dmar_domain, pdev);
3627 }
3628
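/*
 * Map PAGE_SIZE << gfp_order bytes of physically contiguous memory at the
 * given iova, growing the domain's max_addr (and checking it against the
 * domain's address width) as needed.
 */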
3629 static int intel_iommu_map(struct iommu_domain *domain,
3630                            unsigned long iova, phys_addr_t hpa,
3631                            int gfp_order, int iommu_prot)
3632 {
3633         struct dmar_domain *dmar_domain = domain->priv;
3634         u64 max_addr;
3635         int prot = 0;
3636         size_t size;
3637         int ret;
3638
3639         if (iommu_prot & IOMMU_READ)
3640                 prot |= DMA_PTE_READ;
3641         if (iommu_prot & IOMMU_WRITE)
3642                 prot |= DMA_PTE_WRITE;
3643         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3644                 prot |= DMA_PTE_SNP;
3645
3646         size     = PAGE_SIZE << gfp_order;
3647         max_addr = iova + size;
3648         if (dmar_domain->max_addr < max_addr) {
3649                 u64 end;
3650
3651                 /* check if minimum agaw is sufficient for mapped address */
3652                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3653                 if (end < max_addr) {
3654                         printk(KERN_ERR "%s: iommu width (%d) is not "
3655                                "sufficient for the mapped address (%llx)\n",
3656                                __func__, dmar_domain->gaw, max_addr);
3657                         return -EFAULT;
3658                 }
3659                 dmar_domain->max_addr = max_addr;
3660         }
3661         /* Round up size to next multiple of PAGE_SIZE, if it and
3662            the low bits of hpa would take us onto the next page */
3663         size = aligned_nrpages(hpa, size);
3664         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3665                                  hpa >> VTD_PAGE_SHIFT, size, prot);
3666         return ret;
3667 }
3668
3669 static int intel_iommu_unmap(struct iommu_domain *domain,
3670                              unsigned long iova, int gfp_order)
3671 {
3672         struct dmar_domain *dmar_domain = domain->priv;
3673         size_t size = PAGE_SIZE << gfp_order;
3674
3675         dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3676                             (iova + size - 1) >> VTD_PAGE_SHIFT);
3677
3678         if (dmar_domain->max_addr == iova + size)
3679                 dmar_domain->max_addr = iova;
3680
3681         return gfp_order;
3682 }
3683
3684 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3685                                             unsigned long iova)
3686 {
3687         struct dmar_domain *dmar_domain = domain->priv;
3688         struct dma_pte *pte;
3689         u64 phys = 0;
3690
3691         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3692         if (pte)
3693                 phys = dma_pte_addr(pte);
3694
3695         return phys;
3696 }
3697
3698 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3699                                       unsigned long cap)
3700 {
3701         struct dmar_domain *dmar_domain = domain->priv;
3702
3703         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3704                 return dmar_domain->iommu_snooping;
3705         if (cap == IOMMU_CAP_INTR_REMAP)
3706                 return intr_remapping_enabled;
3707
3708         return 0;
3709 }
3710
3711 static struct iommu_ops intel_iommu_ops = {
3712         .domain_init    = intel_iommu_domain_init,
3713         .domain_destroy = intel_iommu_domain_destroy,
3714         .attach_dev     = intel_iommu_attach_device,
3715         .detach_dev     = intel_iommu_detach_device,
3716         .map            = intel_iommu_map,
3717         .unmap          = intel_iommu_unmap,
3718         .iova_to_phys   = intel_iommu_iova_to_phys,
3719         .domain_has_cap = intel_iommu_domain_has_cap,
3720 };
3721
3722 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3723 {
3724         /*
3725          * Mobile 4 Series Chipset neglects to set RWBF capability,
3726          * but needs it:
3727          */
3728         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3729         rwbf_quirk = 1;
3730
3731         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
3732         if (dev->revision == 0x07) {
3733                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
3734                 dmar_map_gfx = 0;
3735         }
3736 }
3737
3738 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3739
3740 #define GGC 0x52
3741 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
3742 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
3743 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
3744 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
3745 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
3746 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
3747 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
3748 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
3749
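/*
 * If the GGC register shows that the BIOS allocated no VT-enabled graphics
 * stolen memory (no shadow GTT), don't attempt to translate graphics DMA:
 * disable the IOMMU for graphics on these chipsets.
 */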
3750 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
3751 {
3752         unsigned short ggc;
3753
3754         if (pci_read_config_word(dev, GGC, &ggc))
3755                 return;
3756
3757         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
3758                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
3759                 dmar_map_gfx = 0;
3760         }
3761 }
3762 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
3763 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
3764 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
3765 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
3766
3767 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3768    ISOCH DMAR unit for the Azalia sound device, but not give it any
3769    TLB entries, which causes it to deadlock. Check for that.  We do
3770    this in a function called from init_dmars(), instead of in a PCI
3771    quirk, because we don't want to print the obnoxious "BIOS broken"
3772    message if VT-d is actually disabled.
3773 */
3774 static void __init check_tylersburg_isoch(void)
3775 {
3776         struct pci_dev *pdev;
3777         uint32_t vtisochctrl;
3778
3779         /* If there's no Azalia in the system anyway, forget it. */
3780         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3781         if (!pdev)
3782                 return;
3783         pci_dev_put(pdev);
3784
3785         /* System Management Registers. Might be hidden, in which case
3786            we can't do the sanity check. But that's OK, because the
3787            known-broken BIOSes _don't_ actually hide it, so far. */
3788         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3789         if (!pdev)
3790                 return;
3791
3792         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3793                 pci_dev_put(pdev);
3794                 return;
3795         }
3796
3797         pci_dev_put(pdev);
3798
3799         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3800         if (vtisochctrl & 1)
3801                 return;
3802
3803         /* Drop all bits other than the number of TLB entries */
3804         vtisochctrl &= 0x1c;
3805
3806         /* If we have the recommended number of TLB entries (16), fine. */
3807         if (vtisochctrl == 0x10)
3808                 return;
3809
3810         /* Zero TLB entries? You get to ride the short bus to school. */
3811         if (!vtisochctrl) {
3812                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3813                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3814                      dmi_get_system_info(DMI_BIOS_VENDOR),
3815                      dmi_get_system_info(DMI_BIOS_VERSION),
3816                      dmi_get_system_info(DMI_PRODUCT_VERSION));
3817                 iommu_identity_mapping |= IDENTMAP_AZALIA;
3818                 return;
3819         }
3820
3821         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3822                vtisochctrl);
3823 }