drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
45
46 #define ROOT_SIZE               VTD_PAGE_SIZE
47 #define CONTEXT_SIZE            VTD_PAGE_SIZE
48
49 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
50 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
51 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
52
53 #define IOAPIC_RANGE_START      (0xfee00000)
54 #define IOAPIC_RANGE_END        (0xfeefffff)
55 #define IOVA_START_ADDR         (0x1000)
56
57 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
58
59 #define MAX_AGAW_WIDTH 64
60
61 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
62 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
63
64 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
65    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
66 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
67                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
68 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
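/*
 * Worked example for the macros above (assuming the usual 4KiB VT-d page,
 * VTD_PAGE_SHIFT == 12): for gaw == 48, __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1
 * and DOMAIN_MAX_ADDR(48) == 2^48 - 4096, the address of the last page.  On a
 * 32-bit build DOMAIN_MAX_PFN(48) is clamped to ULONG_MAX (0xffffffff) so that
 * PFNs always fit in an unsigned long.
 */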
69
70 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
71 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
72 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
73
74 /* page table handling */
75 #define LEVEL_STRIDE            (9)
76 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
77
78 static inline int agaw_to_level(int agaw)
79 {
80         return agaw + 2;
81 }
82
83 static inline int agaw_to_width(int agaw)
84 {
85         return 30 + agaw * LEVEL_STRIDE;
86 }
87
88 static inline int width_to_agaw(int width)
89 {
90         return (width - 30) / LEVEL_STRIDE;
91 }
92
93 static inline unsigned int level_to_offset_bits(int level)
94 {
95         return (level - 1) * LEVEL_STRIDE;
96 }
97
98 static inline int pfn_level_offset(unsigned long pfn, int level)
99 {
100         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
101 }
102
103 static inline unsigned long level_mask(int level)
104 {
105         return -1UL << level_to_offset_bits(level);
106 }
107
108 static inline unsigned long level_size(int level)
109 {
110         return 1UL << level_to_offset_bits(level);
111 }
112
113 static inline unsigned long align_to_level(unsigned long pfn, int level)
114 {
115         return (pfn + level_size(level) - 1) & level_mask(level);
116 }
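/*
 * Illustrative numbers for the helpers above: with agaw == 2, agaw_to_level(2)
 * == 4 and agaw_to_width(2) == 48, i.e. a 4-level table covering a 48-bit
 * guest address space.  level_to_offset_bits() gives 0/9/18/27 for levels 1-4,
 * so pfn_level_offset(pfn, 4) extracts bits 27..35 of the pfn, level_size(2)
 * == 512 pages, and align_to_level(pfn, 2) rounds pfn up to the next 512-page
 * boundary.
 */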
117
118 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
119    are never going to work. */
120 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
121 {
122         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
123 }
124
125 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
126 {
127         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
128 }
129 static inline unsigned long page_to_dma_pfn(struct page *pg)
130 {
131         return mm_to_dma_pfn(page_to_pfn(pg));
132 }
133 static inline unsigned long virt_to_dma_pfn(void *p)
134 {
135         return page_to_dma_pfn(virt_to_page(p));
136 }
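/*
 * Illustrative note: on x86 PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so the shifts
 * above are by zero and MM and DMA pfns are identical.  On a configuration
 * with larger MM pages (say 16KiB, PAGE_SHIFT == 14) one MM pfn would cover
 * four consecutive VT-d pfns, which is why mm_to_dma_pfn() shifts left.
 */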
137
138 /* global iommu list, set NULL for ignored DMAR units */
139 static struct intel_iommu **g_iommus;
140
141 static void __init check_tylersburg_isoch(void);
142 static int rwbf_quirk;
143
144 /*
145  * 0: Present
146  * 1-11: Reserved
147  * 12-63: Context Ptr (12 - (haw-1))
148  * 64-127: Reserved
149  */
150 struct root_entry {
151         u64     val;
152         u64     rsvd1;
153 };
154 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
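/* Illustrative note: a root entry is two u64s (16 bytes), so with a 4KiB root
 * table ROOT_ENTRY_NR works out to 256, one entry per PCI bus number. */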
155 static inline bool root_present(struct root_entry *root)
156 {
157         return (root->val & 1);
158 }
159 static inline void set_root_present(struct root_entry *root)
160 {
161         root->val |= 1;
162 }
163 static inline void set_root_value(struct root_entry *root, unsigned long value)
164 {
165         root->val |= value & VTD_PAGE_MASK;
166 }
167
168 static inline struct context_entry *
169 get_context_addr_from_root(struct root_entry *root)
170 {
171         return (struct context_entry *)
172                 (root_present(root)?phys_to_virt(
173                 root->val & VTD_PAGE_MASK) :
174                 NULL);
175 }
176
177 /*
178  * low 64 bits:
179  * 0: present
180  * 1: fault processing disable
181  * 2-3: translation type
182  * 12-63: address space root
183  * high 64 bits:
184  * 0-2: address width
185  * 3-6: available
186  * 8-23: domain id
187  */
188 struct context_entry {
189         u64 lo;
190         u64 hi;
191 };
192
193 static inline bool context_present(struct context_entry *context)
194 {
195         return (context->lo & 1);
196 }
197 static inline void context_set_present(struct context_entry *context)
198 {
199         context->lo |= 1;
200 }
201
202 static inline void context_set_fault_enable(struct context_entry *context)
203 {
204         context->lo &= (((u64)-1) << 2) | 1;
205 }
206
207 static inline void context_set_translation_type(struct context_entry *context,
208                                                 unsigned long value)
209 {
210         context->lo &= (((u64)-1) << 4) | 3;
211         context->lo |= (value & 3) << 2;
212 }
213
214 static inline void context_set_address_root(struct context_entry *context,
215                                             unsigned long value)
216 {
217         context->lo |= value & VTD_PAGE_MASK;
218 }
219
220 static inline void context_set_address_width(struct context_entry *context,
221                                              unsigned long value)
222 {
223         context->hi |= value & 7;
224 }
225
226 static inline void context_set_domain_id(struct context_entry *context,
227                                          unsigned long value)
228 {
229         context->hi |= (value & ((1 << 16) - 1)) << 8;
230 }
231
232 static inline void context_clear_entry(struct context_entry *context)
233 {
234         context->lo = 0;
235         context->hi = 0;
236 }
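/*
 * Sketch of how the setters above compose for an ordinary multi-level entry;
 * this mirrors the sequence used by domain_context_mapping_one() further down:
 *
 *      context_set_domain_id(context, domain_id);
 *      context_set_address_root(context, virt_to_phys(pgd));
 *      context_set_address_width(context, agaw);
 *      context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *      context_set_fault_enable(context);
 *      context_set_present(context);
 */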
237
238 /*
239  * 0: readable
240  * 1: writable
241  * 2-6: reserved
242  * 7: super page
243  * 8-10: available
244  * 11: snoop behavior
245  * 12-63: Host physical address
246  */
247 struct dma_pte {
248         u64 val;
249 };
250
251 static inline void dma_clear_pte(struct dma_pte *pte)
252 {
253         pte->val = 0;
254 }
255
256 static inline void dma_set_pte_readable(struct dma_pte *pte)
257 {
258         pte->val |= DMA_PTE_READ;
259 }
260
261 static inline void dma_set_pte_writable(struct dma_pte *pte)
262 {
263         pte->val |= DMA_PTE_WRITE;
264 }
265
266 static inline void dma_set_pte_snp(struct dma_pte *pte)
267 {
268         pte->val |= DMA_PTE_SNP;
269 }
270
271 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
272 {
273         pte->val = (pte->val & ~3) | (prot & 3);
274 }
275
276 static inline u64 dma_pte_addr(struct dma_pte *pte)
277 {
278 #ifdef CONFIG_64BIT
279         return pte->val & VTD_PAGE_MASK;
280 #else
281         /* Must have a full atomic 64-bit read */
282         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
283 #endif
284 }
285
286 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
287 {
288         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
289 }
290
291 static inline bool dma_pte_present(struct dma_pte *pte)
292 {
293         return (pte->val & 3) != 0;
294 }
295
296 static inline int first_pte_in_page(struct dma_pte *pte)
297 {
298         return !((unsigned long)pte & ~VTD_PAGE_MASK);
299 }
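/*
 * Illustrative note: a dma_pte is 8 bytes, so one 4KiB page-table page holds
 * 512 entries (2^LEVEL_STRIDE).  first_pte_in_page() is true exactly when the
 * pointer is 4KiB-aligned, i.e. at slot 0 of its page-table page; the clear
 * and free loops below use it to stop at page-table page boundaries.
 */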
300
301 /*
302  * This domain is a statically identity mapping domain.
303  *      1. This domain creates a static 1:1 mapping to all usable memory.
304  *      2. It maps to each iommu if successful.
305  *      3. Each iommu maps to this domain if successful.
306  */
307 static struct dmar_domain *si_domain;
308 static int hw_pass_through = 1;
309
310 /* devices under the same p2p bridge are owned in one domain */
311 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
312
313 /* domain represents a virtual machine; more than one device
314  * across iommus may be owned by one domain, e.g. a kvm guest.
315  */
316 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
317
318 /* si_domain contains multiple devices */
319 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
320
321 struct dmar_domain {
322         int     id;                     /* domain id */
323         int     nid;                    /* node id */
324         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
325
326         struct list_head devices;       /* all devices' list */
327         struct iova_domain iovad;       /* iova's that belong to this domain */
328
329         struct dma_pte  *pgd;           /* virtual address */
330         int             gaw;            /* max guest address width */
331
332         /* adjusted guest address width, 0 is level 2 30-bit */
333         int             agaw;
334
335         int             flags;          /* flags to find out type of domain */
336
337         int             iommu_coherency;/* indicate coherency of iommu access */
338         int             iommu_snooping; /* indicate snooping control feature*/
339         int             iommu_count;    /* reference count of iommu */
340         spinlock_t      iommu_lock;     /* protect iommu set in domain */
341         u64             max_addr;       /* maximum mapped address */
342 };
343
344 /* PCI domain-device relationship */
345 struct device_domain_info {
346         struct list_head link;  /* link to domain siblings */
347         struct list_head global; /* link to global list */
348         int segment;            /* PCI domain */
349         u8 bus;                 /* PCI bus number */
350         u8 devfn;               /* PCI devfn number */
351         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
352         struct intel_iommu *iommu; /* IOMMU used by this device */
353         struct dmar_domain *domain; /* pointer to domain */
354 };
355
356 static void flush_unmaps_timeout(unsigned long data);
357
358 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
359
360 #define HIGH_WATER_MARK 250
361 struct deferred_flush_tables {
362         int next;
363         struct iova *iova[HIGH_WATER_MARK];
364         struct dmar_domain *domain[HIGH_WATER_MARK];
365 };
366
367 static struct deferred_flush_tables *deferred_flush;
368
369 /* number of iommus; sets the width of the bitmaps used to index g_iommus */
370 static int g_num_of_iommus;
371
372 static DEFINE_SPINLOCK(async_umap_flush_lock);
373 static LIST_HEAD(unmaps_to_do);
374
375 static int timer_on;
376 static long list_size;
377
378 static void domain_remove_dev_info(struct dmar_domain *domain);
379
380 #ifdef CONFIG_DMAR_DEFAULT_ON
381 int dmar_disabled = 0;
382 #else
383 int dmar_disabled = 1;
384 #endif /*CONFIG_DMAR_DEFAULT_ON*/
385
386 static int dmar_map_gfx = 1;
387 static int dmar_forcedac;
388 static int intel_iommu_strict;
389
390 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
391 static DEFINE_SPINLOCK(device_domain_lock);
392 static LIST_HEAD(device_domain_list);
393
394 static struct iommu_ops intel_iommu_ops;
395
396 static int __init intel_iommu_setup(char *str)
397 {
398         if (!str)
399                 return -EINVAL;
400         while (*str) {
401                 if (!strncmp(str, "on", 2)) {
402                         dmar_disabled = 0;
403                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
404                 } else if (!strncmp(str, "off", 3)) {
405                         dmar_disabled = 1;
406                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
407                 } else if (!strncmp(str, "igfx_off", 8)) {
408                         dmar_map_gfx = 0;
409                         printk(KERN_INFO
410                                 "Intel-IOMMU: disable GFX device mapping\n");
411                 } else if (!strncmp(str, "forcedac", 8)) {
412                         printk(KERN_INFO
413                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
414                         dmar_forcedac = 1;
415                 } else if (!strncmp(str, "strict", 6)) {
416                         printk(KERN_INFO
417                                 "Intel-IOMMU: disable batched IOTLB flush\n");
418                         intel_iommu_strict = 1;
419                 }
420
421                 str += strcspn(str, ",");
422                 while (*str == ',')
423                         str++;
424         }
425         return 0;
426 }
427 __setup("intel_iommu=", intel_iommu_setup);
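/*
 * Example usage of the boot parameter parsed above; options are
 * comma-separated within a single intel_iommu= argument, e.g.:
 *
 *      intel_iommu=on,strict
 *      intel_iommu=igfx_off,forcedac
 */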
428
429 static struct kmem_cache *iommu_domain_cache;
430 static struct kmem_cache *iommu_devinfo_cache;
431 static struct kmem_cache *iommu_iova_cache;
432
433 static inline void *alloc_pgtable_page(int node)
434 {
435         struct page *page;
436         void *vaddr = NULL;
437
438         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
439         if (page)
440                 vaddr = page_address(page);
441         return vaddr;
442 }
443
444 static inline void free_pgtable_page(void *vaddr)
445 {
446         free_page((unsigned long)vaddr);
447 }
448
449 static inline void *alloc_domain_mem(void)
450 {
451         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
452 }
453
454 static void free_domain_mem(void *vaddr)
455 {
456         kmem_cache_free(iommu_domain_cache, vaddr);
457 }
458
459 static inline void * alloc_devinfo_mem(void)
460 {
461         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
462 }
463
464 static inline void free_devinfo_mem(void *vaddr)
465 {
466         kmem_cache_free(iommu_devinfo_cache, vaddr);
467 }
468
469 struct iova *alloc_iova_mem(void)
470 {
471         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
472 }
473
474 void free_iova_mem(struct iova *iova)
475 {
476         kmem_cache_free(iommu_iova_cache, iova);
477 }
478
479
480 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
481 {
482         unsigned long sagaw;
483         int agaw = -1;
484
485         sagaw = cap_sagaw(iommu->cap);
486         for (agaw = width_to_agaw(max_gaw);
487              agaw >= 0; agaw--) {
488                 if (test_bit(agaw, &sagaw))
489                         break;
490         }
491
492         return agaw;
493 }
494
495 /*
496  * Calculate max SAGAW for each iommu.
497  */
498 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
499 {
500         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
501 }
502
503 /*
504  * calculate agaw for each iommu.
505  * "SAGAW" may be different across iommus: use a default agaw, and
506  * fall back to a smaller supported agaw for iommus that don't support the default.
507  */
508 int iommu_calculate_agaw(struct intel_iommu *iommu)
509 {
510         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
511 }
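/*
 * Worked example: DEFAULT_DOMAIN_ADDRESS_WIDTH is 48, so the search starts at
 * width_to_agaw(48) == 2 (4-level tables).  Hardware whose SAGAW only
 * advertises 3-level support (bit 1) would end up with agaw == 1, i.e. a
 * 39-bit address width.
 */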
512
513 /* This function only returns a single iommu in a domain */
514 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
515 {
516         int iommu_id;
517
518         /* si_domain and vm domain should not get here. */
519         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
520         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
521
522         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
523         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
524                 return NULL;
525
526         return g_iommus[iommu_id];
527 }
528
529 static void domain_update_iommu_coherency(struct dmar_domain *domain)
530 {
531         int i;
532
533         domain->iommu_coherency = 1;
534
535         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
536                 if (!ecap_coherent(g_iommus[i]->ecap)) {
537                         domain->iommu_coherency = 0;
538                         break;
539                 }
540         }
541 }
542
543 static void domain_update_iommu_snooping(struct dmar_domain *domain)
544 {
545         int i;
546
547         domain->iommu_snooping = 1;
548
549         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
550                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
551                         domain->iommu_snooping = 0;
552                         break;
553                 }
554         }
555 }
556
557 /* Some capabilities may be different across iommus */
558 static void domain_update_iommu_cap(struct dmar_domain *domain)
559 {
560         domain_update_iommu_coherency(domain);
561         domain_update_iommu_snooping(domain);
562 }
563
564 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
565 {
566         struct dmar_drhd_unit *drhd = NULL;
567         int i;
568
569         for_each_drhd_unit(drhd) {
570                 if (drhd->ignored)
571                         continue;
572                 if (segment != drhd->segment)
573                         continue;
574
575                 for (i = 0; i < drhd->devices_cnt; i++) {
576                         if (drhd->devices[i] &&
577                             drhd->devices[i]->bus->number == bus &&
578                             drhd->devices[i]->devfn == devfn)
579                                 return drhd->iommu;
580                         if (drhd->devices[i] &&
581                             drhd->devices[i]->subordinate &&
582                             drhd->devices[i]->subordinate->number <= bus &&
583                             drhd->devices[i]->subordinate->subordinate >= bus)
584                                 return drhd->iommu;
585                 }
586
587                 if (drhd->include_all)
588                         return drhd->iommu;
589         }
590
591         return NULL;
592 }
593
594 static void domain_flush_cache(struct dmar_domain *domain,
595                                void *addr, int size)
596 {
597         if (!domain->iommu_coherency)
598                 clflush_cache_range(addr, size);
599 }
600
601 /* Gets context entry for a given bus and devfn */
602 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
603                 u8 bus, u8 devfn)
604 {
605         struct root_entry *root;
606         struct context_entry *context;
607         unsigned long phy_addr;
608         unsigned long flags;
609
610         spin_lock_irqsave(&iommu->lock, flags);
611         root = &iommu->root_entry[bus];
612         context = get_context_addr_from_root(root);
613         if (!context) {
614                 context = (struct context_entry *)
615                                 alloc_pgtable_page(iommu->node);
616                 if (!context) {
617                         spin_unlock_irqrestore(&iommu->lock, flags);
618                         return NULL;
619                 }
620                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
621                 phy_addr = virt_to_phys((void *)context);
622                 set_root_value(root, phy_addr);
623                 set_root_present(root);
624                 __iommu_flush_cache(iommu, root, sizeof(*root));
625         }
626         spin_unlock_irqrestore(&iommu->lock, flags);
627         return &context[devfn];
628 }
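/*
 * Illustrative note: root_entry[bus] (256 buses) points at one context-table
 * page; with 16-byte context entries a 4KiB page holds exactly 256 of them,
 * so &context[devfn] above indexes the entry for one device/function.
 */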
629
630 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
631 {
632         struct root_entry *root;
633         struct context_entry *context;
634         int ret;
635         unsigned long flags;
636
637         spin_lock_irqsave(&iommu->lock, flags);
638         root = &iommu->root_entry[bus];
639         context = get_context_addr_from_root(root);
640         if (!context) {
641                 ret = 0;
642                 goto out;
643         }
644         ret = context_present(&context[devfn]);
645 out:
646         spin_unlock_irqrestore(&iommu->lock, flags);
647         return ret;
648 }
649
650 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
651 {
652         struct root_entry *root;
653         struct context_entry *context;
654         unsigned long flags;
655
656         spin_lock_irqsave(&iommu->lock, flags);
657         root = &iommu->root_entry[bus];
658         context = get_context_addr_from_root(root);
659         if (context) {
660                 context_clear_entry(&context[devfn]);
661                 __iommu_flush_cache(iommu, &context[devfn], \
662                         sizeof(*context));
663         }
664         spin_unlock_irqrestore(&iommu->lock, flags);
665 }
666
667 static void free_context_table(struct intel_iommu *iommu)
668 {
669         struct root_entry *root;
670         int i;
671         unsigned long flags;
672         struct context_entry *context;
673
674         spin_lock_irqsave(&iommu->lock, flags);
675         if (!iommu->root_entry) {
676                 goto out;
677         }
678         for (i = 0; i < ROOT_ENTRY_NR; i++) {
679                 root = &iommu->root_entry[i];
680                 context = get_context_addr_from_root(root);
681                 if (context)
682                         free_pgtable_page(context);
683         }
684         free_pgtable_page(iommu->root_entry);
685         iommu->root_entry = NULL;
686 out:
687         spin_unlock_irqrestore(&iommu->lock, flags);
688 }
689
690 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
691                                       unsigned long pfn)
692 {
693         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
694         struct dma_pte *parent, *pte = NULL;
695         int level = agaw_to_level(domain->agaw);
696         int offset;
697
698         BUG_ON(!domain->pgd);
699         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
700         parent = domain->pgd;
701
702         while (level > 0) {
703                 void *tmp_page;
704
705                 offset = pfn_level_offset(pfn, level);
706                 pte = &parent[offset];
707                 if (level == 1)
708                         break;
709
710                 if (!dma_pte_present(pte)) {
711                         uint64_t pteval;
712
713                         tmp_page = alloc_pgtable_page(domain->nid);
714
715                         if (!tmp_page)
716                                 return NULL;
717
718                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
719                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
720                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
721                                 /* Someone else set it while we were thinking; use theirs. */
722                                 free_pgtable_page(tmp_page);
723                         } else {
724                                 dma_pte_addr(pte);
725                                 domain_flush_cache(domain, pte, sizeof(*pte));
726                         }
727                 }
728                 parent = phys_to_virt(dma_pte_addr(pte));
729                 level--;
730         }
731
732         return pte;
733 }
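/*
 * Walk example: with a 4-level table (agaw == 2) the pfn is split into four
 * 9-bit indexes, bits 27..35 / 18..26 / 9..17 / 0..8 from the top level down.
 * Missing intermediate tables are allocated on demand; the cmpxchg64() lets
 * two CPUs race to install the same level without holding a lock, with the
 * loser simply freeing its page.
 */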
734
735 /* return address's pte at specific level */
736 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
737                                          unsigned long pfn,
738                                          int level)
739 {
740         struct dma_pte *parent, *pte = NULL;
741         int total = agaw_to_level(domain->agaw);
742         int offset;
743
744         parent = domain->pgd;
745         while (level <= total) {
746                 offset = pfn_level_offset(pfn, total);
747                 pte = &parent[offset];
748                 if (level == total)
749                         return pte;
750
751                 if (!dma_pte_present(pte))
752                         break;
753                 parent = phys_to_virt(dma_pte_addr(pte));
754                 total--;
755         }
756         return NULL;
757 }
758
759 /* clear last level pte; a tlb flush should follow */
760 static void dma_pte_clear_range(struct dmar_domain *domain,
761                                 unsigned long start_pfn,
762                                 unsigned long last_pfn)
763 {
764         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
765         struct dma_pte *first_pte, *pte;
766
767         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
768         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
769         BUG_ON(start_pfn > last_pfn);
770
771         /* we don't need lock here; nobody else touches the iova range */
772         do {
773                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
774                 if (!pte) {
775                         start_pfn = align_to_level(start_pfn + 1, 2);
776                         continue;
777                 }
778                 do { 
779                         dma_clear_pte(pte);
780                         start_pfn++;
781                         pte++;
782                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
783
784                 domain_flush_cache(domain, first_pte,
785                                    (void *)pte - (void *)first_pte);
786
787         } while (start_pfn && start_pfn <= last_pfn);
788 }
789
790 /* free page table pages. last level pte should already be cleared */
791 static void dma_pte_free_pagetable(struct dmar_domain *domain,
792                                    unsigned long start_pfn,
793                                    unsigned long last_pfn)
794 {
795         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
796         struct dma_pte *first_pte, *pte;
797         int total = agaw_to_level(domain->agaw);
798         int level;
799         unsigned long tmp;
800
801         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
802         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
803         BUG_ON(start_pfn > last_pfn);
804
805         /* We don't need lock here; nobody else touches the iova range */
806         level = 2;
807         while (level <= total) {
808                 tmp = align_to_level(start_pfn, level);
809
810                 /* If we can't even clear one PTE at this level, we're done */
811                 if (tmp + level_size(level) - 1 > last_pfn)
812                         return;
813
814                 do {
815                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
816                         if (!pte) {
817                                 tmp = align_to_level(tmp + 1, level + 1);
818                                 continue;
819                         }
820                         do {
821                                 if (dma_pte_present(pte)) {
822                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
823                                         dma_clear_pte(pte);
824                                 }
825                                 pte++;
826                                 tmp += level_size(level);
827                         } while (!first_pte_in_page(pte) &&
828                                  tmp + level_size(level) - 1 <= last_pfn);
829
830                         domain_flush_cache(domain, first_pte,
831                                            (void *)pte - (void *)first_pte);
832                         
833                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
834                 level++;
835         }
836         /* free pgd */
837         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
838                 free_pgtable_page(domain->pgd);
839                 domain->pgd = NULL;
840         }
841 }
842
843 /* iommu handling */
844 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
845 {
846         struct root_entry *root;
847         unsigned long flags;
848
849         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
850         if (!root)
851                 return -ENOMEM;
852
853         __iommu_flush_cache(iommu, root, ROOT_SIZE);
854
855         spin_lock_irqsave(&iommu->lock, flags);
856         iommu->root_entry = root;
857         spin_unlock_irqrestore(&iommu->lock, flags);
858
859         return 0;
860 }
861
862 static void iommu_set_root_entry(struct intel_iommu *iommu)
863 {
864         void *addr;
865         u32 sts;
866         unsigned long flag;
867
868         addr = iommu->root_entry;
869
870         spin_lock_irqsave(&iommu->register_lock, flag);
871         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
872
873         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
874
875         /* Make sure hardware completes it */
876         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
877                       readl, (sts & DMA_GSTS_RTPS), sts);
878
879         spin_unlock_irqrestore(&iommu->register_lock, flag);
880 }
881
882 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
883 {
884         u32 val;
885         unsigned long flag;
886
887         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
888                 return;
889
890         spin_lock_irqsave(&iommu->register_lock, flag);
891         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
892
893         /* Make sure hardware completes it */
894         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
895                       readl, (!(val & DMA_GSTS_WBFS)), val);
896
897         spin_unlock_irqrestore(&iommu->register_lock, flag);
898 }
899
900 /* return value determines whether we need a write buffer flush */
901 static void __iommu_flush_context(struct intel_iommu *iommu,
902                                   u16 did, u16 source_id, u8 function_mask,
903                                   u64 type)
904 {
905         u64 val = 0;
906         unsigned long flag;
907
908         switch (type) {
909         case DMA_CCMD_GLOBAL_INVL:
910                 val = DMA_CCMD_GLOBAL_INVL;
911                 break;
912         case DMA_CCMD_DOMAIN_INVL:
913                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
914                 break;
915         case DMA_CCMD_DEVICE_INVL:
916                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
917                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
918                 break;
919         default:
920                 BUG();
921         }
922         val |= DMA_CCMD_ICC;
923
924         spin_lock_irqsave(&iommu->register_lock, flag);
925         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
926
927         /* Make sure hardware completes it */
928         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
929                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
930
931         spin_unlock_irqrestore(&iommu->register_lock, flag);
932 }
933
934 /* return value determines whether we need a write buffer flush */
935 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
936                                 u64 addr, unsigned int size_order, u64 type)
937 {
938         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
939         u64 val = 0, val_iva = 0;
940         unsigned long flag;
941
942         switch (type) {
943         case DMA_TLB_GLOBAL_FLUSH:
944                 /* global flush doesn't need to set IVA_REG */
945                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
946                 break;
947         case DMA_TLB_DSI_FLUSH:
948                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
949                 break;
950         case DMA_TLB_PSI_FLUSH:
951                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
952                 /* Note: always flush non-leaf currently */
953                 val_iva = size_order | addr;
954                 break;
955         default:
956                 BUG();
957         }
958         /* Note: set drain read/write */
959 #if 0
960         /*
961          * This is probably just to be extra safe.  It looks like we can
962          * ignore it without any impact.
963          */
964         if (cap_read_drain(iommu->cap))
965                 val |= DMA_TLB_READ_DRAIN;
966 #endif
967         if (cap_write_drain(iommu->cap))
968                 val |= DMA_TLB_WRITE_DRAIN;
969
970         spin_lock_irqsave(&iommu->register_lock, flag);
971         /* Note: Only uses first TLB reg currently */
972         if (val_iva)
973                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
974         dmar_writeq(iommu->reg + tlb_offset + 8, val);
975
976         /* Make sure hardware completes it */
977         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
978                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
979
980         spin_unlock_irqrestore(&iommu->register_lock, flag);
981
982         /* check IOTLB invalidation granularity */
983         if (DMA_TLB_IAIG(val) == 0)
984                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
985         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
986                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
987                         (unsigned long long)DMA_TLB_IIRG(type),
988                         (unsigned long long)DMA_TLB_IAIG(val));
989 }
990
991 static struct device_domain_info *iommu_support_dev_iotlb(
992         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
993 {
994         int found = 0;
995         unsigned long flags;
996         struct device_domain_info *info;
997         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
998
999         if (!ecap_dev_iotlb_support(iommu->ecap))
1000                 return NULL;
1001
1002         if (!iommu->qi)
1003                 return NULL;
1004
1005         spin_lock_irqsave(&device_domain_lock, flags);
1006         list_for_each_entry(info, &domain->devices, link)
1007                 if (info->bus == bus && info->devfn == devfn) {
1008                         found = 1;
1009                         break;
1010                 }
1011         spin_unlock_irqrestore(&device_domain_lock, flags);
1012
1013         if (!found || !info->dev)
1014                 return NULL;
1015
1016         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1017                 return NULL;
1018
1019         if (!dmar_find_matched_atsr_unit(info->dev))
1020                 return NULL;
1021
1022         info->iommu = iommu;
1023
1024         return info;
1025 }
1026
1027 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1028 {
1029         if (!info)
1030                 return;
1031
1032         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1033 }
1034
1035 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1036 {
1037         if (!info->dev || !pci_ats_enabled(info->dev))
1038                 return;
1039
1040         pci_disable_ats(info->dev);
1041 }
1042
1043 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1044                                   u64 addr, unsigned mask)
1045 {
1046         u16 sid, qdep;
1047         unsigned long flags;
1048         struct device_domain_info *info;
1049
1050         spin_lock_irqsave(&device_domain_lock, flags);
1051         list_for_each_entry(info, &domain->devices, link) {
1052                 if (!info->dev || !pci_ats_enabled(info->dev))
1053                         continue;
1054
1055                 sid = info->bus << 8 | info->devfn;
1056                 qdep = pci_ats_queue_depth(info->dev);
1057                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1058         }
1059         spin_unlock_irqrestore(&device_domain_lock, flags);
1060 }
1061
1062 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1063                                   unsigned long pfn, unsigned int pages, int map)
1064 {
1065         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1066         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1067
1068         BUG_ON(pages == 0);
1069
1070         /*
1071          * Fall back to domain-selective flush if there is no PSI support or the
1072          * size is too big.
1073          * PSI requires the page count to be a power of two, and the base address
1074          * to be naturally aligned to that size.
1075          */
1076         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1077                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1078                                                 DMA_TLB_DSI_FLUSH);
1079         else
1080                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1081                                                 DMA_TLB_PSI_FLUSH);
1082
1083         /*
1084          * In caching mode, changes of pages from non-present to present require
1085          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1086          */
1087         if (!cap_caching_mode(iommu->cap) || !map)
1088                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1089 }
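/*
 * Worked example: for pages == 5, __roundup_pow_of_two(5) == 8 and mask == 3,
 * so the PSI request covers an 8-page (32KiB) naturally aligned region; if
 * mask exceeded cap_max_amask_val() the code above would fall back to a
 * domain-selective flush instead.
 */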
1090
1091 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1092 {
1093         u32 pmen;
1094         unsigned long flags;
1095
1096         spin_lock_irqsave(&iommu->register_lock, flags);
1097         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1098         pmen &= ~DMA_PMEN_EPM;
1099         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1100
1101         /* wait for the protected region status bit to clear */
1102         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1103                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1104
1105         spin_unlock_irqrestore(&iommu->register_lock, flags);
1106 }
1107
1108 static int iommu_enable_translation(struct intel_iommu *iommu)
1109 {
1110         u32 sts;
1111         unsigned long flags;
1112
1113         spin_lock_irqsave(&iommu->register_lock, flags);
1114         iommu->gcmd |= DMA_GCMD_TE;
1115         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1116
1117         /* Make sure hardware completes it */
1118         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1119                       readl, (sts & DMA_GSTS_TES), sts);
1120
1121         spin_unlock_irqrestore(&iommu->register_lock, flags);
1122         return 0;
1123 }
1124
1125 static int iommu_disable_translation(struct intel_iommu *iommu)
1126 {
1127         u32 sts;
1128         unsigned long flag;
1129
1130         spin_lock_irqsave(&iommu->register_lock, flag);
1131         iommu->gcmd &= ~DMA_GCMD_TE;
1132         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1133
1134         /* Make sure hardware completes it */
1135         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1136                       readl, (!(sts & DMA_GSTS_TES)), sts);
1137
1138         spin_unlock_irqrestore(&iommu->register_lock, flag);
1139         return 0;
1140 }
1141
1142
1143 static int iommu_init_domains(struct intel_iommu *iommu)
1144 {
1145         unsigned long ndomains;
1146         unsigned long nlongs;
1147
1148         ndomains = cap_ndoms(iommu->cap);
1149         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1150                         ndomains);
1151         nlongs = BITS_TO_LONGS(ndomains);
1152
1153         spin_lock_init(&iommu->lock);
1154
1155         /* TBD: there might be 64K domains,
1156          * consider a different allocation scheme for future chips
1157          */
1158         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1159         if (!iommu->domain_ids) {
1160                 printk(KERN_ERR "Allocating domain id array failed\n");
1161                 return -ENOMEM;
1162         }
1163         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1164                         GFP_KERNEL);
1165         if (!iommu->domains) {
1166                 printk(KERN_ERR "Allocating domain array failed\n");
1167                 return -ENOMEM;
1168         }
1169
1170         /*
1171          * if Caching mode is set, then invalid translations are tagged
1172          * with domain id 0. Hence we need to pre-allocate it.
1173          */
1174         if (cap_caching_mode(iommu->cap))
1175                 set_bit(0, iommu->domain_ids);
1176         return 0;
1177 }
1178
1179
1180 static void domain_exit(struct dmar_domain *domain);
1181 static void vm_domain_exit(struct dmar_domain *domain);
1182
1183 void free_dmar_iommu(struct intel_iommu *iommu)
1184 {
1185         struct dmar_domain *domain;
1186         int i;
1187         unsigned long flags;
1188
1189         if ((iommu->domains) && (iommu->domain_ids)) {
1190                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1191                         domain = iommu->domains[i];
1192                         clear_bit(i, iommu->domain_ids);
1193
1194                         spin_lock_irqsave(&domain->iommu_lock, flags);
1195                         if (--domain->iommu_count == 0) {
1196                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1197                                         vm_domain_exit(domain);
1198                                 else
1199                                         domain_exit(domain);
1200                         }
1201                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1202                 }
1203         }
1204
1205         if (iommu->gcmd & DMA_GCMD_TE)
1206                 iommu_disable_translation(iommu);
1207
1208         if (iommu->irq) {
1209                 set_irq_data(iommu->irq, NULL);
1210                 /* This will mask the irq */
1211                 free_irq(iommu->irq, iommu);
1212                 destroy_irq(iommu->irq);
1213         }
1214
1215         kfree(iommu->domains);
1216         kfree(iommu->domain_ids);
1217
1218         g_iommus[iommu->seq_id] = NULL;
1219
1220         /* if all iommus are freed, free g_iommus */
1221         for (i = 0; i < g_num_of_iommus; i++) {
1222                 if (g_iommus[i])
1223                         break;
1224         }
1225
1226         if (i == g_num_of_iommus)
1227                 kfree(g_iommus);
1228
1229         /* free context mapping */
1230         free_context_table(iommu);
1231 }
1232
1233 static struct dmar_domain *alloc_domain(void)
1234 {
1235         struct dmar_domain *domain;
1236
1237         domain = alloc_domain_mem();
1238         if (!domain)
1239                 return NULL;
1240
1241         domain->nid = -1;
1242         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1243         domain->flags = 0;
1244
1245         return domain;
1246 }
1247
1248 static int iommu_attach_domain(struct dmar_domain *domain,
1249                                struct intel_iommu *iommu)
1250 {
1251         int num;
1252         unsigned long ndomains;
1253         unsigned long flags;
1254
1255         ndomains = cap_ndoms(iommu->cap);
1256
1257         spin_lock_irqsave(&iommu->lock, flags);
1258
1259         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1260         if (num >= ndomains) {
1261                 spin_unlock_irqrestore(&iommu->lock, flags);
1262                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1263                 return -ENOMEM;
1264         }
1265
1266         domain->id = num;
1267         set_bit(num, iommu->domain_ids);
1268         set_bit(iommu->seq_id, &domain->iommu_bmp);
1269         iommu->domains[num] = domain;
1270         spin_unlock_irqrestore(&iommu->lock, flags);
1271
1272         return 0;
1273 }
1274
1275 static void iommu_detach_domain(struct dmar_domain *domain,
1276                                 struct intel_iommu *iommu)
1277 {
1278         unsigned long flags;
1279         int num, ndomains;
1280         int found = 0;
1281
1282         spin_lock_irqsave(&iommu->lock, flags);
1283         ndomains = cap_ndoms(iommu->cap);
1284         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1285                 if (iommu->domains[num] == domain) {
1286                         found = 1;
1287                         break;
1288                 }
1289         }
1290
1291         if (found) {
1292                 clear_bit(num, iommu->domain_ids);
1293                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1294                 iommu->domains[num] = NULL;
1295         }
1296         spin_unlock_irqrestore(&iommu->lock, flags);
1297 }
1298
1299 static struct iova_domain reserved_iova_list;
1300 static struct lock_class_key reserved_rbtree_key;
1301
1302 static void dmar_init_reserved_ranges(void)
1303 {
1304         struct pci_dev *pdev = NULL;
1305         struct iova *iova;
1306         int i;
1307
1308         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1309
1310         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1311                 &reserved_rbtree_key);
1312
1313         /* IOAPIC ranges shouldn't be accessed by DMA */
1314         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1315                 IOVA_PFN(IOAPIC_RANGE_END));
1316         if (!iova)
1317                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1318
1319         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1320         for_each_pci_dev(pdev) {
1321                 struct resource *r;
1322
1323                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1324                         r = &pdev->resource[i];
1325                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1326                                 continue;
1327                         iova = reserve_iova(&reserved_iova_list,
1328                                             IOVA_PFN(r->start),
1329                                             IOVA_PFN(r->end));
1330                         if (!iova)
1331                                 printk(KERN_ERR "Reserve iova failed\n");
1332                 }
1333         }
1334
1335 }
1336
1337 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1338 {
1339         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1340 }
1341
1342 static inline int guestwidth_to_adjustwidth(int gaw)
1343 {
1344         int agaw;
1345         int r = (gaw - 12) % 9;
1346
1347         if (r == 0)
1348                 agaw = gaw;
1349         else
1350                 agaw = gaw + 9 - r;
1351         if (agaw > 64)
1352                 agaw = 64;
1353         return agaw;
1354 }
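/*
 * Illustrative values: gaw == 48 gives r == 0 and agaw == 48, while gaw == 40
 * gives r == 1 and agaw == 48 as well; the guest width is rounded up to the
 * next 9-bit level boundary (12 + 9*n), capped at 64.
 */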
1355
1356 static int domain_init(struct dmar_domain *domain, int guest_width)
1357 {
1358         struct intel_iommu *iommu;
1359         int adjust_width, agaw;
1360         unsigned long sagaw;
1361
1362         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1363         spin_lock_init(&domain->iommu_lock);
1364
1365         domain_reserve_special_ranges(domain);
1366
1367         /* calculate AGAW */
1368         iommu = domain_get_iommu(domain);
1369         if (guest_width > cap_mgaw(iommu->cap))
1370                 guest_width = cap_mgaw(iommu->cap);
1371         domain->gaw = guest_width;
1372         adjust_width = guestwidth_to_adjustwidth(guest_width);
1373         agaw = width_to_agaw(adjust_width);
1374         sagaw = cap_sagaw(iommu->cap);
1375         if (!test_bit(agaw, &sagaw)) {
1376                 /* hardware doesn't support it, choose a bigger one */
1377                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1378                 agaw = find_next_bit(&sagaw, 5, agaw);
1379                 if (agaw >= 5)
1380                         return -ENODEV;
1381         }
1382         domain->agaw = agaw;
1383         INIT_LIST_HEAD(&domain->devices);
1384
1385         if (ecap_coherent(iommu->ecap))
1386                 domain->iommu_coherency = 1;
1387         else
1388                 domain->iommu_coherency = 0;
1389
1390         if (ecap_sc_support(iommu->ecap))
1391                 domain->iommu_snooping = 1;
1392         else
1393                 domain->iommu_snooping = 0;
1394
1395         domain->iommu_count = 1;
1396         domain->nid = iommu->node;
1397
1398         /* always allocate the top pgd */
1399         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1400         if (!domain->pgd)
1401                 return -ENOMEM;
1402         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1403         return 0;
1404 }
1405
1406 static void domain_exit(struct dmar_domain *domain)
1407 {
1408         struct dmar_drhd_unit *drhd;
1409         struct intel_iommu *iommu;
1410
1411         /* Domain 0 is reserved, so don't process it */
1412         if (!domain)
1413                 return;
1414
1415         domain_remove_dev_info(domain);
1416         /* destroy iovas */
1417         put_iova_domain(&domain->iovad);
1418
1419         /* clear ptes */
1420         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1421
1422         /* free page tables */
1423         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1424
1425         for_each_active_iommu(iommu, drhd)
1426                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1427                         iommu_detach_domain(domain, iommu);
1428
1429         free_domain_mem(domain);
1430 }
1431
1432 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1433                                  u8 bus, u8 devfn, int translation)
1434 {
1435         struct context_entry *context;
1436         unsigned long flags;
1437         struct intel_iommu *iommu;
1438         struct dma_pte *pgd;
1439         unsigned long num;
1440         unsigned long ndomains;
1441         int id;
1442         int agaw;
1443         struct device_domain_info *info = NULL;
1444
1445         pr_debug("Set context mapping for %02x:%02x.%d\n",
1446                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1447
1448         BUG_ON(!domain->pgd);
1449         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1450                translation != CONTEXT_TT_MULTI_LEVEL);
1451
1452         iommu = device_to_iommu(segment, bus, devfn);
1453         if (!iommu)
1454                 return -ENODEV;
1455
1456         context = device_to_context_entry(iommu, bus, devfn);
1457         if (!context)
1458                 return -ENOMEM;
1459         spin_lock_irqsave(&iommu->lock, flags);
1460         if (context_present(context)) {
1461                 spin_unlock_irqrestore(&iommu->lock, flags);
1462                 return 0;
1463         }
1464
1465         id = domain->id;
1466         pgd = domain->pgd;
1467
1468         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1469             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1470                 int found = 0;
1471
1472                 /* find an available domain id for this device in iommu */
1473                 ndomains = cap_ndoms(iommu->cap);
1474                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1475                         if (iommu->domains[num] == domain) {
1476                                 id = num;
1477                                 found = 1;
1478                                 break;
1479                         }
1480                 }
1481
1482                 if (found == 0) {
1483                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1484                         if (num >= ndomains) {
1485                                 spin_unlock_irqrestore(&iommu->lock, flags);
1486                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1487                                 return -EFAULT;
1488                         }
1489
1490                         set_bit(num, iommu->domain_ids);
1491                         iommu->domains[num] = domain;
1492                         id = num;
1493                 }
1494
1495                 /* Skip top levels of page tables for
1496                  * an iommu which has a smaller agaw than the default.
1497                  * Unnecessary for PT mode.
1498                  */
1499                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1500                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1501                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1502                                 if (!dma_pte_present(pgd)) {
1503                                         spin_unlock_irqrestore(&iommu->lock, flags);
1504                                         return -ENOMEM;
1505                                 }
1506                         }
1507                 }
1508         }
1509
1510         context_set_domain_id(context, id);
1511
1512         if (translation != CONTEXT_TT_PASS_THROUGH) {
1513                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1514                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1515                                      CONTEXT_TT_MULTI_LEVEL;
1516         }
1517         /*
1518          * In pass through mode, AW must be programmed to indicate the largest
1519          * AGAW value supported by hardware, and ASR is ignored by hardware.
1520          */
1521         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1522                 context_set_address_width(context, iommu->msagaw);
1523         else {
1524                 context_set_address_root(context, virt_to_phys(pgd));
1525                 context_set_address_width(context, iommu->agaw);
1526         }
1527
1528         context_set_translation_type(context, translation);
1529         context_set_fault_enable(context);
1530         context_set_present(context);
1531         domain_flush_cache(domain, context, sizeof(*context));
1532
1533         /*
1534          * It's a non-present to present mapping. If hardware doesn't cache
1535          * non-present entries we only need to flush the write-buffer. If it
1536          * _does_ cache non-present entries, then it does so in the special
1537          * domain #0, which we have to flush:
1538          */
1539         if (cap_caching_mode(iommu->cap)) {
1540                 iommu->flush.flush_context(iommu, 0,
1541                                            (((u16)bus) << 8) | devfn,
1542                                            DMA_CCMD_MASK_NOBIT,
1543                                            DMA_CCMD_DEVICE_INVL);
1544                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1545         } else {
1546                 iommu_flush_write_buffer(iommu);
1547         }
1548         iommu_enable_dev_iotlb(info);
1549         spin_unlock_irqrestore(&iommu->lock, flags);
1550
1551         spin_lock_irqsave(&domain->iommu_lock, flags);
1552         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1553                 domain->iommu_count++;
1554                 if (domain->iommu_count == 1)
1555                         domain->nid = iommu->node;
1556                 domain_update_iommu_cap(domain);
1557         }
1558         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1559         return 0;
1560 }
1561
1562 static int
1563 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1564                         int translation)
1565 {
1566         int ret;
1567         struct pci_dev *tmp, *parent;
1568
1569         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1570                                          pdev->bus->number, pdev->devfn,
1571                                          translation);
1572         if (ret)
1573                 return ret;
1574
1575         /* dependent device mapping */
1576         tmp = pci_find_upstream_pcie_bridge(pdev);
1577         if (!tmp)
1578                 return 0;
1579         /* Secondary interface's bus number and devfn 0 */
1580         parent = pdev->bus->self;
1581         while (parent != tmp) {
1582                 ret = domain_context_mapping_one(domain,
1583                                                  pci_domain_nr(parent->bus),
1584                                                  parent->bus->number,
1585                                                  parent->devfn, translation);
1586                 if (ret)
1587                         return ret;
1588                 parent = parent->bus->self;
1589         }
1590         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1591                 return domain_context_mapping_one(domain,
1592                                         pci_domain_nr(tmp->subordinate),
1593                                         tmp->subordinate->number, 0,
1594                                         translation);
1595         else /* this is a legacy PCI bridge */
1596                 return domain_context_mapping_one(domain,
1597                                                   pci_domain_nr(tmp->bus),
1598                                                   tmp->bus->number,
1599                                                   tmp->devfn,
1600                                                   translation);
1601 }
1602
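     /*
      * Check whether @pdev and every bridge on its upstream path already
      * have context entries programmed in the IOMMU.
      */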
1603 static int domain_context_mapped(struct pci_dev *pdev)
1604 {
1605         int ret;
1606         struct pci_dev *tmp, *parent;
1607         struct intel_iommu *iommu;
1608
1609         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1610                                 pdev->devfn);
1611         if (!iommu)
1612                 return -ENODEV;
1613
1614         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1615         if (!ret)
1616                 return ret;
1617         /* dependent device mapping */
1618         tmp = pci_find_upstream_pcie_bridge(pdev);
1619         if (!tmp)
1620                 return ret;
1621         /* Secondary interface's bus number and devfn 0 */
1622         parent = pdev->bus->self;
1623         while (parent != tmp) {
1624                 ret = device_context_mapped(iommu, parent->bus->number,
1625                                             parent->devfn);
1626                 if (!ret)
1627                         return ret;
1628                 parent = parent->bus->self;
1629         }
1630         if (pci_is_pcie(tmp))
1631                 return device_context_mapped(iommu, tmp->subordinate->number,
1632                                              0);
1633         else
1634                 return device_context_mapped(iommu, tmp->bus->number,
1635                                              tmp->devfn);
1636 }
1637
1638 /* Returns a number of VTD pages, but aligned to MM page size */
1639 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1640                                             size_t size)
1641 {
1642         host_addr &= ~PAGE_MASK;
1643         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1644 }
1645
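     /*
      * Fill in the page-table entries for @nr_pages IOVA pages starting at
      * @iov_pfn, taking the backing pages either from the scatterlist @sg
      * or from the contiguous range starting at @phys_pfn.  The range must
      * not overlap existing mappings.
      */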
1646 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1647                             struct scatterlist *sg, unsigned long phys_pfn,
1648                             unsigned long nr_pages, int prot)
1649 {
1650         struct dma_pte *first_pte = NULL, *pte = NULL;
1651         phys_addr_t uninitialized_var(pteval);
1652         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1653         unsigned long sg_res;
1654
1655         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1656
1657         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1658                 return -EINVAL;
1659
1660         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1661
1662         if (sg)
1663                 sg_res = 0;
1664         else {
1665                 sg_res = nr_pages + 1;
1666                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1667         }
1668
1669         while (nr_pages--) {
1670                 uint64_t tmp;
1671
1672                 if (!sg_res) {
1673                         sg_res = aligned_nrpages(sg->offset, sg->length);
1674                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1675                         sg->dma_length = sg->length;
1676                         pteval = page_to_phys(sg_page(sg)) | prot;
1677                 }
1678                 if (!pte) {
1679                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1680                         if (!pte)
1681                                 return -ENOMEM;
1682                 }
1683                 /* We don't need a lock here; nobody else
1684                  * touches this IOVA range
1685                  */
1686                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1687                 if (tmp) {
1688                         static int dumps = 5;
1689                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1690                                iov_pfn, tmp, (unsigned long long)pteval);
1691                         if (dumps) {
1692                                 dumps--;
1693                                 debug_dma_dump_mappings(NULL);
1694                         }
1695                         WARN_ON(1);
1696                 }
1697                 pte++;
1698                 if (!nr_pages || first_pte_in_page(pte)) {
1699                         domain_flush_cache(domain, first_pte,
1700                                            (void *)pte - (void *)first_pte);
1701                         pte = NULL;
1702                 }
1703                 iov_pfn++;
1704                 pteval += VTD_PAGE_SIZE;
1705                 sg_res--;
1706                 if (!sg_res)
1707                         sg = sg_next(sg);
1708         }
1709         return 0;
1710 }
1711
1712 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1713                                     struct scatterlist *sg, unsigned long nr_pages,
1714                                     int prot)
1715 {
1716         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1717 }
1718
1719 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1720                                      unsigned long phys_pfn, unsigned long nr_pages,
1721                                      int prot)
1722 {
1723         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1724 }
1725
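     /*
      * Clear the context entry for @bus/@devfn on @iommu and do a global
      * context-cache and IOTLB flush.
      */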
1726 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1727 {
1728         if (!iommu)
1729                 return;
1730
1731         clear_context_table(iommu, bus, devfn);
1732         iommu->flush.flush_context(iommu, 0, 0, 0,
1733                                            DMA_CCMD_GLOBAL_INVL);
1734         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1735 }
1736
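     /*
      * Detach every device from @domain: disable their device-IOTLBs, clear
      * their context entries and free the per-device bookkeeping.
      */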
1737 static void domain_remove_dev_info(struct dmar_domain *domain)
1738 {
1739         struct device_domain_info *info;
1740         unsigned long flags;
1741         struct intel_iommu *iommu;
1742
1743         spin_lock_irqsave(&device_domain_lock, flags);
1744         while (!list_empty(&domain->devices)) {
1745                 info = list_entry(domain->devices.next,
1746                         struct device_domain_info, link);
1747                 list_del(&info->link);
1748                 list_del(&info->global);
1749                 if (info->dev)
1750                         info->dev->dev.archdata.iommu = NULL;
1751                 spin_unlock_irqrestore(&device_domain_lock, flags);
1752
1753                 iommu_disable_dev_iotlb(info);
1754                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1755                 iommu_detach_dev(iommu, info->bus, info->devfn);
1756                 free_devinfo_mem(info);
1757
1758                 spin_lock_irqsave(&device_domain_lock, flags);
1759         }
1760         spin_unlock_irqrestore(&device_domain_lock, flags);
1761 }
1762
1763 /*
1764  * find_domain
1765  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1766  */
1767 static struct dmar_domain *
1768 find_domain(struct pci_dev *pdev)
1769 {
1770         struct device_domain_info *info;
1771
1772         /* No lock here, assumes no domain exit in normal case */
1773         info = pdev->dev.archdata.iommu;
1774         if (info)
1775                 return info->domain;
1776         return NULL;
1777 }
1778
1779 /* domain is initialized */
1780 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1781 {
1782         struct dmar_domain *domain, *found = NULL;
1783         struct intel_iommu *iommu;
1784         struct dmar_drhd_unit *drhd;
1785         struct device_domain_info *info, *tmp;
1786         struct pci_dev *dev_tmp;
1787         unsigned long flags;
1788         int bus = 0, devfn = 0;
1789         int segment;
1790         int ret;
1791
1792         domain = find_domain(pdev);
1793         if (domain)
1794                 return domain;
1795
1796         segment = pci_domain_nr(pdev->bus);
1797
1798         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1799         if (dev_tmp) {
1800                 if (pci_is_pcie(dev_tmp)) {
1801                         bus = dev_tmp->subordinate->number;
1802                         devfn = 0;
1803                 } else {
1804                         bus = dev_tmp->bus->number;
1805                         devfn = dev_tmp->devfn;
1806                 }
1807                 spin_lock_irqsave(&device_domain_lock, flags);
1808                 list_for_each_entry(info, &device_domain_list, global) {
1809                         if (info->segment == segment &&
1810                             info->bus == bus && info->devfn == devfn) {
1811                                 found = info->domain;
1812                                 break;
1813                         }
1814                 }
1815                 spin_unlock_irqrestore(&device_domain_lock, flags);
1816                 /* pcie-pci bridge already has a domain, use it */
1817                 if (found) {
1818                         domain = found;
1819                         goto found_domain;
1820                 }
1821         }
1822
1823         domain = alloc_domain();
1824         if (!domain)
1825                 goto error;
1826
1827         /* Allocate new domain for the device */
1828         drhd = dmar_find_matched_drhd_unit(pdev);
1829         if (!drhd) {
1830                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1831                         pci_name(pdev));
1832                 return NULL;
1833         }
1834         iommu = drhd->iommu;
1835
1836         ret = iommu_attach_domain(domain, iommu);
1837         if (ret) {
1838                 domain_exit(domain);
1839                 goto error;
1840         }
1841
1842         if (domain_init(domain, gaw)) {
1843                 domain_exit(domain);
1844                 goto error;
1845         }
1846
1847         /* register pcie-to-pci device */
1848         if (dev_tmp) {
1849                 info = alloc_devinfo_mem();
1850                 if (!info) {
1851                         domain_exit(domain);
1852                         goto error;
1853                 }
1854                 info->segment = segment;
1855                 info->bus = bus;
1856                 info->devfn = devfn;
1857                 info->dev = NULL;
1858                 info->domain = domain;
1859                 /* This domain is shared by devices under the p2p bridge */
1860                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1861
1862                 /* pcie-to-pci bridge already has a domain, use it */
1863                 found = NULL;
1864                 spin_lock_irqsave(&device_domain_lock, flags);
1865                 list_for_each_entry(tmp, &device_domain_list, global) {
1866                         if (tmp->segment == segment &&
1867                             tmp->bus == bus && tmp->devfn == devfn) {
1868                                 found = tmp->domain;
1869                                 break;
1870                         }
1871                 }
1872                 if (found) {
1873                         spin_unlock_irqrestore(&device_domain_lock, flags);
1874                         free_devinfo_mem(info);
1875                         domain_exit(domain);
1876                         domain = found;
1877                 } else {
1878                         list_add(&info->link, &domain->devices);
1879                         list_add(&info->global, &device_domain_list);
1880                         spin_unlock_irqrestore(&device_domain_lock, flags);
1881                 }
1882         }
1883
1884 found_domain:
1885         info = alloc_devinfo_mem();
1886         if (!info)
1887                 goto error;
1888         info->segment = segment;
1889         info->bus = pdev->bus->number;
1890         info->devfn = pdev->devfn;
1891         info->dev = pdev;
1892         info->domain = domain;
1893         spin_lock_irqsave(&device_domain_lock, flags);
1894         /* somebody else may have raced us and set it up already */
1895         found = find_domain(pdev);
1896         if (found != NULL) {
1897                 spin_unlock_irqrestore(&device_domain_lock, flags);
1898                 if (found != domain) {
1899                         domain_exit(domain);
1900                         domain = found;
1901                 }
1902                 free_devinfo_mem(info);
1903                 return domain;
1904         }
1905         list_add(&info->link, &domain->devices);
1906         list_add(&info->global, &device_domain_list);
1907         pdev->dev.archdata.iommu = info;
1908         spin_unlock_irqrestore(&device_domain_lock, flags);
1909         return domain;
1910 error:
1911         /* recheck it here; somebody else may have set it meanwhile */
1912         return find_domain(pdev);
1913 }
1914
1915 static int iommu_identity_mapping;
1916 #define IDENTMAP_ALL            1
1917 #define IDENTMAP_GFX            2
1918 #define IDENTMAP_AZALIA         4
1919
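     /*
      * Reserve the IOVA range covering [@start, @end] in @domain and install
      * a 1:1 (identity) mapping for it.
      */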
1920 static int iommu_domain_identity_map(struct dmar_domain *domain,
1921                                      unsigned long long start,
1922                                      unsigned long long end)
1923 {
1924         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1925         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1926
1927         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1928                           dma_to_mm_pfn(last_vpfn))) {
1929                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1930                 return -ENOMEM;
1931         }
1932
1933         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1934                  start, end, domain->id);
1935         /*
1936          * RMRR range might have overlap with physical memory range,
1937          * clear it first
1938          */
1939         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1940
1941         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1942                                   last_vpfn - first_vpfn + 1,
1943                                   DMA_PTE_READ|DMA_PTE_WRITE);
1944 }
1945
1946 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1947                                       unsigned long long start,
1948                                       unsigned long long end)
1949 {
1950         struct dmar_domain *domain;
1951         int ret;
1952
1953         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1954         if (!domain)
1955                 return -ENOMEM;
1956
1957         /* For _hardware_ passthrough, don't bother. But for software
1958            passthrough, we do it anyway -- it may indicate a memory
1959            range which is reserved in E820 and so didn't get set
1960            up in si_domain to start with */
1961         if (domain == si_domain && hw_pass_through) {
1962                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1963                        pci_name(pdev), start, end);
1964                 return 0;
1965         }
1966
1967         printk(KERN_INFO
1968                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1969                pci_name(pdev), start, end);
1970
1971         if (end < start) {
1972                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1973                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1974                         dmi_get_system_info(DMI_BIOS_VENDOR),
1975                         dmi_get_system_info(DMI_BIOS_VERSION),
1976                         dmi_get_system_info(DMI_PRODUCT_VERSION));
1977                 ret = -EIO;
1978                 goto error;
1979         }
1980
1981         if (end >> agaw_to_width(domain->agaw)) {
1982                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1983                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1984                      agaw_to_width(domain->agaw),
1985                      dmi_get_system_info(DMI_BIOS_VENDOR),
1986                      dmi_get_system_info(DMI_BIOS_VERSION),
1987                      dmi_get_system_info(DMI_PRODUCT_VERSION));
1988                 ret = -EIO;
1989                 goto error;
1990         }
1991
1992         ret = iommu_domain_identity_map(domain, start, end);
1993         if (ret)
1994                 goto error;
1995
1996         /* context entry init */
1997         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
1998         if (ret)
1999                 goto error;
2000
2001         return 0;
2002
2003  error:
2004         domain_exit(domain);
2005         return ret;
2006 }
2007
2008 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2009         struct pci_dev *pdev)
2010 {
2011         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2012                 return 0;
2013         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2014                 rmrr->end_address + 1);
2015 }
2016
2017 #ifdef CONFIG_DMAR_FLOPPY_WA
2018 static inline void iommu_prepare_isa(void)
2019 {
2020         struct pci_dev *pdev;
2021         int ret;
2022
2023         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2024         if (!pdev)
2025                 return;
2026
2027         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2028         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2029
2030         if (ret)
2031                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2032                        "floppy might not work\n");
2033
2034 }
2035 #else
2036 static inline void iommu_prepare_isa(void)
2037 {
2038         return;
2039 }
2040 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2041
2042 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2043
2044 static int __init si_domain_work_fn(unsigned long start_pfn,
2045                                     unsigned long end_pfn, void *datax)
2046 {
2047         int *ret = datax;
2048
2049         *ret = iommu_domain_identity_map(si_domain,
2050                                          (uint64_t)start_pfn << PAGE_SHIFT,
2051                                          (uint64_t)end_pfn << PAGE_SHIFT);
2052         return *ret;
2053
2054 }
2055
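     /*
      * Create the static identity domain: attach it to every active IOMMU
      * and, unless hardware pass-through is used, identity-map all usable
      * physical memory into it.
      */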
2056 static int __init si_domain_init(int hw)
2057 {
2058         struct dmar_drhd_unit *drhd;
2059         struct intel_iommu *iommu;
2060         int nid, ret = 0;
2061
2062         si_domain = alloc_domain();
2063         if (!si_domain)
2064                 return -EFAULT;
2065
2066         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2067
2068         for_each_active_iommu(iommu, drhd) {
2069                 ret = iommu_attach_domain(si_domain, iommu);
2070                 if (ret) {
2071                         domain_exit(si_domain);
2072                         return -EFAULT;
2073                 }
2074         }
2075
2076         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2077                 domain_exit(si_domain);
2078                 return -EFAULT;
2079         }
2080
2081         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2082
2083         if (hw)
2084                 return 0;
2085
2086         for_each_online_node(nid) {
2087                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2088                 if (ret)
2089                         return ret;
2090         }
2091
2092         return 0;
2093 }
2094
2095 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2096                                           struct pci_dev *pdev);
2097 static int identity_mapping(struct pci_dev *pdev)
2098 {
2099         struct device_domain_info *info;
2100
2101         if (likely(!iommu_identity_mapping))
2102                 return 0;
2103
2105         list_for_each_entry(info, &si_domain->devices, link)
2106                 if (info->dev == pdev)
2107                         return 1;
2108         return 0;
2109 }
2110
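     /*
      * Attach @pdev to @domain: program its context entries with the given
      * translation type and record the device in the domain's device list.
      */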
2111 static int domain_add_dev_info(struct dmar_domain *domain,
2112                                struct pci_dev *pdev,
2113                                int translation)
2114 {
2115         struct device_domain_info *info;
2116         unsigned long flags;
2117         int ret;
2118
2119         info = alloc_devinfo_mem();
2120         if (!info)
2121                 return -ENOMEM;
2122
2123         ret = domain_context_mapping(domain, pdev, translation);
2124         if (ret) {
2125                 free_devinfo_mem(info);
2126                 return ret;
2127         }
2128
2129         info->segment = pci_domain_nr(pdev->bus);
2130         info->bus = pdev->bus->number;
2131         info->devfn = pdev->devfn;
2132         info->dev = pdev;
2133         info->domain = domain;
2134
2135         spin_lock_irqsave(&device_domain_lock, flags);
2136         list_add(&info->link, &domain->devices);
2137         list_add(&info->global, &device_domain_list);
2138         pdev->dev.archdata.iommu = info;
2139         spin_unlock_irqrestore(&device_domain_lock, flags);
2140
2141         return 0;
2142 }
2143
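     /*
      * Decide whether @pdev should be placed in the static 1:1 domain,
      * based on the identity-mapping policy flags and on whether the device
      * could later be taken back out of that domain safely.
      */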
2144 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2145 {
2146         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2147                 return 1;
2148
2149         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2150                 return 1;
2151
2152         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2153                 return 0;
2154
2155         /*
2156          * We want to start off with all devices in the 1:1 domain, and
2157          * take them out later if we find they can't access all of memory.
2158          *
2159          * However, we can't do this for PCI devices behind bridges,
2160          * because all PCI devices behind the same bridge will end up
2161          * with the same source-id on their transactions.
2162          *
2163          * Practically speaking, we can't change things around for these
2164          * devices at run-time, because we can't be sure there'll be no
2165          * DMA transactions in flight for any of their siblings.
2166          * 
2167          * So PCI devices (unless they're on the root bus) as well as
2168          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2169          * the 1:1 domain, just in _case_ one of their siblings turns out
2170          * not to be able to map all of memory.
2171          */
2172         if (!pci_is_pcie(pdev)) {
2173                 if (!pci_is_root_bus(pdev->bus))
2174                         return 0;
2175                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2176                         return 0;
2177         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2178                 return 0;
2179
2180         /* 
2181          * At boot time, we don't yet know if devices will be 64-bit capable.
2182          * Assume that they will -- if they turn out not to be, then we can 
2183          * take them out of the 1:1 domain later.
2184          */
2185         if (!startup)
2186                 return pdev->dma_mask > DMA_BIT_MASK(32);
2187
2188         return 1;
2189 }
2190
2191 static int __init iommu_prepare_static_identity_mapping(int hw)
2192 {
2193         struct pci_dev *pdev = NULL;
2194         int ret;
2195
2196         ret = si_domain_init(hw);
2197         if (ret)
2198                 return -EFAULT;
2199
2200         for_each_pci_dev(pdev) {
2201                 if (iommu_should_identity_map(pdev, 1)) {
2202                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2203                                hw ? "hardware" : "software", pci_name(pdev));
2204
2205                         ret = domain_add_dev_info(si_domain, pdev,
2206                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2207                                                      CONTEXT_TT_MULTI_LEVEL);
2208                         if (ret)
2209                                 return ret;
2210                 }
2211         }
2212
2213         return 0;
2214 }
2215
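     /*
      * Initialize every DMAR unit: allocate per-IOMMU state, choose the
      * invalidation mechanism, set up identity/RMRR/ISA mappings as
      * required, then enable fault reporting and DMA translation.
      */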
2216 int __init init_dmars(void)
2217 {
2218         struct dmar_drhd_unit *drhd;
2219         struct dmar_rmrr_unit *rmrr;
2220         struct pci_dev *pdev;
2221         struct intel_iommu *iommu;
2222         int i, ret;
2223
2224         /*
2225          * for each drhd
2226          *    allocate root
2227          *    initialize and program root entry to not present
2228          * endfor
2229          */
2230         for_each_drhd_unit(drhd) {
2231                 g_num_of_iommus++;
2232                 /*
2233                  * lock not needed as this is only incremented in the single-
2234                  * threaded kernel __init code path; all other accesses are
2235                  * read-only
2236                  */
2237         }
2238
2239         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2240                         GFP_KERNEL);
2241         if (!g_iommus) {
2242                 printk(KERN_ERR "Allocating global iommu array failed\n");
2243                 ret = -ENOMEM;
2244                 goto error;
2245         }
2246
2247         deferred_flush = kzalloc(g_num_of_iommus *
2248                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2249         if (!deferred_flush) {
2250                 ret = -ENOMEM;
2251                 goto error;
2252         }
2253
2254         for_each_drhd_unit(drhd) {
2255                 if (drhd->ignored)
2256                         continue;
2257
2258                 iommu = drhd->iommu;
2259                 g_iommus[iommu->seq_id] = iommu;
2260
2261                 ret = iommu_init_domains(iommu);
2262                 if (ret)
2263                         goto error;
2264
2265                 /*
2266                  * TBD:
2267                  * we could share the same root & context tables
2268                  * among all IOMMUs. Need to split it later.
2269                  */
2270                 ret = iommu_alloc_root_entry(iommu);
2271                 if (ret) {
2272                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2273                         goto error;
2274                 }
2275                 if (!ecap_pass_through(iommu->ecap))
2276                         hw_pass_through = 0;
2277         }
2278
2279         /*
2280          * Start from a sane IOMMU hardware state.
2281          */
2282         for_each_drhd_unit(drhd) {
2283                 if (drhd->ignored)
2284                         continue;
2285
2286                 iommu = drhd->iommu;
2287
2288                 /*
2289                  * If the queued invalidation is already initialized by us
2290                  * (for example, while enabling interrupt-remapping) then
2291                  * we already have things rolling from a sane state.
2292                  */
2293                 if (iommu->qi)
2294                         continue;
2295
2296                 /*
2297                  * Clear any previous faults.
2298                  */
2299                 dmar_fault(-1, iommu);
2300                 /*
2301                  * Disable queued invalidation if supported and already enabled
2302                  * before OS handover.
2303                  */
2304                 dmar_disable_qi(iommu);
2305         }
2306
2307         for_each_drhd_unit(drhd) {
2308                 if (drhd->ignored)
2309                         continue;
2310
2311                 iommu = drhd->iommu;
2312
2313                 if (dmar_enable_qi(iommu)) {
2314                         /*
2315                          * Queued Invalidate not enabled, use Register Based
2316                          * Invalidate
2317                          */
2318                         iommu->flush.flush_context = __iommu_flush_context;
2319                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2320                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2321                                "invalidation\n",
2322                                 iommu->seq_id,
2323                                (unsigned long long)drhd->reg_base_addr);
2324                 } else {
2325                         iommu->flush.flush_context = qi_flush_context;
2326                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2327                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2328                                "invalidation\n",
2329                                 iommu->seq_id,
2330                                (unsigned long long)drhd->reg_base_addr);
2331                 }
2332         }
2333
2334         if (iommu_pass_through)
2335                 iommu_identity_mapping |= IDENTMAP_ALL;
2336
2337 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2338         iommu_identity_mapping |= IDENTMAP_GFX;
2339 #endif
2340
2341         check_tylersburg_isoch();
2342
2343         /*
2344          * If any identity mapping policy is in effect, set up the static
2345          * identity domain and map the selected devices into it now; the
2346          * rmrr and isa identity mappings are handled further below.
2347          */
2348         if (iommu_identity_mapping) {
2349                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2350                 if (ret) {
2351                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2352                         goto error;
2353                 }
2354         }
2355         /*
2356          * For each rmrr
2357          *   for each dev attached to rmrr
2358          *   do
2359          *     locate drhd for dev, alloc domain for dev
2360          *     allocate free domain
2361          *     allocate page table entries for rmrr
2362          *     if context not allocated for bus
2363          *           allocate and init context
2364          *           set present in root table for this bus
2365          *     init context with domain, translation etc
2366          *    endfor
2367          * endfor
2368          */
2369         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2370         for_each_rmrr_units(rmrr) {
2371                 for (i = 0; i < rmrr->devices_cnt; i++) {
2372                         pdev = rmrr->devices[i];
2373                         /*
2374                          * some BIOSes list non-existent devices in the
2375                          * DMAR table.
2376                          */
2377                         if (!pdev)
2378                                 continue;
2379                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2380                         if (ret)
2381                                 printk(KERN_ERR
2382                                        "IOMMU: mapping reserved region failed\n");
2383                 }
2384         }
2385
2386         iommu_prepare_isa();
2387
2388         /*
2389          * for each drhd
2390          *   enable fault log
2391          *   global invalidate context cache
2392          *   global invalidate iotlb
2393          *   enable translation
2394          */
2395         for_each_drhd_unit(drhd) {
2396                 if (drhd->ignored)
2397                         continue;
2398                 iommu = drhd->iommu;
2399
2400                 iommu_flush_write_buffer(iommu);
2401
2402                 ret = dmar_set_interrupt(iommu);
2403                 if (ret)
2404                         goto error;
2405
2406                 iommu_set_root_entry(iommu);
2407
2408                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2409                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2410
2411                 ret = iommu_enable_translation(iommu);
2412                 if (ret)
2413                         goto error;
2414
2415                 iommu_disable_protect_mem_regions(iommu);
2416         }
2417
2418         return 0;
2419 error:
2420         for_each_drhd_unit(drhd) {
2421                 if (drhd->ignored)
2422                         continue;
2423                 iommu = drhd->iommu;
2424                 free_iommu(iommu);
2425         }
2426         kfree(g_iommus);
2427         return ret;
2428 }
2429
2430 /* This takes a number of _MM_ pages, not VTD pages */
2431 static struct iova *intel_alloc_iova(struct device *dev,
2432                                      struct dmar_domain *domain,
2433                                      unsigned long nrpages, uint64_t dma_mask)
2434 {
2435         struct pci_dev *pdev = to_pci_dev(dev);
2436         struct iova *iova = NULL;
2437
2438         /* Restrict dma_mask to the width that the iommu can handle */
2439         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2440
2441         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2442                 /*
2443                  * First try to allocate an io virtual address in
2444                  * DMA_BIT_MASK(32) and if that fails then try allocating
2445                  * from higher range
2446                  */
2447                 iova = alloc_iova(&domain->iovad, nrpages,
2448                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2449                 if (iova)
2450                         return iova;
2451         }
2452         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2453         if (unlikely(!iova)) {
2454                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2455                        nrpages, pci_name(pdev));
2456                 return NULL;
2457         }
2458
2459         return iova;
2460 }
2461
2462 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2463 {
2464         struct dmar_domain *domain;
2465         int ret;
2466
2467         domain = get_domain_for_dev(pdev,
2468                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2469         if (!domain) {
2470                 printk(KERN_ERR
2471                         "Allocating domain for %s failed\n", pci_name(pdev));
2472                 return NULL;
2473         }
2474
2475         /* make sure context mapping is ok */
2476         if (unlikely(!domain_context_mapped(pdev))) {
2477                 ret = domain_context_mapping(domain, pdev,
2478                                              CONTEXT_TT_MULTI_LEVEL);
2479                 if (ret) {
2480                         printk(KERN_ERR
2481                                 "Domain context map for %s failed\n",
2482                                 pci_name(pdev));
2483                         return NULL;
2484                 }
2485         }
2486
2487         return domain;
2488 }
2489
2490 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2491 {
2492         struct device_domain_info *info;
2493
2494         /* No lock here, assumes no domain exit in normal case */
2495         info = dev->dev.archdata.iommu;
2496         if (likely(info))
2497                 return info->domain;
2498
2499         return __get_valid_domain_for_dev(dev);
2500 }
2501
2502 static int iommu_dummy(struct pci_dev *pdev)
2503 {
2504         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2505 }
2506
2507 /* Check if the pdev needs to go through the non-identity map/unmap process. */
2508 static int iommu_no_mapping(struct device *dev)
2509 {
2510         struct pci_dev *pdev;
2511         int found;
2512
2513         if (unlikely(dev->bus != &pci_bus_type))
2514                 return 1;
2515
2516         pdev = to_pci_dev(dev);
2517         if (iommu_dummy(pdev))
2518                 return 1;
2519
2520         if (!iommu_identity_mapping)
2521                 return 0;
2522
2523         found = identity_mapping(pdev);
2524         if (found) {
2525                 if (iommu_should_identity_map(pdev, 0))
2526                         return 1;
2527                 else {
2528                         /*
2529                          * Remove a 32-bit DMA device from si_domain and fall
2530                          * back to non-identity mapping.
2531                          */
2532                         domain_remove_one_dev_info(si_domain, pdev);
2533                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2534                                pci_name(pdev));
2535                         return 0;
2536                 }
2537         } else {
2538                 /*
2539                  * A 64-bit DMA device detached from a VM is put back into
2540                  * si_domain for identity mapping.
2541                  */
2542                 if (iommu_should_identity_map(pdev, 0)) {
2543                         int ret;
2544                         ret = domain_add_dev_info(si_domain, pdev,
2545                                                   hw_pass_through ?
2546                                                   CONTEXT_TT_PASS_THROUGH :
2547                                                   CONTEXT_TT_MULTI_LEVEL);
2548                         if (!ret) {
2549                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2550                                        pci_name(pdev));
2551                                 return 1;
2552                         }
2553                 }
2554         }
2555
2556         return 0;
2557 }
2558
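     /*
      * Map [@paddr, @paddr + @size) for @hwdev: allocate a suitable IOVA
      * range, install the page-table entries, flush as needed and return
      * the resulting bus address, or 0 on failure.
      */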
2559 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2560                                      size_t size, int dir, u64 dma_mask)
2561 {
2562         struct pci_dev *pdev = to_pci_dev(hwdev);
2563         struct dmar_domain *domain;
2564         phys_addr_t start_paddr;
2565         struct iova *iova;
2566         int prot = 0;
2567         int ret;
2568         struct intel_iommu *iommu;
2569         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2570
2571         BUG_ON(dir == DMA_NONE);
2572
2573         if (iommu_no_mapping(hwdev))
2574                 return paddr;
2575
2576         domain = get_valid_domain_for_dev(pdev);
2577         if (!domain)
2578                 return 0;
2579
2580         iommu = domain_get_iommu(domain);
2581         size = aligned_nrpages(paddr, size);
2582
2583         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2584                                 pdev->dma_mask);
2585         if (!iova)
2586                 goto error;
2587
2588         /*
2589          * Check if DMAR supports zero-length reads on write-only
2590          * mappings.
2591          */
2592         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2593                         !cap_zlr(iommu->cap))
2594                 prot |= DMA_PTE_READ;
2595         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2596                 prot |= DMA_PTE_WRITE;
2597         /*
2598          * [paddr, paddr + size) might cover only part of a page, but we
2599          * map the whole page.  Note: if two parts of one page are mapped
2600          * separately, we might end up with two guest addresses mapping to
2601          * the same host paddr, but this is not a big problem
2602          */
2603         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2604                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2605         if (ret)
2606                 goto error;
2607
2608         /* it's a non-present to present mapping. Only flush if caching mode */
2609         if (cap_caching_mode(iommu->cap))
2610                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2611         else
2612                 iommu_flush_write_buffer(iommu);
2613
2614         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2615         start_paddr += paddr & ~PAGE_MASK;
2616         return start_paddr;
2617
2618 error:
2619         if (iova)
2620                 __free_iova(&domain->iovad, iova);
2621         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2622                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2623         return 0;
2624 }
2625
2626 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2627                                  unsigned long offset, size_t size,
2628                                  enum dma_data_direction dir,
2629                                  struct dma_attrs *attrs)
2630 {
2631         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2632                                   dir, to_pci_dev(dev)->dma_mask);
2633 }
2634
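     /*
      * Run the deferred-unmap queues: flush the relevant IOTLB entries for
      * every queued entry and free the corresponding IOVAs.  Callers hold
      * async_umap_flush_lock.
      */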
2635 static void flush_unmaps(void)
2636 {
2637         int i, j;
2638
2639         timer_on = 0;
2640
2641         /* just flush them all */
2642         for (i = 0; i < g_num_of_iommus; i++) {
2643                 struct intel_iommu *iommu = g_iommus[i];
2644                 if (!iommu)
2645                         continue;
2646
2647                 if (!deferred_flush[i].next)
2648                         continue;
2649
2650                 /* In caching mode, global flushes make emulation expensive */
2651                 if (!cap_caching_mode(iommu->cap))
2652                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2653                                          DMA_TLB_GLOBAL_FLUSH);
2654                 for (j = 0; j < deferred_flush[i].next; j++) {
2655                         unsigned long mask;
2656                         struct iova *iova = deferred_flush[i].iova[j];
2657                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2658
2659                         /* On real hardware multiple invalidations are expensive */
2660                         if (cap_caching_mode(iommu->cap))
2661                                 iommu_flush_iotlb_psi(iommu, domain->id,
2662                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2663                         else {
2664                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2665                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2666                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2667                         }
2668                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2669                 }
2670                 deferred_flush[i].next = 0;
2671         }
2672
2673         list_size = 0;
2674 }
2675
2676 static void flush_unmaps_timeout(unsigned long data)
2677 {
2678         unsigned long flags;
2679
2680         spin_lock_irqsave(&async_umap_flush_lock, flags);
2681         flush_unmaps();
2682         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2683 }
2684
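     /*
      * Queue @iova on the owning IOMMU's deferred-flush list; the IOTLB
      * flush and IOVA free are batched in flush_unmaps(), triggered by a
      * 10ms timer or when the list reaches HIGH_WATER_MARK.
      */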
2685 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2686 {
2687         unsigned long flags;
2688         int next, iommu_id;
2689         struct intel_iommu *iommu;
2690
2691         spin_lock_irqsave(&async_umap_flush_lock, flags);
2692         if (list_size == HIGH_WATER_MARK)
2693                 flush_unmaps();
2694
2695         iommu = domain_get_iommu(dom);
2696         iommu_id = iommu->seq_id;
2697
2698         next = deferred_flush[iommu_id].next;
2699         deferred_flush[iommu_id].domain[next] = dom;
2700         deferred_flush[iommu_id].iova[next] = iova;
2701         deferred_flush[iommu_id].next++;
2702
2703         if (!timer_on) {
2704                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2705                 timer_on = 1;
2706         }
2707         list_size++;
2708         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2709 }
2710
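     /*
      * dma_map_ops ->unmap_page callback: tear down the mapping at
      * @dev_addr and either flush the IOTLB immediately (strict mode) or
      * defer the flush via add_unmap().
      */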
2711 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2712                              size_t size, enum dma_data_direction dir,
2713                              struct dma_attrs *attrs)
2714 {
2715         struct pci_dev *pdev = to_pci_dev(dev);
2716         struct dmar_domain *domain;
2717         unsigned long start_pfn, last_pfn;
2718         struct iova *iova;
2719         struct intel_iommu *iommu;
2720
2721         if (iommu_no_mapping(dev))
2722                 return;
2723
2724         domain = find_domain(pdev);
2725         BUG_ON(!domain);
2726
2727         iommu = domain_get_iommu(domain);
2728
2729         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2730         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2731                       (unsigned long long)dev_addr))
2732                 return;
2733
2734         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2735         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2736
2737         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2738                  pci_name(pdev), start_pfn, last_pfn);
2739
2740         /* clear the PTEs covering the whole range */
2741         dma_pte_clear_range(domain, start_pfn, last_pfn);
2742
2743         /* free page tables */
2744         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2745
2746         if (intel_iommu_strict) {
2747                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2748                                       last_pfn - start_pfn + 1, 0);
2749                 /* free iova */
2750                 __free_iova(&domain->iovad, iova);
2751         } else {
2752                 add_unmap(domain, iova);
2753                 /*
2754                  * queue up the release of the unmap to save the roughly 1/6th
2755                  * of the cpu time used up by the iotlb flush operation...
2756                  */
2757         }
2758 }
2759
2760 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2761                                   dma_addr_t *dma_handle, gfp_t flags)
2762 {
2763         void *vaddr;
2764         int order;
2765
2766         size = PAGE_ALIGN(size);
2767         order = get_order(size);
2768
2769         if (!iommu_no_mapping(hwdev))
2770                 flags &= ~(GFP_DMA | GFP_DMA32);
2771         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2772                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2773                         flags |= GFP_DMA;
2774                 else
2775                         flags |= GFP_DMA32;
2776         }
2777
2778         vaddr = (void *)__get_free_pages(flags, order);
2779         if (!vaddr)
2780                 return NULL;
2781         memset(vaddr, 0, size);
2782
2783         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2784                                          DMA_BIDIRECTIONAL,
2785                                          hwdev->coherent_dma_mask);
2786         if (*dma_handle)
2787                 return vaddr;
2788         free_pages((unsigned long)vaddr, order);
2789         return NULL;
2790 }
2791
2792 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2793                                 dma_addr_t dma_handle)
2794 {
2795         int order;
2796
2797         size = PAGE_ALIGN(size);
2798         order = get_order(size);
2799
2800         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2801         free_pages((unsigned long)vaddr, order);
2802 }
2803
2804 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2805                            int nelems, enum dma_data_direction dir,
2806                            struct dma_attrs *attrs)
2807 {
2808         struct pci_dev *pdev = to_pci_dev(hwdev);
2809         struct dmar_domain *domain;
2810         unsigned long start_pfn, last_pfn;
2811         struct iova *iova;
2812         struct intel_iommu *iommu;
2813
2814         if (iommu_no_mapping(hwdev))
2815                 return;
2816
2817         domain = find_domain(pdev);
2818         BUG_ON(!domain);
2819
2820         iommu = domain_get_iommu(domain);
2821
2822         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2823         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2824                       (unsigned long long)sglist[0].dma_address))
2825                 return;
2826
2827         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2828         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2829
2830         /* clear the PTEs covering the whole range */
2831         dma_pte_clear_range(domain, start_pfn, last_pfn);
2832
2833         /* free page tables */
2834         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2835
2836         if (intel_iommu_strict) {
2837                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2838                                       last_pfn - start_pfn + 1, 0);
2839                 /* free iova */
2840                 __free_iova(&domain->iovad, iova);
2841         } else {
2842                 add_unmap(domain, iova);
2843                 /*
2844                  * queue up the release of the unmap to save the roughly 1/6th
2845                  * of the cpu time used up by the iotlb flush operation...
2846                  */
2847         }
2848 }
2849
2850 static int intel_nontranslate_map_sg(struct device *hddev,
2851         struct scatterlist *sglist, int nelems, int dir)
2852 {
2853         int i;
2854         struct scatterlist *sg;
2855
2856         for_each_sg(sglist, sg, nelems, i) {
2857                 BUG_ON(!sg_page(sg));
2858                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2859                 sg->dma_length = sg->length;
2860         }
2861         return nelems;
2862 }
2863
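     /*
      * dma_map_ops ->map_sg callback: allocate one IOVA range large enough
      * for the whole scatterlist and map every segment into it; returns the
      * number of mapped elements, or 0 on failure.
      */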
2864 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2865                         enum dma_data_direction dir, struct dma_attrs *attrs)
2866 {
2867         int i;
2868         struct pci_dev *pdev = to_pci_dev(hwdev);
2869         struct dmar_domain *domain;
2870         size_t size = 0;
2871         int prot = 0;
2872         struct iova *iova = NULL;
2873         int ret;
2874         struct scatterlist *sg;
2875         unsigned long start_vpfn;
2876         struct intel_iommu *iommu;
2877
2878         BUG_ON(dir == DMA_NONE);
2879         if (iommu_no_mapping(hwdev))
2880                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2881
2882         domain = get_valid_domain_for_dev(pdev);
2883         if (!domain)
2884                 return 0;
2885
2886         iommu = domain_get_iommu(domain);
2887
2888         for_each_sg(sglist, sg, nelems, i)
2889                 size += aligned_nrpages(sg->offset, sg->length);
2890
2891         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2892                                 pdev->dma_mask);
2893         if (!iova) {
2894                 sglist->dma_length = 0;
2895                 return 0;
2896         }
2897
2898         /*
2899          * Check if DMAR supports zero-length reads on write-only
2900          * mappings.
2901          */
2902         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2903                         !cap_zlr(iommu->cap))
2904                 prot |= DMA_PTE_READ;
2905         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2906                 prot |= DMA_PTE_WRITE;
2907
2908         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2909
2910         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2911         if (unlikely(ret)) {
2912                 /* clear the PTEs covering the range */
2913                 dma_pte_clear_range(domain, start_vpfn,
2914                                     start_vpfn + size - 1);
2915                 /* free page tables */
2916                 dma_pte_free_pagetable(domain, start_vpfn,
2917                                        start_vpfn + size - 1);
2918                 /* free iova */
2919                 __free_iova(&domain->iovad, iova);
2920                 return 0;
2921         }
2922
2923         /* it's a non-present to present mapping. Only flush if caching mode */
2924         if (cap_caching_mode(iommu->cap))
2925                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
2926         else
2927                 iommu_flush_write_buffer(iommu);
2928
2929         return nelems;
2930 }
2931
2932 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2933 {
2934         return !dma_addr;
2935 }
2936
2937 struct dma_map_ops intel_dma_ops = {
2938         .alloc_coherent = intel_alloc_coherent,
2939         .free_coherent = intel_free_coherent,
2940         .map_sg = intel_map_sg,
2941         .unmap_sg = intel_unmap_sg,
2942         .map_page = intel_map_page,
2943         .unmap_page = intel_unmap_page,
2944         .mapping_error = intel_mapping_error,
2945 };
2946
2947 static inline int iommu_domain_cache_init(void)
2948 {
2949         int ret = 0;
2950
2951         iommu_domain_cache = kmem_cache_create("iommu_domain",
2952                                          sizeof(struct dmar_domain),
2953                                          0,
2954                                          SLAB_HWCACHE_ALIGN,
2956                                          NULL);
2957         if (!iommu_domain_cache) {
2958                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2959                 ret = -ENOMEM;
2960         }
2961
2962         return ret;
2963 }
2964
2965 static inline int iommu_devinfo_cache_init(void)
2966 {
2967         int ret = 0;
2968
2969         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2970                                          sizeof(struct device_domain_info),
2971                                          0,
2972                                          SLAB_HWCACHE_ALIGN,
2973                                          NULL);
2974         if (!iommu_devinfo_cache) {
2975                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2976                 ret = -ENOMEM;
2977         }
2978
2979         return ret;
2980 }
2981
2982 static inline int iommu_iova_cache_init(void)
2983 {
2984         int ret = 0;
2985
2986         iommu_iova_cache = kmem_cache_create("iommu_iova",
2987                                          sizeof(struct iova),
2988                                          0,
2989                                          SLAB_HWCACHE_ALIGN,
2990                                          NULL);
2991         if (!iommu_iova_cache) {
2992                 printk(KERN_ERR "Couldn't create iova cache\n");
2993                 ret = -ENOMEM;
2994         }
2995
2996         return ret;
2997 }
2998
2999 static int __init iommu_init_mempool(void)
3000 {
3001         int ret;
3002         ret = iommu_iova_cache_init();
3003         if (ret)
3004                 return ret;
3005
3006         ret = iommu_domain_cache_init();
3007         if (ret)
3008                 goto domain_error;
3009
3010         ret = iommu_devinfo_cache_init();
3011         if (!ret)
3012                 return ret;
3013
3014         kmem_cache_destroy(iommu_domain_cache);
3015 domain_error:
3016         kmem_cache_destroy(iommu_iova_cache);
3017
3018         return -ENOMEM;
3019 }
3020
3021 static void __init iommu_exit_mempool(void)
3022 {
3023         kmem_cache_destroy(iommu_devinfo_cache);
3024         kmem_cache_destroy(iommu_domain_cache);
3025         kmem_cache_destroy(iommu_iova_cache);
3026
3027 }
3028
3029 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3030 {
3031         struct dmar_drhd_unit *drhd;
3032         u32 vtbar;
3033         int rc;
3034
3035         /* We know that this device on this chipset has its own IOMMU.
3036          * If we find it under a different IOMMU, then the BIOS is lying
3037          * to us. Hope that the IOMMU for this device is actually
3038          * disabled, and it needs no translation...
3039          */
3040         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3041         if (rc) {
3042                 /* "can't" happen */
3043                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3044                 return;
3045         }
3046         vtbar &= 0xffff0000;
3047
3048         /* we know that this iommu should be at offset 0xa000 from vtbar */
3049         drhd = dmar_find_matched_drhd_unit(pdev);
3050         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3051                             TAINT_FIRMWARE_WORKAROUND,
3052                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3053                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3054 }
3055 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3056
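/*
 * Mark DRHD units that cover no PCI devices as ignored.  If graphics
 * translation is disabled (dmar_map_gfx == 0), also ignore units that
 * cover only graphics devices and point those devices at the dummy
 * domain info so they bypass DMA remapping.
 */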
3057 static void __init init_no_remapping_devices(void)
3058 {
3059         struct dmar_drhd_unit *drhd;
3060
3061         for_each_drhd_unit(drhd) {
3062                 if (!drhd->include_all) {
3063                         int i;
3064                         for (i = 0; i < drhd->devices_cnt; i++)
3065                                 if (drhd->devices[i] != NULL)
3066                                         break;
3067                         /* ignore DMAR unit if no pci devices exist */
3068                         if (i == drhd->devices_cnt)
3069                                 drhd->ignored = 1;
3070                 }
3071         }
3072
3073         if (dmar_map_gfx)
3074                 return;
3075
3076         for_each_drhd_unit(drhd) {
3077                 int i;
3078                 if (drhd->ignored || drhd->include_all)
3079                         continue;
3080
3081                 for (i = 0; i < drhd->devices_cnt; i++)
3082                         if (drhd->devices[i] &&
3083                                 !IS_GFX_DEVICE(drhd->devices[i]))
3084                                 break;
3085
3086                 if (i < drhd->devices_cnt)
3087                         continue;
3088
3089                 /* bypass IOMMU if it is just for gfx devices */
3090                 drhd->ignored = 1;
3091                 for (i = 0; i < drhd->devices_cnt; i++) {
3092                         if (!drhd->devices[i])
3093                                 continue;
3094                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3095                 }
3096         }
3097 }
3098
3099 #ifdef CONFIG_SUSPEND
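/*
 * Re-program every active IOMMU on resume: re-enable queued
 * invalidation, reload the root entry, flush the context and IOTLB
 * caches globally and turn translation back on.
 */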
3100 static int init_iommu_hw(void)
3101 {
3102         struct dmar_drhd_unit *drhd;
3103         struct intel_iommu *iommu = NULL;
3104
3105         for_each_active_iommu(iommu, drhd)
3106                 if (iommu->qi)
3107                         dmar_reenable_qi(iommu);
3108
3109         for_each_active_iommu(iommu, drhd) {
3110                 iommu_flush_write_buffer(iommu);
3111
3112                 iommu_set_root_entry(iommu);
3113
3114                 iommu->flush.flush_context(iommu, 0, 0, 0,
3115                                            DMA_CCMD_GLOBAL_INVL);
3116                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3117                                          DMA_TLB_GLOBAL_FLUSH);
3118                 iommu_enable_translation(iommu);
3119                 iommu_disable_protect_mem_regions(iommu);
3120         }
3121
3122         return 0;
3123 }
3124
3125 static void iommu_flush_all(void)
3126 {
3127         struct dmar_drhd_unit *drhd;
3128         struct intel_iommu *iommu;
3129
3130         for_each_active_iommu(iommu, drhd) {
3131                 iommu->flush.flush_context(iommu, 0, 0, 0,
3132                                            DMA_CCMD_GLOBAL_INVL);
3133                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3134                                          DMA_TLB_GLOBAL_FLUSH);
3135         }
3136 }
3137
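/*
 * Suspend callback: save the fault-event registers of every active
 * IOMMU and disable translation; the snapshots are written back by
 * iommu_resume().
 */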
3138 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
3139 {
3140         struct dmar_drhd_unit *drhd;
3141         struct intel_iommu *iommu = NULL;
3142         unsigned long flag;
3143
3144         for_each_active_iommu(iommu, drhd) {
3145                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3146                                                  GFP_ATOMIC);
3147                 if (!iommu->iommu_state)
3148                         goto nomem;
3149         }
3150
3151         iommu_flush_all();
3152
3153         for_each_active_iommu(iommu, drhd) {
3154                 iommu_disable_translation(iommu);
3155
3156                 spin_lock_irqsave(&iommu->register_lock, flag);
3157
3158                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3159                         readl(iommu->reg + DMAR_FECTL_REG);
3160                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3161                         readl(iommu->reg + DMAR_FEDATA_REG);
3162                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3163                         readl(iommu->reg + DMAR_FEADDR_REG);
3164                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3165                         readl(iommu->reg + DMAR_FEUADDR_REG);
3166
3167                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3168         }
3169         return 0;
3170
3171 nomem:
3172         for_each_active_iommu(iommu, drhd)
3173                 kfree(iommu->iommu_state);
3174
3175         return -ENOMEM;
3176 }
3177
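/*
 * Resume callback: re-initialise the hardware via init_iommu_hw() and
 * restore the fault-event registers saved in iommu_suspend().
 */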
3178 static int iommu_resume(struct sys_device *dev)
3179 {
3180         struct dmar_drhd_unit *drhd;
3181         struct intel_iommu *iommu = NULL;
3182         unsigned long flag;
3183
3184         if (init_iommu_hw()) {
3185                 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3186                 return -EIO;
3187         }
3188
3189         for_each_active_iommu(iommu, drhd) {
3190
3191                 spin_lock_irqsave(&iommu->register_lock, flag);
3192
3193                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3194                         iommu->reg + DMAR_FECTL_REG);
3195                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3196                         iommu->reg + DMAR_FEDATA_REG);
3197                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3198                         iommu->reg + DMAR_FEADDR_REG);
3199                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3200                         iommu->reg + DMAR_FEUADDR_REG);
3201
3202                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3203         }
3204
3205         for_each_active_iommu(iommu, drhd)
3206                 kfree(iommu->iommu_state);
3207
3208         return 0;
3209 }
3210
3211 static struct sysdev_class iommu_sysclass = {
3212         .name           = "iommu",
3213         .resume         = iommu_resume,
3214         .suspend        = iommu_suspend,
3215 };
3216
3217 static struct sys_device device_iommu = {
3218         .cls    = &iommu_sysclass,
3219 };
3220
3221 static int __init init_iommu_sysfs(void)
3222 {
3223         int error;
3224
3225         error = sysdev_class_register(&iommu_sysclass);
3226         if (error)
3227                 return error;
3228
3229         error = sysdev_register(&device_iommu);
3230         if (error)
3231                 sysdev_class_unregister(&iommu_sysclass);
3232
3233         return error;
3234 }
3235
3236 #else
3237 static int __init init_iommu_sysfs(void)
3238 {
3239         return 0;
3240 }
3241 #endif  /* CONFIG_SUSPEND */
3242
3243 /*
3244  * Here we only respond to the action of a device being unbound from its driver.
3245  *
3246  * A newly added device is not attached to its DMAR domain here; that happens
3247  * when the device is first mapped to an iova.
3248  */
3249 static int device_notifier(struct notifier_block *nb,
3250                                   unsigned long action, void *data)
3251 {
3252         struct device *dev = data;
3253         struct pci_dev *pdev = to_pci_dev(dev);
3254         struct dmar_domain *domain;
3255
3256         if (iommu_no_mapping(dev))
3257                 return 0;
3258
3259         domain = find_domain(pdev);
3260         if (!domain)
3261                 return 0;
3262
3263         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through)
3264                 domain_remove_one_dev_info(domain, pdev);
3265
3266         return 0;
3267 }
3268
3269 static struct notifier_block device_nb = {
3270         .notifier_call = device_notifier,
3271 };
3272
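/*
 * Main VT-d initialisation: parse the DMAR tables, set up the slab
 * caches and reserved IOVA ranges, program the IOMMUs via init_dmars()
 * and finally install intel_dma_ops and register the IOMMU API ops.
 */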
3273 int __init intel_iommu_init(void)
3274 {
3275         int ret = 0;
3276         int force_on = 0;
3277
3278         /* VT-d is required for a TXT/tboot launch, so enforce that */
3279         force_on = tboot_force_iommu();
3280
3281         if (dmar_table_init()) {
3282                 if (force_on)
3283                         panic("tboot: Failed to initialize DMAR table\n");
3284                 return  -ENODEV;
3285         }
3286
3287         if (dmar_dev_scope_init()) {
3288                 if (force_on)
3289                         panic("tboot: Failed to initialize DMAR device scope\n");
3290                 return  -ENODEV;
3291         }
3292
3293         /*
3294          * Check the need for DMA-remapping initialization now.
3295          * Above initialization will also be used by Interrupt-remapping.
3296          */
3297         if (no_iommu || dmar_disabled)
3298                 return -ENODEV;
3299
3300         iommu_init_mempool();
3301         dmar_init_reserved_ranges();
3302
3303         init_no_remapping_devices();
3304
3305         ret = init_dmars();
3306         if (ret) {
3307                 if (force_on)
3308                         panic("tboot: Failed to initialize DMARs\n");
3309                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3310                 put_iova_domain(&reserved_iova_list);
3311                 iommu_exit_mempool();
3312                 return ret;
3313         }
3314         printk(KERN_INFO
3315                "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3316
3317         init_timer(&unmap_timer);
3318 #ifdef CONFIG_SWIOTLB
3319         swiotlb = 0;
3320 #endif
3321         dma_ops = &intel_dma_ops;
3322
3323         init_iommu_sysfs();
3324
3325         register_iommu(&intel_iommu_ops);
3326
3327         bus_register_notifier(&pci_bus_type, &device_nb);
3328
3329         return 0;
3330 }
3331
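/*
 * Clear the context entries programmed for the bridges upstream of
 * pdev, mirroring the mappings created when a device behind a
 * PCIe-to-PCI bridge was attached.
 */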
3332 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3333                                            struct pci_dev *pdev)
3334 {
3335         struct pci_dev *tmp, *parent;
3336
3337         if (!iommu || !pdev)
3338                 return;
3339
3340         /* dependent device detach */
3341         tmp = pci_find_upstream_pcie_bridge(pdev);
3342         /* Secondary interface's bus number and devfn 0 */
3343         if (tmp) {
3344                 parent = pdev->bus->self;
3345                 while (parent != tmp) {
3346                         iommu_detach_dev(iommu, parent->bus->number,
3347                                          parent->devfn);
3348                         parent = parent->bus->self;
3349                 }
3350                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3351                         iommu_detach_dev(iommu,
3352                                 tmp->subordinate->number, 0);
3353                 else /* this is a legacy PCI bridge */
3354                         iommu_detach_dev(iommu, tmp->bus->number,
3355                                          tmp->devfn);
3356         }
3357 }
3358
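/*
 * Detach a single device from @domain: unlink its device_domain_info,
 * clear its context entry (and those of any upstream bridges) and, if
 * no other device behind the same IOMMU is left in the domain, drop
 * that IOMMU from the domain's bitmap.
 */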
3359 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3360                                           struct pci_dev *pdev)
3361 {
3362         struct device_domain_info *info;
3363         struct intel_iommu *iommu;
3364         unsigned long flags;
3365         int found = 0;
3366         struct list_head *entry, *tmp;
3367
3368         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3369                                 pdev->devfn);
3370         if (!iommu)
3371                 return;
3372
3373         spin_lock_irqsave(&device_domain_lock, flags);
3374         list_for_each_safe(entry, tmp, &domain->devices) {
3375                 info = list_entry(entry, struct device_domain_info, link);
3376                 /* No need to compare PCI domain; it has to be the same */
3377                 if (info->bus == pdev->bus->number &&
3378                     info->devfn == pdev->devfn) {
3379                         list_del(&info->link);
3380                         list_del(&info->global);
3381                         if (info->dev)
3382                                 info->dev->dev.archdata.iommu = NULL;
3383                         spin_unlock_irqrestore(&device_domain_lock, flags);
3384
3385                         iommu_disable_dev_iotlb(info);
3386                         iommu_detach_dev(iommu, info->bus, info->devfn);
3387                         iommu_detach_dependent_devices(iommu, pdev);
3388                         free_devinfo_mem(info);
3389
3390                         spin_lock_irqsave(&device_domain_lock, flags);
3391
3392                         if (found)
3393                                 break;
3394                         else
3395                                 continue;
3396                 }
3397
3398                 /* if there are no other devices under the same iommu
3399                  * owned by this domain, clear this iommu in iommu_bmp,
3400                  * update the iommu count and coherency
3401                  */
3402                 if (iommu == device_to_iommu(info->segment, info->bus,
3403                                             info->devfn))
3404                         found = 1;
3405         }
3406
3407         if (found == 0) {
3408                 unsigned long tmp_flags;
3409                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3410                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3411                 domain->iommu_count--;
3412                 domain_update_iommu_cap(domain);
3413                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3414         }
3415
3416         spin_unlock_irqrestore(&device_domain_lock, flags);
3417 }
3418
3419 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3420 {
3421         struct device_domain_info *info;
3422         struct intel_iommu *iommu;
3423         unsigned long flags1, flags2;
3424
3425         spin_lock_irqsave(&device_domain_lock, flags1);
3426         while (!list_empty(&domain->devices)) {
3427                 info = list_entry(domain->devices.next,
3428                         struct device_domain_info, link);
3429                 list_del(&info->link);
3430                 list_del(&info->global);
3431                 if (info->dev)
3432                         info->dev->dev.archdata.iommu = NULL;
3433
3434                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3435
3436                 iommu_disable_dev_iotlb(info);
3437                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3438                 iommu_detach_dev(iommu, info->bus, info->devfn);
3439                 iommu_detach_dependent_devices(iommu, info->dev);
3440
3441                 /* clear this iommu in iommu_bmp, update iommu count
3442                  * and capabilities
3443                  */
3444                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3445                 if (test_and_clear_bit(iommu->seq_id,
3446                                        &domain->iommu_bmp)) {
3447                         domain->iommu_count--;
3448                         domain_update_iommu_cap(domain);
3449                 }
3450                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3451
3452                 free_devinfo_mem(info);
3453                 spin_lock_irqsave(&device_domain_lock, flags1);
3454         }
3455         spin_unlock_irqrestore(&device_domain_lock, flags1);
3456 }
3457
3458 /* domain id for virtual machine; it is never written into a context entry */
3459 static unsigned long vm_domid;
3460
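/*
 * Allocate a bare virtual-machine domain for the generic IOMMU API.
 * It gets an id from the separate vm_domid space and starts with an
 * empty iommu_bmp; IOMMUs are added as devices are attached.
 */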
3461 static struct dmar_domain *iommu_alloc_vm_domain(void)
3462 {
3463         struct dmar_domain *domain;
3464
3465         domain = alloc_domain_mem();
3466         if (!domain)
3467                 return NULL;
3468
3469         domain->id = vm_domid++;
3470         domain->nid = -1;
3471         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3472         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3473
3474         return domain;
3475 }
3476
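/*
 * Set up the address space of a newly allocated VM domain: initialise
 * the IOVA allocator, reserve the special ranges, derive the adjusted
 * guest address width and allocate the top-level page directory.
 */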
3477 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3478 {
3479         int adjust_width;
3480
3481         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3482         spin_lock_init(&domain->iommu_lock);
3483
3484         domain_reserve_special_ranges(domain);
3485
3486         /* calculate AGAW */
3487         domain->gaw = guest_width;
3488         adjust_width = guestwidth_to_adjustwidth(guest_width);
3489         domain->agaw = width_to_agaw(adjust_width);
3490
3491         INIT_LIST_HEAD(&domain->devices);
3492
3493         domain->iommu_count = 0;
3494         domain->iommu_coherency = 0;
3495         domain->iommu_snooping = 0;
3496         domain->max_addr = 0;
3497         domain->nid = -1;
3498
3499         /* always allocate the top pgd */
3500         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3501         if (!domain->pgd)
3502                 return -ENOMEM;
3503         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3504         return 0;
3505 }
3506
3507 static void iommu_free_vm_domain(struct dmar_domain *domain)
3508 {
3509         unsigned long flags;
3510         struct dmar_drhd_unit *drhd;
3511         struct intel_iommu *iommu;
3512         unsigned long i;
3513         unsigned long ndomains;
3514
3515         for_each_drhd_unit(drhd) {
3516                 if (drhd->ignored)
3517                         continue;
3518                 iommu = drhd->iommu;
3519
3520                 ndomains = cap_ndoms(iommu->cap);
3521                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3522                         if (iommu->domains[i] == domain) {
3523                                 spin_lock_irqsave(&iommu->lock, flags);
3524                                 clear_bit(i, iommu->domain_ids);
3525                                 iommu->domains[i] = NULL;
3526                                 spin_unlock_irqrestore(&iommu->lock, flags);
3527                                 break;
3528                         }
3529                 }
3530         }
3531 }
3532
3533 static void vm_domain_exit(struct dmar_domain *domain)
3534 {
3535         /* Domain 0 is reserved, so don't process it */
3536         if (!domain)
3537                 return;
3538
3539         vm_domain_remove_all_dev_info(domain);
3540         /* destroy iovas */
3541         put_iova_domain(&domain->iovad);
3542
3543         /* clear ptes */
3544         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3545
3546         /* free page tables */
3547         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3548
3549         iommu_free_vm_domain(domain);
3550         free_domain_mem(domain);
3551 }
3552
3553 static int intel_iommu_domain_init(struct iommu_domain *domain)
3554 {
3555         struct dmar_domain *dmar_domain;
3556
3557         dmar_domain = iommu_alloc_vm_domain();
3558         if (!dmar_domain) {
3559                 printk(KERN_ERR
3560                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3561                 return -ENOMEM;
3562         }
3563         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3564                 printk(KERN_ERR
3565                         "intel_iommu_domain_init() failed\n");
3566                 vm_domain_exit(dmar_domain);
3567                 return -ENOMEM;
3568         }
3569         domain->priv = dmar_domain;
3570
3571         return 0;
3572 }
3573
3574 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3575 {
3576         struct dmar_domain *dmar_domain = domain->priv;
3577
3578         domain->priv = NULL;
3579         vm_domain_exit(dmar_domain);
3580 }
3581
3582 static int intel_iommu_attach_device(struct iommu_domain *domain,
3583                                      struct device *dev)
3584 {
3585         struct dmar_domain *dmar_domain = domain->priv;
3586         struct pci_dev *pdev = to_pci_dev(dev);
3587         struct intel_iommu *iommu;
3588         int addr_width;
3589
3590         /* normally pdev is not mapped */
3591         if (unlikely(domain_context_mapped(pdev))) {
3592                 struct dmar_domain *old_domain;
3593
3594                 old_domain = find_domain(pdev);
3595                 if (old_domain) {
3596                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3597                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3598                                 domain_remove_one_dev_info(old_domain, pdev);
3599                         else
3600                                 domain_remove_dev_info(old_domain);
3601                 }
3602         }
3603
3604         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3605                                 pdev->devfn);
3606         if (!iommu)
3607                 return -ENODEV;
3608
3609         /* check if this iommu agaw is sufficient for max mapped address */
3610         addr_width = agaw_to_width(iommu->agaw);
3611         if (addr_width > cap_mgaw(iommu->cap))
3612                 addr_width = cap_mgaw(iommu->cap);
3613
3614         if (dmar_domain->max_addr > (1LL << addr_width)) {
3615                 printk(KERN_ERR "%s: iommu width (%d) is not "
3616                        "sufficient for the mapped address (%llx)\n",
3617                        __func__, addr_width, dmar_domain->max_addr);
3618                 return -EFAULT;
3619         }
3620         dmar_domain->gaw = addr_width;
3621
3622         /*
3623          * Knock out extra levels of page tables if necessary
3624          */
3625         while (iommu->agaw < dmar_domain->agaw) {
3626                 struct dma_pte *pte;
3627
3628                 pte = dmar_domain->pgd;
3629                 if (dma_pte_present(pte)) {
3630                         free_pgtable_page(dmar_domain->pgd);
3631                         dmar_domain->pgd = (struct dma_pte *)
3632                                 phys_to_virt(dma_pte_addr(pte));
3633                 }
3634                 dmar_domain->agaw--;
3635         }
3636
3637         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3638 }
3639
3640 static void intel_iommu_detach_device(struct iommu_domain *domain,
3641                                       struct device *dev)
3642 {
3643         struct dmar_domain *dmar_domain = domain->priv;
3644         struct pci_dev *pdev = to_pci_dev(dev);
3645
3646         domain_remove_one_dev_info(dmar_domain, pdev);
3647 }
3648
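/*
 * iommu_ops->map callback: convert IOMMU_READ/WRITE/CACHE into DMA PTE
 * bits, verify the mapping still fits the domain's guest address width
 * (growing max_addr if needed) and install the page-frame mapping.
 */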
3649 static int intel_iommu_map(struct iommu_domain *domain,
3650                            unsigned long iova, phys_addr_t hpa,
3651                            int gfp_order, int iommu_prot)
3652 {
3653         struct dmar_domain *dmar_domain = domain->priv;
3654         u64 max_addr;
3655         int prot = 0;
3656         size_t size;
3657         int ret;
3658
3659         if (iommu_prot & IOMMU_READ)
3660                 prot |= DMA_PTE_READ;
3661         if (iommu_prot & IOMMU_WRITE)
3662                 prot |= DMA_PTE_WRITE;
3663         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3664                 prot |= DMA_PTE_SNP;
3665
3666         size     = PAGE_SIZE << gfp_order;
3667         max_addr = iova + size;
3668         if (dmar_domain->max_addr < max_addr) {
3669                 u64 end;
3670
3671                 /* check if minimum agaw is sufficient for mapped address */
3672                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3673                 if (end < max_addr) {
3674                         printk(KERN_ERR "%s: iommu width (%d) is not "
3675                                "sufficient for the mapped address (%llx)\n",
3676                                __func__, dmar_domain->gaw, max_addr);
3677                         return -EFAULT;
3678                 }
3679                 dmar_domain->max_addr = max_addr;
3680         }
3681         /* Round up size to next multiple of PAGE_SIZE, if it and
3682            the low bits of hpa would take us onto the next page */
3683         size = aligned_nrpages(hpa, size);
3684         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3685                                  hpa >> VTD_PAGE_SHIFT, size, prot);
3686         return ret;
3687 }
3688
3689 static int intel_iommu_unmap(struct iommu_domain *domain,
3690                              unsigned long iova, int gfp_order)
3691 {
3692         struct dmar_domain *dmar_domain = domain->priv;
3693         size_t size = PAGE_SIZE << gfp_order;
3694
3695         dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3696                             (iova + size - 1) >> VTD_PAGE_SHIFT);
3697
3698         if (dmar_domain->max_addr == iova + size)
3699                 dmar_domain->max_addr = iova;
3700
3701         return gfp_order;
3702 }
3703
3704 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3705                                             unsigned long iova)
3706 {
3707         struct dmar_domain *dmar_domain = domain->priv;
3708         struct dma_pte *pte;
3709         u64 phys = 0;
3710
3711         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3712         if (pte)
3713                 phys = dma_pte_addr(pte);
3714
3715         return phys;
3716 }
3717
3718 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3719                                       unsigned long cap)
3720 {
3721         struct dmar_domain *dmar_domain = domain->priv;
3722
3723         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3724                 return dmar_domain->iommu_snooping;
3725         if (cap == IOMMU_CAP_INTR_REMAP)
3726                 return intr_remapping_enabled;
3727
3728         return 0;
3729 }
3730
3731 static struct iommu_ops intel_iommu_ops = {
3732         .domain_init    = intel_iommu_domain_init,
3733         .domain_destroy = intel_iommu_domain_destroy,
3734         .attach_dev     = intel_iommu_attach_device,
3735         .detach_dev     = intel_iommu_detach_device,
3736         .map            = intel_iommu_map,
3737         .unmap          = intel_iommu_unmap,
3738         .iova_to_phys   = intel_iommu_iova_to_phys,
3739         .domain_has_cap = intel_iommu_domain_has_cap,
3740 };
3741
3742 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3743 {
3744         /*
3745          * Mobile 4 Series Chipset neglects to set RWBF capability,
3746          * but needs it:
3747          */
3748         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3749         rwbf_quirk = 1;
3750
3751         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
3752         if (dev->revision == 0x07) {
3753                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
3754                 dmar_map_gfx = 0;
3755         }
3756 }
3757
3758 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3759
3760 #define GGC 0x52
3761 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
3762 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
3763 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
3764 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
3765 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
3766 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
3767 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
3768 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
3769
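/*
 * If the BIOS has not set GGC_MEMORY_VT_ENABLED there is no shadow GTT
 * allocated for the graphics IOMMU, so disable translation for
 * graphics (dmar_map_gfx = 0) rather than risk using it.
 */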
3770 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
3771 {
3772         unsigned short ggc;
3773
3774         if (pci_read_config_word(dev, GGC, &ggc))
3775                 return;
3776
3777         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
3778                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
3779                 dmar_map_gfx = 0;
3780         }
3781 }
3782 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
3783 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
3784 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
3785 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
3786
3787 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3788    ISOCH DMAR unit for the Azalia sound device, but not give it any
3789    TLB entries, which causes it to deadlock. Check for that.  We do
3790    this in a function called from init_dmars(), instead of in a PCI
3791    quirk, because we don't want to print the obnoxious "BIOS broken"
3792    message if VT-d is actually disabled.
3793 */
3794 static void __init check_tylersburg_isoch(void)
3795 {
3796         struct pci_dev *pdev;
3797         uint32_t vtisochctrl;
3798
3799         /* If there's no Azalia in the system anyway, forget it. */
3800         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3801         if (!pdev)
3802                 return;
3803         pci_dev_put(pdev);
3804
3805         /* System Management Registers. Might be hidden, in which case
3806            we can't do the sanity check. But that's OK, because the
3807            known-broken BIOSes _don't_ actually hide it, so far. */
3808         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3809         if (!pdev)
3810                 return;
3811
3812         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3813                 pci_dev_put(pdev);
3814                 return;
3815         }
3816
3817         pci_dev_put(pdev);
3818
3819         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3820         if (vtisochctrl & 1)
3821                 return;
3822
3823         /* Drop all bits other than the number of TLB entries */
3824         vtisochctrl &= 0x1c;
3825
3826         /* If we have the recommended number of TLB entries (16), fine. */
3827         if (vtisochctrl == 0x10)
3828                 return;
3829
3830         /* Zero TLB entries? You get to ride the short bus to school. */
3831         if (!vtisochctrl) {
3832                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3833                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3834                      dmi_get_system_info(DMI_BIOS_VENDOR),
3835                      dmi_get_system_info(DMI_BIOS_VERSION),
3836                      dmi_get_system_info(DMI_PRODUCT_VERSION));
3837                 iommu_identity_mapping |= IDENTMAP_AZALIA;
3838                 return;
3839         }
3840
3841         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3842                vtisochctrl);
3843 }