[pandora-kernel.git] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
45
46 #define ROOT_SIZE               VTD_PAGE_SIZE
47 #define CONTEXT_SIZE            VTD_PAGE_SIZE
48
49 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
50 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
51 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
52
53 #define IOAPIC_RANGE_START      (0xfee00000)
54 #define IOAPIC_RANGE_END        (0xfeefffff)
55 #define IOVA_START_ADDR         (0x1000)
56
57 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
58
59 #define MAX_AGAW_WIDTH 64
60
61 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
62 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
63
64 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
65    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
66 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
67                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
68 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
69
70 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
71 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
72 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
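/*
 * Worked example (illustrative, assuming VTD_PAGE_SHIFT == 12 and 4KiB
 * MM pages): for the default 48-bit guest address width,
 * __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1.  On 64-bit builds
 * DOMAIN_MAX_PFN(48) keeps that value; on 32-bit builds it is clamped to
 * ULONG_MAX so that PFNs always fit in an unsigned long.  Similarly,
 * IOVA_PFN(DMA_BIT_MASK(32)) == 0xfffff, i.e. DMA_32BIT_PFN is the last
 * page frame reachable with a 32-bit DMA mask.
 */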
73
74
75 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
76    are never going to work. */
77 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
78 {
79         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
80 }
81
82 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
83 {
84         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
85 }
86 static inline unsigned long page_to_dma_pfn(struct page *pg)
87 {
88         return mm_to_dma_pfn(page_to_pfn(pg));
89 }
90 static inline unsigned long virt_to_dma_pfn(void *p)
91 {
92         return page_to_dma_pfn(virt_to_page(p));
93 }
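/*
 * Illustrative note (not from the original code): on x86 with 4KiB pages,
 * PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so dma_to_mm_pfn() and
 * mm_to_dma_pfn() are identity conversions.  On a configuration with
 * 64KiB MM pages they would shift by 4, i.e. one MM page spans 16 VT-d
 * pages, which is why VT-d pages must never be larger than MM pages.
 */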
94
95 /* global iommu list, set NULL for ignored DMAR units */
96 static struct intel_iommu **g_iommus;
97
98 static void __init check_tylersburg_isoch(void);
99 static int rwbf_quirk;
100
101 /*
102  * 0: Present
103  * 1-11: Reserved
104  * 12-63: Context Ptr (12 - (haw-1))
105  * 64-127: Reserved
106  */
107 struct root_entry {
108         u64     val;
109         u64     rsvd1;
110 };
111 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
112 static inline bool root_present(struct root_entry *root)
113 {
114         return (root->val & 1);
115 }
116 static inline void set_root_present(struct root_entry *root)
117 {
118         root->val |= 1;
119 }
120 static inline void set_root_value(struct root_entry *root, unsigned long value)
121 {
122         root->val |= value & VTD_PAGE_MASK;
123 }
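/*
 * Illustrative sketch (kept out of the build): how a root entry is
 * populated, mirroring device_to_context_entry() below.  'ctx_table' is
 * a hypothetical page-sized context table.
 */
#if 0
static void example_install_context_table(struct root_entry *root,
					   struct context_entry *ctx_table)
{
	unsigned long phys = virt_to_phys(ctx_table);

	set_root_value(root, phys);	/* bits 12-63: context table pointer */
	set_root_present(root);		/* bit 0: present */
}
#endif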
124
125 static inline struct context_entry *
126 get_context_addr_from_root(struct root_entry *root)
127 {
128         return (struct context_entry *)
129                 (root_present(root)?phys_to_virt(
130                 root->val & VTD_PAGE_MASK) :
131                 NULL);
132 }
133
134 /*
135  * low 64 bits:
136  * 0: present
137  * 1: fault processing disable
138  * 2-3: translation type
139  * 12-63: address space root
140  * high 64 bits:
141  * 0-2: address width
142  * 3-6: avail
143  * 8-23: domain id
144  */
145 struct context_entry {
146         u64 lo;
147         u64 hi;
148 };
149
150 static inline bool context_present(struct context_entry *context)
151 {
152         return (context->lo & 1);
153 }
154 static inline void context_set_present(struct context_entry *context)
155 {
156         context->lo |= 1;
157 }
158
159 static inline void context_set_fault_enable(struct context_entry *context)
160 {
161         context->lo &= (((u64)-1) << 2) | 1;
162 }
163
164 static inline void context_set_translation_type(struct context_entry *context,
165                                                 unsigned long value)
166 {
167         context->lo &= (((u64)-1) << 4) | 3;
168         context->lo |= (value & 3) << 2;
169 }
170
171 static inline void context_set_address_root(struct context_entry *context,
172                                             unsigned long value)
173 {
174         context->lo |= value & VTD_PAGE_MASK;
175 }
176
177 static inline void context_set_address_width(struct context_entry *context,
178                                              unsigned long value)
179 {
180         context->hi |= value & 7;
181 }
182
183 static inline void context_set_domain_id(struct context_entry *context,
184                                          unsigned long value)
185 {
186         context->hi |= (value & ((1 << 16) - 1)) << 8;
187 }
188
189 static inline void context_clear_entry(struct context_entry *context)
190 {
191         context->lo = 0;
192         context->hi = 0;
193 }
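/*
 * Illustrative sketch (kept out of the build): the order in which a
 * context entry is normally programmed, mirroring
 * domain_context_mapping_one() further down.  'pgd_phys', 'agaw' and
 * 'did' are hypothetical values.
 */
#if 0
static void example_program_context(struct context_entry *ctx,
				    unsigned long pgd_phys, int agaw, u16 did)
{
	context_set_domain_id(ctx, did);		/* hi bits 8-23 */
	context_set_address_root(ctx, pgd_phys);	/* lo bits 12-63 */
	context_set_address_width(ctx, agaw);		/* hi bits 0-2 */
	context_set_translation_type(ctx, CONTEXT_TT_MULTI_LEVEL);
	context_set_fault_enable(ctx);
	context_set_present(ctx);			/* lo bit 0 */
}
#endif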
194
195 /*
196  * 0: readable
197  * 1: writable
198  * 2-6: reserved
199  * 7: super page
200  * 8-10: available
201  * 11: snoop behavior
202  * 12-63: Host physical address
203  */
204 struct dma_pte {
205         u64 val;
206 };
207
208 static inline void dma_clear_pte(struct dma_pte *pte)
209 {
210         pte->val = 0;
211 }
212
213 static inline void dma_set_pte_readable(struct dma_pte *pte)
214 {
215         pte->val |= DMA_PTE_READ;
216 }
217
218 static inline void dma_set_pte_writable(struct dma_pte *pte)
219 {
220         pte->val |= DMA_PTE_WRITE;
221 }
222
223 static inline void dma_set_pte_snp(struct dma_pte *pte)
224 {
225         pte->val |= DMA_PTE_SNP;
226 }
227
228 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
229 {
230         pte->val = (pte->val & ~3) | (prot & 3);
231 }
232
233 static inline u64 dma_pte_addr(struct dma_pte *pte)
234 {
235 #ifdef CONFIG_64BIT
236         return pte->val & VTD_PAGE_MASK;
237 #else
238         /* Must have a full atomic 64-bit read */
239         return  __cmpxchg64(pte, 0ULL, 0ULL) & VTD_PAGE_MASK;
240 #endif
241 }
242
243 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
244 {
245         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
246 }
247
248 static inline bool dma_pte_present(struct dma_pte *pte)
249 {
250         return (pte->val & 3) != 0;
251 }
252
253 static inline int first_pte_in_page(struct dma_pte *pte)
254 {
255         return !((unsigned long)pte & ~VTD_PAGE_MASK);
256 }
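/*
 * Illustrative sketch (kept out of the build): composing a leaf PTE for
 * a read/write mapping of page frame 'pfn', as __domain_mapping() does
 * in bulk further down.
 */
#if 0
static void example_make_rw_pte(struct dma_pte *pte, unsigned long pfn)
{
	dma_clear_pte(pte);
	dma_set_pte_readable(pte);	/* bit 0 */
	dma_set_pte_writable(pte);	/* bit 1 */
	dma_set_pte_pfn(pte, pfn);	/* bits 12-63 */
}
#endif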
257
258 /*
259  * This domain is a static identity mapping domain.
260  *      1. This domain creates a static 1:1 mapping of all usable memory.
261  *      2. It maps to each iommu if successful.
262  *      3. Each iommu maps to this domain if successful.
263  */
264 static struct dmar_domain *si_domain;
265 static int hw_pass_through = 1;
266
267 /* devices under the same p2p bridge are owned in one domain */
268 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
269
270 /* domain represents a virtual machine; more than one device
271  * across iommus may be owned by one domain, e.g. a kvm guest.
272  */
273 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
274
275 /* si_domain contains multiple devices */
276 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
277
278 struct dmar_domain {
279         int     id;                     /* domain id */
280         int     nid;                    /* node id */
281         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
282
283         struct list_head devices;       /* all devices' list */
284         struct iova_domain iovad;       /* iova's that belong to this domain */
285
286         struct dma_pte  *pgd;           /* virtual address */
287         int             gaw;            /* max guest address width */
288
289         /* adjusted guest address width, 0 is level 2 30-bit */
290         int             agaw;
291
292         int             flags;          /* flags to find out type of domain */
293
294         int             iommu_coherency;/* indicate coherency of iommu access */
295         int             iommu_snooping; /* indicate snooping control feature*/
296         int             iommu_count;    /* reference count of iommu */
297         spinlock_t      iommu_lock;     /* protect iommu set in domain */
298         u64             max_addr;       /* maximum mapped address */
299 };
300
301 /* PCI domain-device relationship */
302 struct device_domain_info {
303         struct list_head link;  /* link to domain siblings */
304         struct list_head global; /* link to global list */
305         int segment;            /* PCI domain */
306         u8 bus;                 /* PCI bus number */
307         u8 devfn;               /* PCI devfn number */
308         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
309         struct intel_iommu *iommu; /* IOMMU used by this device */
310         struct dmar_domain *domain; /* pointer to domain */
311 };
312
313 static void flush_unmaps_timeout(unsigned long data);
314
315 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
316
317 #define HIGH_WATER_MARK 250
318 struct deferred_flush_tables {
319         int next;
320         struct iova *iova[HIGH_WATER_MARK];
321         struct dmar_domain *domain[HIGH_WATER_MARK];
322 };
323
324 static struct deferred_flush_tables *deferred_flush;
325
326 /* number of IOMMUs; used to size g_iommus and the per-domain iommu bitmaps */
327 static int g_num_of_iommus;
328
329 static DEFINE_SPINLOCK(async_umap_flush_lock);
330 static LIST_HEAD(unmaps_to_do);
331
332 static int timer_on;
333 static long list_size;
334
335 static void domain_remove_dev_info(struct dmar_domain *domain);
336
337 #ifdef CONFIG_DMAR_DEFAULT_ON
338 int dmar_disabled = 0;
339 #else
340 int dmar_disabled = 1;
341 #endif /*CONFIG_DMAR_DEFAULT_ON*/
342
343 static int __initdata dmar_map_gfx = 1;
344 static int dmar_forcedac;
345 static int intel_iommu_strict;
346
347 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
348 static DEFINE_SPINLOCK(device_domain_lock);
349 static LIST_HEAD(device_domain_list);
350
351 static struct iommu_ops intel_iommu_ops;
352
353 static int __init intel_iommu_setup(char *str)
354 {
355         if (!str)
356                 return -EINVAL;
357         while (*str) {
358                 if (!strncmp(str, "on", 2)) {
359                         dmar_disabled = 0;
360                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
361                 } else if (!strncmp(str, "off", 3)) {
362                         dmar_disabled = 1;
363                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
364                 } else if (!strncmp(str, "igfx_off", 8)) {
365                         dmar_map_gfx = 0;
366                         printk(KERN_INFO
367                                 "Intel-IOMMU: disable GFX device mapping\n");
368                 } else if (!strncmp(str, "forcedac", 8)) {
369                         printk(KERN_INFO
370                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
371                         dmar_forcedac = 1;
372                 } else if (!strncmp(str, "strict", 6)) {
373                         printk(KERN_INFO
374                                 "Intel-IOMMU: disable batched IOTLB flush\n");
375                         intel_iommu_strict = 1;
376                 }
377
378                 str += strcspn(str, ",");
379                 while (*str == ',')
380                         str++;
381         }
382         return 0;
383 }
384 __setup("intel_iommu=", intel_iommu_setup);
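/*
 * Example usage of the boot parameter parsed above (illustrative):
 *
 *	intel_iommu=on,strict		enable DMAR and disable batched
 *					IOTLB flushing
 *	intel_iommu=igfx_off		keep the IOMMU away from the GFX
 *					device
 *	intel_iommu=off			disable DMAR entirely
 */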
385
386 static struct kmem_cache *iommu_domain_cache;
387 static struct kmem_cache *iommu_devinfo_cache;
388 static struct kmem_cache *iommu_iova_cache;
389
390 static inline void *alloc_pgtable_page(int node)
391 {
392         struct page *page;
393         void *vaddr = NULL;
394
395         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
396         if (page)
397                 vaddr = page_address(page);
398         return vaddr;
399 }
400
401 static inline void free_pgtable_page(void *vaddr)
402 {
403         free_page((unsigned long)vaddr);
404 }
405
406 static inline void *alloc_domain_mem(void)
407 {
408         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
409 }
410
411 static void free_domain_mem(void *vaddr)
412 {
413         kmem_cache_free(iommu_domain_cache, vaddr);
414 }
415
416 static inline void *alloc_devinfo_mem(void)
417 {
418         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
419 }
420
421 static inline void free_devinfo_mem(void *vaddr)
422 {
423         kmem_cache_free(iommu_devinfo_cache, vaddr);
424 }
425
426 struct iova *alloc_iova_mem(void)
427 {
428         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
429 }
430
431 void free_iova_mem(struct iova *iova)
432 {
433         kmem_cache_free(iommu_iova_cache, iova);
434 }
435
436
437 static inline int width_to_agaw(int width);
438
439 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
440 {
441         unsigned long sagaw;
442         int agaw = -1;
443
444         sagaw = cap_sagaw(iommu->cap);
445         for (agaw = width_to_agaw(max_gaw);
446              agaw >= 0; agaw--) {
447                 if (test_bit(agaw, &sagaw))
448                         break;
449         }
450
451         return agaw;
452 }
453
454 /*
455  * Calculate max SAGAW for each iommu.
456  */
457 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
458 {
459         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
460 }
461
462 /*
463  * Calculate agaw for each iommu.
464  * "SAGAW" may differ across iommus, so use a default agaw and fall back
465  * to a smaller supported agaw for iommus that don't support the default.
466  */
467 int iommu_calculate_agaw(struct intel_iommu *iommu)
468 {
469         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
470 }
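/*
 * Worked example (illustrative): with DEFAULT_DOMAIN_ADDRESS_WIDTH == 48,
 * width_to_agaw(48) == (48 - 30) / 9 == 2, i.e. a 4-level page table.
 * __iommu_calculate_agaw() then walks down from bit 2 of SAGAW; if the
 * hardware only advertises 3-level tables (bit 1), agaw 1 (39-bit) is
 * returned instead.
 */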
471
472 /* This function only returns a single iommu in a domain */
473 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
474 {
475         int iommu_id;
476
477         /* si_domain and vm domain should not get here. */
478         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
479         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
480
481         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
482         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
483                 return NULL;
484
485         return g_iommus[iommu_id];
486 }
487
488 static void domain_update_iommu_coherency(struct dmar_domain *domain)
489 {
490         int i;
491
492         domain->iommu_coherency = 1;
493
494         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
495                 if (!ecap_coherent(g_iommus[i]->ecap)) {
496                         domain->iommu_coherency = 0;
497                         break;
498                 }
499         }
500 }
501
502 static void domain_update_iommu_snooping(struct dmar_domain *domain)
503 {
504         int i;
505
506         domain->iommu_snooping = 1;
507
508         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
509                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
510                         domain->iommu_snooping = 0;
511                         break;
512                 }
513         }
514 }
515
516 /* Some capabilities may be different across iommus */
517 static void domain_update_iommu_cap(struct dmar_domain *domain)
518 {
519         domain_update_iommu_coherency(domain);
520         domain_update_iommu_snooping(domain);
521 }
522
523 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
524 {
525         struct dmar_drhd_unit *drhd = NULL;
526         int i;
527
528         for_each_drhd_unit(drhd) {
529                 if (drhd->ignored)
530                         continue;
531                 if (segment != drhd->segment)
532                         continue;
533
534                 for (i = 0; i < drhd->devices_cnt; i++) {
535                         if (drhd->devices[i] &&
536                             drhd->devices[i]->bus->number == bus &&
537                             drhd->devices[i]->devfn == devfn)
538                                 return drhd->iommu;
539                         if (drhd->devices[i] &&
540                             drhd->devices[i]->subordinate &&
541                             drhd->devices[i]->subordinate->number <= bus &&
542                             drhd->devices[i]->subordinate->subordinate >= bus)
543                                 return drhd->iommu;
544                 }
545
546                 if (drhd->include_all)
547                         return drhd->iommu;
548         }
549
550         return NULL;
551 }
552
553 static void domain_flush_cache(struct dmar_domain *domain,
554                                void *addr, int size)
555 {
556         if (!domain->iommu_coherency)
557                 clflush_cache_range(addr, size);
558 }
559
560 /* Gets context entry for a given bus and devfn */
561 static struct context_entry *device_to_context_entry(struct intel_iommu *iommu,
562                 u8 bus, u8 devfn)
563 {
564         struct root_entry *root;
565         struct context_entry *context;
566         unsigned long phy_addr;
567         unsigned long flags;
568
569         spin_lock_irqsave(&iommu->lock, flags);
570         root = &iommu->root_entry[bus];
571         context = get_context_addr_from_root(root);
572         if (!context) {
573                 context = (struct context_entry *)
574                                 alloc_pgtable_page(iommu->node);
575                 if (!context) {
576                         spin_unlock_irqrestore(&iommu->lock, flags);
577                         return NULL;
578                 }
579                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
580                 phy_addr = virt_to_phys((void *)context);
581                 set_root_value(root, phy_addr);
582                 set_root_present(root);
583                 __iommu_flush_cache(iommu, root, sizeof(*root));
584         }
585         spin_unlock_irqrestore(&iommu->lock, flags);
586         return &context[devfn];
587 }
588
589 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
590 {
591         struct root_entry *root;
592         struct context_entry *context;
593         int ret;
594         unsigned long flags;
595
596         spin_lock_irqsave(&iommu->lock, flags);
597         root = &iommu->root_entry[bus];
598         context = get_context_addr_from_root(root);
599         if (!context) {
600                 ret = 0;
601                 goto out;
602         }
603         ret = context_present(&context[devfn]);
604 out:
605         spin_unlock_irqrestore(&iommu->lock, flags);
606         return ret;
607 }
608
609 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
610 {
611         struct root_entry *root;
612         struct context_entry *context;
613         unsigned long flags;
614
615         spin_lock_irqsave(&iommu->lock, flags);
616         root = &iommu->root_entry[bus];
617         context = get_context_addr_from_root(root);
618         if (context) {
619                 context_clear_entry(&context[devfn]);
620                 __iommu_flush_cache(iommu, &context[devfn],
621                         sizeof(*context));
622         }
623         spin_unlock_irqrestore(&iommu->lock, flags);
624 }
625
626 static void free_context_table(struct intel_iommu *iommu)
627 {
628         struct root_entry *root;
629         int i;
630         unsigned long flags;
631         struct context_entry *context;
632
633         spin_lock_irqsave(&iommu->lock, flags);
634         if (!iommu->root_entry) {
635                 goto out;
636         }
637         for (i = 0; i < ROOT_ENTRY_NR; i++) {
638                 root = &iommu->root_entry[i];
639                 context = get_context_addr_from_root(root);
640                 if (context)
641                         free_pgtable_page(context);
642         }
643         free_pgtable_page(iommu->root_entry);
644         iommu->root_entry = NULL;
645 out:
646         spin_unlock_irqrestore(&iommu->lock, flags);
647 }
648
649 /* page table handling */
650 #define LEVEL_STRIDE            (9)
651 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
652
653 static inline int agaw_to_level(int agaw)
654 {
655         return agaw + 2;
656 }
657
658 static inline int agaw_to_width(int agaw)
659 {
660         return 30 + agaw * LEVEL_STRIDE;
661
662 }
663
664 static inline int width_to_agaw(int width)
665 {
666         return (width - 30) / LEVEL_STRIDE;
667 }
668
669 static inline unsigned int level_to_offset_bits(int level)
670 {
671         return (level - 1) * LEVEL_STRIDE;
672 }
673
674 static inline int pfn_level_offset(unsigned long pfn, int level)
675 {
676         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
677 }
678
679 static inline unsigned long level_mask(int level)
680 {
681         return -1UL << level_to_offset_bits(level);
682 }
683
684 static inline unsigned long level_size(int level)
685 {
686         return 1UL << level_to_offset_bits(level);
687 }
688
689 static inline unsigned long align_to_level(unsigned long pfn, int level)
690 {
691         return (pfn + level_size(level) - 1) & level_mask(level);
692 }
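/*
 * Worked example (illustrative): for the default agaw of 2 the page table
 * has agaw_to_level(2) == 4 levels.  A DMA pfn is then decomposed into
 * four 9-bit indices: pfn_level_offset(pfn, 4) selects the top-level slot
 * (bits 27-35 of the pfn), level 3 uses bits 18-26, level 2 bits 9-17 and
 * level 1 bits 0-8.  level_size(2) == 512 pfns, i.e. one level-1 table's
 * worth of pages.
 */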
693
694 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
695                                       unsigned long pfn)
696 {
697         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
698         struct dma_pte *parent, *pte = NULL;
699         int level = agaw_to_level(domain->agaw);
700         int offset;
701
702         BUG_ON(!domain->pgd);
703         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
704         parent = domain->pgd;
705
706         while (level > 0) {
707                 void *tmp_page;
708
709                 offset = pfn_level_offset(pfn, level);
710                 pte = &parent[offset];
711                 if (level == 1)
712                         break;
713
714                 if (!dma_pte_present(pte)) {
715                         uint64_t pteval;
716
717                         tmp_page = alloc_pgtable_page(domain->nid);
718
719                         if (!tmp_page)
720                                 return NULL;
721
722                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
723                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
724                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
725                                 /* Someone else set it while we were thinking; use theirs. */
726                                 free_pgtable_page(tmp_page);
727                         } else {
728                                 dma_pte_addr(pte);
729                                 domain_flush_cache(domain, pte, sizeof(*pte));
730                         }
731                 }
732                 parent = phys_to_virt(dma_pte_addr(pte));
733                 level--;
734         }
735
736         return pte;
737 }
738
739 /* return the pte for an address at a specific level */
740 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
741                                          unsigned long pfn,
742                                          int level)
743 {
744         struct dma_pte *parent, *pte = NULL;
745         int total = agaw_to_level(domain->agaw);
746         int offset;
747
748         parent = domain->pgd;
749         while (level <= total) {
750                 offset = pfn_level_offset(pfn, total);
751                 pte = &parent[offset];
752                 if (level == total)
753                         return pte;
754
755                 if (!dma_pte_present(pte))
756                         break;
757                 parent = phys_to_virt(dma_pte_addr(pte));
758                 total--;
759         }
760         return NULL;
761 }
762
763 /* clear last level pte; a tlb flush should follow */
764 static void dma_pte_clear_range(struct dmar_domain *domain,
765                                 unsigned long start_pfn,
766                                 unsigned long last_pfn)
767 {
768         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
769         struct dma_pte *first_pte, *pte;
770
771         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
772         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
773         BUG_ON(start_pfn > last_pfn);
774
775         /* we don't need lock here; nobody else touches the iova range */
776         do {
777                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
778                 if (!pte) {
779                         start_pfn = align_to_level(start_pfn + 1, 2);
780                         continue;
781                 }
782                 do { 
783                         dma_clear_pte(pte);
784                         start_pfn++;
785                         pte++;
786                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
787
788                 domain_flush_cache(domain, first_pte,
789                                    (void *)pte - (void *)first_pte);
790
791         } while (start_pfn && start_pfn <= last_pfn);
792 }
793
794 /* free page table pages. last level pte should already be cleared */
795 static void dma_pte_free_pagetable(struct dmar_domain *domain,
796                                    unsigned long start_pfn,
797                                    unsigned long last_pfn)
798 {
799         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
800         struct dma_pte *first_pte, *pte;
801         int total = agaw_to_level(domain->agaw);
802         int level;
803         unsigned long tmp;
804
805         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
806         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
807         BUG_ON(start_pfn > last_pfn);
808
809         /* We don't need lock here; nobody else touches the iova range */
810         level = 2;
811         while (level <= total) {
812                 tmp = align_to_level(start_pfn, level);
813
814                 /* If we can't even clear one PTE at this level, we're done */
815                 if (tmp + level_size(level) - 1 > last_pfn)
816                         return;
817
818                 do {
819                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
820                         if (!pte) {
821                                 tmp = align_to_level(tmp + 1, level + 1);
822                                 continue;
823                         }
824                         do {
825                                 if (dma_pte_present(pte)) {
826                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
827                                         dma_clear_pte(pte);
828                                 }
829                                 pte++;
830                                 tmp += level_size(level);
831                         } while (!first_pte_in_page(pte) &&
832                                  tmp + level_size(level) - 1 <= last_pfn);
833
834                         domain_flush_cache(domain, first_pte,
835                                            (void *)pte - (void *)first_pte);
836                         
837                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
838                 level++;
839         }
840         /* free pgd */
841         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
842                 free_pgtable_page(domain->pgd);
843                 domain->pgd = NULL;
844         }
845 }
846
847 /* iommu handling */
848 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
849 {
850         struct root_entry *root;
851         unsigned long flags;
852
853         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
854         if (!root)
855                 return -ENOMEM;
856
857         __iommu_flush_cache(iommu, root, ROOT_SIZE);
858
859         spin_lock_irqsave(&iommu->lock, flags);
860         iommu->root_entry = root;
861         spin_unlock_irqrestore(&iommu->lock, flags);
862
863         return 0;
864 }
865
866 static void iommu_set_root_entry(struct intel_iommu *iommu)
867 {
868         void *addr;
869         u32 sts;
870         unsigned long flag;
871
872         addr = iommu->root_entry;
873
874         spin_lock_irqsave(&iommu->register_lock, flag);
875         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
876
877         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
878
879         /* Make sure hardware completes it */
880         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
881                       readl, (sts & DMA_GSTS_RTPS), sts);
882
883         spin_unlock_irqrestore(&iommu->register_lock, flag);
884 }
885
886 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
887 {
888         u32 val;
889         unsigned long flag;
890
891         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
892                 return;
893
894         spin_lock_irqsave(&iommu->register_lock, flag);
895         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
896
897         /* Make sure hardware completes it */
898         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
899                       readl, (!(val & DMA_GSTS_WBFS)), val);
900
901         spin_unlock_irqrestore(&iommu->register_lock, flag);
902 }
903
904 /* return value determines if we need a write buffer flush */
905 static void __iommu_flush_context(struct intel_iommu *iommu,
906                                   u16 did, u16 source_id, u8 function_mask,
907                                   u64 type)
908 {
909         u64 val = 0;
910         unsigned long flag;
911
912         switch (type) {
913         case DMA_CCMD_GLOBAL_INVL:
914                 val = DMA_CCMD_GLOBAL_INVL;
915                 break;
916         case DMA_CCMD_DOMAIN_INVL:
917                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
918                 break;
919         case DMA_CCMD_DEVICE_INVL:
920                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
921                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
922                 break;
923         default:
924                 BUG();
925         }
926         val |= DMA_CCMD_ICC;
927
928         spin_lock_irqsave(&iommu->register_lock, flag);
929         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
930
931         /* Make sure hardware completes it */
932         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
933                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
934
935         spin_unlock_irqrestore(&iommu->register_lock, flag);
936 }
937
938 /* return value determines if we need a write buffer flush */
939 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
940                                 u64 addr, unsigned int size_order, u64 type)
941 {
942         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
943         u64 val = 0, val_iva = 0;
944         unsigned long flag;
945
946         switch (type) {
947         case DMA_TLB_GLOBAL_FLUSH:
948                 /* global flush doesn't need to set IVA_REG */
949                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
950                 break;
951         case DMA_TLB_DSI_FLUSH:
952                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
953                 break;
954         case DMA_TLB_PSI_FLUSH:
955                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
956                 /* Note: always flush non-leaf currently */
957                 val_iva = size_order | addr;
958                 break;
959         default:
960                 BUG();
961         }
962         /* Note: set drain read/write */
963 #if 0
964         /*
965          * This is probably only needed to be extra safe; it looks like
966          * we can ignore it without any impact.
967          */
968         if (cap_read_drain(iommu->cap))
969                 val |= DMA_TLB_READ_DRAIN;
970 #endif
971         if (cap_write_drain(iommu->cap))
972                 val |= DMA_TLB_WRITE_DRAIN;
973
974         spin_lock_irqsave(&iommu->register_lock, flag);
975         /* Note: Only uses first TLB reg currently */
976         if (val_iva)
977                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
978         dmar_writeq(iommu->reg + tlb_offset + 8, val);
979
980         /* Make sure hardware completes it */
981         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
982                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
983
984         spin_unlock_irqrestore(&iommu->register_lock, flag);
985
986         /* check IOTLB invalidation granularity */
987         if (DMA_TLB_IAIG(val) == 0)
988                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
989         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
990                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
991                         (unsigned long long)DMA_TLB_IIRG(type),
992                         (unsigned long long)DMA_TLB_IAIG(val));
993 }
994
995 static struct device_domain_info *iommu_support_dev_iotlb(
996         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
997 {
998         int found = 0;
999         unsigned long flags;
1000         struct device_domain_info *info;
1001         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1002
1003         if (!ecap_dev_iotlb_support(iommu->ecap))
1004                 return NULL;
1005
1006         if (!iommu->qi)
1007                 return NULL;
1008
1009         spin_lock_irqsave(&device_domain_lock, flags);
1010         list_for_each_entry(info, &domain->devices, link)
1011                 if (info->bus == bus && info->devfn == devfn) {
1012                         found = 1;
1013                         break;
1014                 }
1015         spin_unlock_irqrestore(&device_domain_lock, flags);
1016
1017         if (!found || !info->dev)
1018                 return NULL;
1019
1020         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1021                 return NULL;
1022
1023         if (!dmar_find_matched_atsr_unit(info->dev))
1024                 return NULL;
1025
1026         info->iommu = iommu;
1027
1028         return info;
1029 }
1030
1031 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1032 {
1033         if (!info)
1034                 return;
1035
1036         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1037 }
1038
1039 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1040 {
1041         if (!info->dev || !pci_ats_enabled(info->dev))
1042                 return;
1043
1044         pci_disable_ats(info->dev);
1045 }
1046
1047 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1048                                   u64 addr, unsigned mask)
1049 {
1050         u16 sid, qdep;
1051         unsigned long flags;
1052         struct device_domain_info *info;
1053
1054         spin_lock_irqsave(&device_domain_lock, flags);
1055         list_for_each_entry(info, &domain->devices, link) {
1056                 if (!info->dev || !pci_ats_enabled(info->dev))
1057                         continue;
1058
1059                 sid = info->bus << 8 | info->devfn;
1060                 qdep = pci_ats_queue_depth(info->dev);
1061                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1062         }
1063         spin_unlock_irqrestore(&device_domain_lock, flags);
1064 }
1065
1066 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1067                                   unsigned long pfn, unsigned int pages, int map)
1068 {
1069         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1070         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1071
1072         BUG_ON(pages == 0);
1073
1074         /*
1075          * Fallback to domain selective flush if no PSI support or the size is
1076          * too big.
1077          * PSI requires page size to be 2 ^ x, and the base address is naturally
1078          * aligned to the size
1079          */
1080         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1081                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1082                                                 DMA_TLB_DSI_FLUSH);
1083         else
1084                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1085                                                 DMA_TLB_PSI_FLUSH);
1086
1087         /*
1088          * In caching mode, changes of pages from non-present to present require
1089          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1090          */
1091         if (!cap_caching_mode(iommu->cap) || !map)
1092                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1093 }
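/*
 * Worked example (illustrative): for a 3-page unmap,
 * __roundup_pow_of_two(3) == 4 and mask == ilog2(4) == 2, so a PSI flush
 * covers 4 VT-d pages starting at the (naturally aligned) base address.
 * If mask exceeded cap_max_amask_val() we would fall back to the
 * domain-selective flush above.
 */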
1094
1095 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1096 {
1097         u32 pmen;
1098         unsigned long flags;
1099
1100         spin_lock_irqsave(&iommu->register_lock, flags);
1101         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1102         pmen &= ~DMA_PMEN_EPM;
1103         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1104
1105         /* wait for the protected region status bit to clear */
1106         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1107                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1108
1109         spin_unlock_irqrestore(&iommu->register_lock, flags);
1110 }
1111
1112 static int iommu_enable_translation(struct intel_iommu *iommu)
1113 {
1114         u32 sts;
1115         unsigned long flags;
1116
1117         spin_lock_irqsave(&iommu->register_lock, flags);
1118         iommu->gcmd |= DMA_GCMD_TE;
1119         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1120
1121         /* Make sure hardware completes it */
1122         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1123                       readl, (sts & DMA_GSTS_TES), sts);
1124
1125         spin_unlock_irqrestore(&iommu->register_lock, flags);
1126         return 0;
1127 }
1128
1129 static int iommu_disable_translation(struct intel_iommu *iommu)
1130 {
1131         u32 sts;
1132         unsigned long flag;
1133
1134         spin_lock_irqsave(&iommu->register_lock, flag);
1135         iommu->gcmd &= ~DMA_GCMD_TE;
1136         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1137
1138         /* Make sure hardware completes it */
1139         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1140                       readl, (!(sts & DMA_GSTS_TES)), sts);
1141
1142         spin_unlock_irqrestore(&iommu->register_lock, flag);
1143         return 0;
1144 }
1145
1146
1147 static int iommu_init_domains(struct intel_iommu *iommu)
1148 {
1149         unsigned long ndomains;
1150         unsigned long nlongs;
1151
1152         ndomains = cap_ndoms(iommu->cap);
1153         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1154                         ndomains);
1155         nlongs = BITS_TO_LONGS(ndomains);
1156
1157         spin_lock_init(&iommu->lock);
1158
1159         /* TBD: there might be 64K domains,
1160          * consider a different allocation scheme for future chips
1161          */
1162         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1163         if (!iommu->domain_ids) {
1164                 printk(KERN_ERR "Allocating domain id array failed\n");
1165                 return -ENOMEM;
1166         }
1167         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1168                         GFP_KERNEL);
1169         if (!iommu->domains) {
1170                 printk(KERN_ERR "Allocating domain array failed\n");
1171                 return -ENOMEM;
1172         }
1173
1174         /*
1175          * if Caching mode is set, then invalid translations are tagged
1176          * with domain id 0. Hence we need to pre-allocate it.
1177          */
1178         if (cap_caching_mode(iommu->cap))
1179                 set_bit(0, iommu->domain_ids);
1180         return 0;
1181 }
1182
1183
1184 static void domain_exit(struct dmar_domain *domain);
1185 static void vm_domain_exit(struct dmar_domain *domain);
1186
1187 void free_dmar_iommu(struct intel_iommu *iommu)
1188 {
1189         struct dmar_domain *domain;
1190         int i;
1191         unsigned long flags;
1192
1193         if ((iommu->domains) && (iommu->domain_ids)) {
1194                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1195                         domain = iommu->domains[i];
1196                         clear_bit(i, iommu->domain_ids);
1197
1198                         spin_lock_irqsave(&domain->iommu_lock, flags);
1199                         if (--domain->iommu_count == 0) {
1200                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1201                                         vm_domain_exit(domain);
1202                                 else
1203                                         domain_exit(domain);
1204                         }
1205                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1206                 }
1207         }
1208
1209         if (iommu->gcmd & DMA_GCMD_TE)
1210                 iommu_disable_translation(iommu);
1211
1212         if (iommu->irq) {
1213                 set_irq_data(iommu->irq, NULL);
1214                 /* This will mask the irq */
1215                 free_irq(iommu->irq, iommu);
1216                 destroy_irq(iommu->irq);
1217         }
1218
1219         kfree(iommu->domains);
1220         kfree(iommu->domain_ids);
1221
1222         g_iommus[iommu->seq_id] = NULL;
1223
1224         /* if all iommus are freed, free g_iommus */
1225         for (i = 0; i < g_num_of_iommus; i++) {
1226                 if (g_iommus[i])
1227                         break;
1228         }
1229
1230         if (i == g_num_of_iommus)
1231                 kfree(g_iommus);
1232
1233         /* free context mapping */
1234         free_context_table(iommu);
1235 }
1236
1237 static struct dmar_domain *alloc_domain(void)
1238 {
1239         struct dmar_domain *domain;
1240
1241         domain = alloc_domain_mem();
1242         if (!domain)
1243                 return NULL;
1244
1245         domain->nid = -1;
1246         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1247         domain->flags = 0;
1248
1249         return domain;
1250 }
1251
1252 static int iommu_attach_domain(struct dmar_domain *domain,
1253                                struct intel_iommu *iommu)
1254 {
1255         int num;
1256         unsigned long ndomains;
1257         unsigned long flags;
1258
1259         ndomains = cap_ndoms(iommu->cap);
1260
1261         spin_lock_irqsave(&iommu->lock, flags);
1262
1263         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1264         if (num >= ndomains) {
1265                 spin_unlock_irqrestore(&iommu->lock, flags);
1266                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1267                 return -ENOMEM;
1268         }
1269
1270         domain->id = num;
1271         set_bit(num, iommu->domain_ids);
1272         set_bit(iommu->seq_id, &domain->iommu_bmp);
1273         iommu->domains[num] = domain;
1274         spin_unlock_irqrestore(&iommu->lock, flags);
1275
1276         return 0;
1277 }
1278
1279 static void iommu_detach_domain(struct dmar_domain *domain,
1280                                 struct intel_iommu *iommu)
1281 {
1282         unsigned long flags;
1283         int num, ndomains;
1284         int found = 0;
1285
1286         spin_lock_irqsave(&iommu->lock, flags);
1287         ndomains = cap_ndoms(iommu->cap);
1288         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1289                 if (iommu->domains[num] == domain) {
1290                         found = 1;
1291                         break;
1292                 }
1293         }
1294
1295         if (found) {
1296                 clear_bit(num, iommu->domain_ids);
1297                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1298                 iommu->domains[num] = NULL;
1299         }
1300         spin_unlock_irqrestore(&iommu->lock, flags);
1301 }
1302
1303 static struct iova_domain reserved_iova_list;
1304 static struct lock_class_key reserved_rbtree_key;
1305
1306 static void dmar_init_reserved_ranges(void)
1307 {
1308         struct pci_dev *pdev = NULL;
1309         struct iova *iova;
1310         int i;
1311
1312         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1313
1314         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1315                 &reserved_rbtree_key);
1316
1317         /* IOAPIC ranges shouldn't be accessed by DMA */
1318         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1319                 IOVA_PFN(IOAPIC_RANGE_END));
1320         if (!iova)
1321                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1322
1323         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1324         for_each_pci_dev(pdev) {
1325                 struct resource *r;
1326
1327                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1328                         r = &pdev->resource[i];
1329                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1330                                 continue;
1331                         iova = reserve_iova(&reserved_iova_list,
1332                                             IOVA_PFN(r->start),
1333                                             IOVA_PFN(r->end));
1334                         if (!iova)
1335                                 printk(KERN_ERR "Reserve iova failed\n");
1336                 }
1337         }
1338
1339 }
1340
1341 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1342 {
1343         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1344 }
1345
1346 static inline int guestwidth_to_adjustwidth(int gaw)
1347 {
1348         int agaw;
1349         int r = (gaw - 12) % 9;
1350
1351         if (r == 0)
1352                 agaw = gaw;
1353         else
1354                 agaw = gaw + 9 - r;
1355         if (agaw > 64)
1356                 agaw = 64;
1357         return agaw;
1358 }
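/*
 * Worked example (illustrative): guestwidth_to_adjustwidth(48) gives
 * r == (48 - 12) % 9 == 0, so agaw stays 48.  For a 40-bit guest width,
 * r == 1 and the width is rounded up to 48, the next value of the form
 * 12 + 9 * n that the page-table levels can represent.
 */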
1359
1360 static int domain_init(struct dmar_domain *domain, int guest_width)
1361 {
1362         struct intel_iommu *iommu;
1363         int adjust_width, agaw;
1364         unsigned long sagaw;
1365
1366         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1367         spin_lock_init(&domain->iommu_lock);
1368
1369         domain_reserve_special_ranges(domain);
1370
1371         /* calculate AGAW */
1372         iommu = domain_get_iommu(domain);
1373         if (guest_width > cap_mgaw(iommu->cap))
1374                 guest_width = cap_mgaw(iommu->cap);
1375         domain->gaw = guest_width;
1376         adjust_width = guestwidth_to_adjustwidth(guest_width);
1377         agaw = width_to_agaw(adjust_width);
1378         sagaw = cap_sagaw(iommu->cap);
1379         if (!test_bit(agaw, &sagaw)) {
1380                 /* hardware doesn't support it, choose a bigger one */
1381                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1382                 agaw = find_next_bit(&sagaw, 5, agaw);
1383                 if (agaw >= 5)
1384                         return -ENODEV;
1385         }
1386         domain->agaw = agaw;
1387         INIT_LIST_HEAD(&domain->devices);
1388
1389         if (ecap_coherent(iommu->ecap))
1390                 domain->iommu_coherency = 1;
1391         else
1392                 domain->iommu_coherency = 0;
1393
1394         if (ecap_sc_support(iommu->ecap))
1395                 domain->iommu_snooping = 1;
1396         else
1397                 domain->iommu_snooping = 0;
1398
1399         domain->iommu_count = 1;
1400         domain->nid = iommu->node;
1401
1402         /* always allocate the top pgd */
1403         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1404         if (!domain->pgd)
1405                 return -ENOMEM;
1406         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1407         return 0;
1408 }
1409
1410 static void domain_exit(struct dmar_domain *domain)
1411 {
1412         struct dmar_drhd_unit *drhd;
1413         struct intel_iommu *iommu;
1414
1415         /* Domain 0 is reserved, so don't process it */
1416         if (!domain)
1417                 return;
1418
1419         domain_remove_dev_info(domain);
1420         /* destroy iovas */
1421         put_iova_domain(&domain->iovad);
1422
1423         /* clear ptes */
1424         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1425
1426         /* free page tables */
1427         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1428
1429         for_each_active_iommu(iommu, drhd)
1430                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1431                         iommu_detach_domain(domain, iommu);
1432
1433         free_domain_mem(domain);
1434 }
1435
1436 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1437                                  u8 bus, u8 devfn, int translation)
1438 {
1439         struct context_entry *context;
1440         unsigned long flags;
1441         struct intel_iommu *iommu;
1442         struct dma_pte *pgd;
1443         unsigned long num;
1444         unsigned long ndomains;
1445         int id;
1446         int agaw;
1447         struct device_domain_info *info = NULL;
1448
1449         pr_debug("Set context mapping for %02x:%02x.%d\n",
1450                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1451
1452         BUG_ON(!domain->pgd);
1453         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1454                translation != CONTEXT_TT_MULTI_LEVEL);
1455
1456         iommu = device_to_iommu(segment, bus, devfn);
1457         if (!iommu)
1458                 return -ENODEV;
1459
1460         context = device_to_context_entry(iommu, bus, devfn);
1461         if (!context)
1462                 return -ENOMEM;
1463         spin_lock_irqsave(&iommu->lock, flags);
1464         if (context_present(context)) {
1465                 spin_unlock_irqrestore(&iommu->lock, flags);
1466                 return 0;
1467         }
1468
1469         id = domain->id;
1470         pgd = domain->pgd;
1471
1472         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1473             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1474                 int found = 0;
1475
1476                 /* find an available domain id for this device in iommu */
1477                 ndomains = cap_ndoms(iommu->cap);
1478                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1479                         if (iommu->domains[num] == domain) {
1480                                 id = num;
1481                                 found = 1;
1482                                 break;
1483                         }
1484                 }
1485
1486                 if (found == 0) {
1487                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1488                         if (num >= ndomains) {
1489                                 spin_unlock_irqrestore(&iommu->lock, flags);
1490                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1491                                 return -EFAULT;
1492                         }
1493
1494                         set_bit(num, iommu->domain_ids);
1495                         iommu->domains[num] = domain;
1496                         id = num;
1497                 }
1498
1499                 /* Skip top levels of page tables for
1500                  * iommus which have a smaller agaw than the default.
1501                  * Unnecessary for PT mode.
1502                  */
1503                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1504                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1505                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1506                                 if (!dma_pte_present(pgd)) {
1507                                         spin_unlock_irqrestore(&iommu->lock, flags);
1508                                         return -ENOMEM;
1509                                 }
1510                         }
1511                 }
1512         }
1513
1514         context_set_domain_id(context, id);
1515
1516         if (translation != CONTEXT_TT_PASS_THROUGH) {
1517                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1518                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1519                                      CONTEXT_TT_MULTI_LEVEL;
1520         }
1521         /*
1522          * In pass through mode, AW must be programmed to indicate the largest
1523          * AGAW value supported by hardware. And ASR is ignored by hardware.
1524          */
1525         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1526                 context_set_address_width(context, iommu->msagaw);
1527         else {
1528                 context_set_address_root(context, virt_to_phys(pgd));
1529                 context_set_address_width(context, iommu->agaw);
1530         }
1531
1532         context_set_translation_type(context, translation);
1533         context_set_fault_enable(context);
1534         context_set_present(context);
1535         domain_flush_cache(domain, context, sizeof(*context));
1536
1537         /*
1538          * It's a non-present to present mapping. If hardware doesn't cache
1539          * non-present entries we only need to flush the write-buffer. If it
1540          * _does_ cache non-present entries, then it does so in the special
1541          * domain #0, which we have to flush:
1542          */
1543         if (cap_caching_mode(iommu->cap)) {
1544                 iommu->flush.flush_context(iommu, 0,
1545                                            (((u16)bus) << 8) | devfn,
1546                                            DMA_CCMD_MASK_NOBIT,
1547                                            DMA_CCMD_DEVICE_INVL);
1548                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1549         } else {
1550                 iommu_flush_write_buffer(iommu);
1551         }
1552         iommu_enable_dev_iotlb(info);
1553         spin_unlock_irqrestore(&iommu->lock, flags);
1554
1555         spin_lock_irqsave(&domain->iommu_lock, flags);
1556         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1557                 domain->iommu_count++;
1558                 if (domain->iommu_count == 1)
1559                         domain->nid = iommu->node;
1560                 domain_update_iommu_cap(domain);
1561         }
1562         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1563         return 0;
1564 }
1565
1566 static int
1567 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1568                         int translation)
1569 {
1570         int ret;
1571         struct pci_dev *tmp, *parent;
1572
1573         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1574                                          pdev->bus->number, pdev->devfn,
1575                                          translation);
1576         if (ret)
1577                 return ret;
1578
1579         /* dependent device mapping */
1580         tmp = pci_find_upstream_pcie_bridge(pdev);
1581         if (!tmp)
1582                 return 0;
1583         /* Secondary interface's bus number and devfn 0 */
1584         parent = pdev->bus->self;
1585         while (parent != tmp) {
1586                 ret = domain_context_mapping_one(domain,
1587                                                  pci_domain_nr(parent->bus),
1588                                                  parent->bus->number,
1589                                                  parent->devfn, translation);
1590                 if (ret)
1591                         return ret;
1592                 parent = parent->bus->self;
1593         }
1594         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1595                 return domain_context_mapping_one(domain,
1596                                         pci_domain_nr(tmp->subordinate),
1597                                         tmp->subordinate->number, 0,
1598                                         translation);
1599         else /* this is a legacy PCI bridge */
1600                 return domain_context_mapping_one(domain,
1601                                                   pci_domain_nr(tmp->bus),
1602                                                   tmp->bus->number,
1603                                                   tmp->devfn,
1604                                                   translation);
1605 }
1606
1607 static int domain_context_mapped(struct pci_dev *pdev)
1608 {
1609         int ret;
1610         struct pci_dev *tmp, *parent;
1611         struct intel_iommu *iommu;
1612
1613         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1614                                 pdev->devfn);
1615         if (!iommu)
1616                 return -ENODEV;
1617
1618         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1619         if (!ret)
1620                 return ret;
1621         /* dependent device mapping */
1622         tmp = pci_find_upstream_pcie_bridge(pdev);
1623         if (!tmp)
1624                 return ret;
1625         /* Secondary interface's bus number and devfn 0 */
1626         parent = pdev->bus->self;
1627         while (parent != tmp) {
1628                 ret = device_context_mapped(iommu, parent->bus->number,
1629                                             parent->devfn);
1630                 if (!ret)
1631                         return ret;
1632                 parent = parent->bus->self;
1633         }
1634         if (pci_is_pcie(tmp))
1635                 return device_context_mapped(iommu, tmp->subordinate->number,
1636                                              0);
1637         else
1638                 return device_context_mapped(iommu, tmp->bus->number,
1639                                              tmp->devfn);
1640 }
1641
1642 /* Returns a number of VTD pages, but aligned to MM page size */
1643 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1644                                             size_t size)
1645 {
1646         host_addr &= ~PAGE_MASK;
1647         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1648 }
1649
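/*
 * Core mapping routine: map either a scatterlist or a contiguous range of
 * physical pages (phys_pfn) onto nr_pages of IOVA space starting at iov_pfn.
 * PTEs are written with cmpxchg so an already-present entry is reported
 * rather than silently overwritten, and the CPU cache is flushed once per
 * page-table page.
 */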
1650 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1651                             struct scatterlist *sg, unsigned long phys_pfn,
1652                             unsigned long nr_pages, int prot)
1653 {
1654         struct dma_pte *first_pte = NULL, *pte = NULL;
1655         phys_addr_t uninitialized_var(pteval);
1656         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1657         unsigned long sg_res;
1658
1659         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1660
1661         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1662                 return -EINVAL;
1663
1664         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1665
1666         if (sg)
1667                 sg_res = 0;
1668         else {
1669                 sg_res = nr_pages + 1;
1670                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1671         }
1672
1673         while (nr_pages--) {
1674                 uint64_t tmp;
1675
1676                 if (!sg_res) {
1677                         sg_res = aligned_nrpages(sg->offset, sg->length);
1678                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1679                         sg->dma_length = sg->length;
1680                         pteval = page_to_phys(sg_page(sg)) | prot;
1681                 }
1682                 if (!pte) {
1683                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1684                         if (!pte)
1685                                 return -ENOMEM;
1686                 }
1687                 /* We don't need a lock here; nobody else
1688                  * touches this iova range
1689                  */
1690                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1691                 if (tmp) {
1692                         static int dumps = 5;
1693                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1694                                iov_pfn, tmp, (unsigned long long)pteval);
1695                         if (dumps) {
1696                                 dumps--;
1697                                 debug_dma_dump_mappings(NULL);
1698                         }
1699                         WARN_ON(1);
1700                 }
1701                 pte++;
1702                 if (!nr_pages || first_pte_in_page(pte)) {
1703                         domain_flush_cache(domain, first_pte,
1704                                            (void *)pte - (void *)first_pte);
1705                         pte = NULL;
1706                 }
1707                 iov_pfn++;
1708                 pteval += VTD_PAGE_SIZE;
1709                 sg_res--;
1710                 if (!sg_res)
1711                         sg = sg_next(sg);
1712         }
1713         return 0;
1714 }
1715
1716 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1717                                     struct scatterlist *sg, unsigned long nr_pages,
1718                                     int prot)
1719 {
1720         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1721 }
1722
1723 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1724                                      unsigned long phys_pfn, unsigned long nr_pages,
1725                                      int prot)
1726 {
1727         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1728 }
1729
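/*
 * Clear the context entry for one device and invalidate the context cache
 * and IOTLB globally on that iommu.
 */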
1730 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1731 {
1732         if (!iommu)
1733                 return;
1734
1735         clear_context_table(iommu, bus, devfn);
1736         iommu->flush.flush_context(iommu, 0, 0, 0,
1737                                            DMA_CCMD_GLOBAL_INVL);
1738         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1739 }
1740
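/*
 * Detach every device from the domain: unlink its device_domain_info,
 * disable its device-IOTLB, tear down its context entry and free the info.
 */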
1741 static void domain_remove_dev_info(struct dmar_domain *domain)
1742 {
1743         struct device_domain_info *info;
1744         unsigned long flags;
1745         struct intel_iommu *iommu;
1746
1747         spin_lock_irqsave(&device_domain_lock, flags);
1748         while (!list_empty(&domain->devices)) {
1749                 info = list_entry(domain->devices.next,
1750                         struct device_domain_info, link);
1751                 list_del(&info->link);
1752                 list_del(&info->global);
1753                 if (info->dev)
1754                         info->dev->dev.archdata.iommu = NULL;
1755                 spin_unlock_irqrestore(&device_domain_lock, flags);
1756
1757                 iommu_disable_dev_iotlb(info);
1758                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1759                 iommu_detach_dev(iommu, info->bus, info->devfn);
1760                 free_devinfo_mem(info);
1761
1762                 spin_lock_irqsave(&device_domain_lock, flags);
1763         }
1764         spin_unlock_irqrestore(&device_domain_lock, flags);
1765 }
1766
1767 /*
1768  * find_domain
1769  * Note: struct pci_dev->dev.archdata.iommu stores the device_domain_info
1770  */
1771 static struct dmar_domain *
1772 find_domain(struct pci_dev *pdev)
1773 {
1774         struct device_domain_info *info;
1775
1776         /* No lock here, assumes no domain exit in normal case */
1777         info = pdev->dev.archdata.iommu;
1778         if (info)
1779                 return info->domain;
1780         return NULL;
1781 }
1782
1783 /* Find an existing domain for the device, or allocate and initialize one */
1784 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1785 {
1786         struct dmar_domain *domain, *found = NULL;
1787         struct intel_iommu *iommu;
1788         struct dmar_drhd_unit *drhd;
1789         struct device_domain_info *info, *tmp;
1790         struct pci_dev *dev_tmp;
1791         unsigned long flags;
1792         int bus = 0, devfn = 0;
1793         int segment;
1794         int ret;
1795
1796         domain = find_domain(pdev);
1797         if (domain)
1798                 return domain;
1799
1800         segment = pci_domain_nr(pdev->bus);
1801
1802         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1803         if (dev_tmp) {
1804                 if (pci_is_pcie(dev_tmp)) {
1805                         bus = dev_tmp->subordinate->number;
1806                         devfn = 0;
1807                 } else {
1808                         bus = dev_tmp->bus->number;
1809                         devfn = dev_tmp->devfn;
1810                 }
1811                 spin_lock_irqsave(&device_domain_lock, flags);
1812                 list_for_each_entry(info, &device_domain_list, global) {
1813                         if (info->segment == segment &&
1814                             info->bus == bus && info->devfn == devfn) {
1815                                 found = info->domain;
1816                                 break;
1817                         }
1818                 }
1819                 spin_unlock_irqrestore(&device_domain_lock, flags);
1820                 /* pcie-pci bridge already has a domain, use it */
1821                 if (found) {
1822                         domain = found;
1823                         goto found_domain;
1824                 }
1825         }
1826
1827         domain = alloc_domain();
1828         if (!domain)
1829                 goto error;
1830
1831         /* Allocate new domain for the device */
1832         drhd = dmar_find_matched_drhd_unit(pdev);
1833         if (!drhd) {
1834                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1835                         pci_name(pdev));
1836                 return NULL;
1837         }
1838         iommu = drhd->iommu;
1839
1840         ret = iommu_attach_domain(domain, iommu);
1841         if (ret) {
1842                 domain_exit(domain);
1843                 goto error;
1844         }
1845
1846         if (domain_init(domain, gaw)) {
1847                 domain_exit(domain);
1848                 goto error;
1849         }
1850
1851         /* register pcie-to-pci device */
1852         if (dev_tmp) {
1853                 info = alloc_devinfo_mem();
1854                 if (!info) {
1855                         domain_exit(domain);
1856                         goto error;
1857                 }
1858                 info->segment = segment;
1859                 info->bus = bus;
1860                 info->devfn = devfn;
1861                 info->dev = NULL;
1862                 info->domain = domain;
1863                 /* This domain is shared by devices under p2p bridge */
1864                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1865
1866                 /* pcie-to-pci bridge already has a domain, use it */
1867                 found = NULL;
1868                 spin_lock_irqsave(&device_domain_lock, flags);
1869                 list_for_each_entry(tmp, &device_domain_list, global) {
1870                         if (tmp->segment == segment &&
1871                             tmp->bus == bus && tmp->devfn == devfn) {
1872                                 found = tmp->domain;
1873                                 break;
1874                         }
1875                 }
1876                 if (found) {
1877                         free_devinfo_mem(info);
1878                         domain_exit(domain);
1879                         domain = found;
1880                 } else {
1881                         list_add(&info->link, &domain->devices);
1882                         list_add(&info->global, &device_domain_list);
1883                 }
1884                 spin_unlock_irqrestore(&device_domain_lock, flags);
1885         }
1886
1887 found_domain:
1888         info = alloc_devinfo_mem();
1889         if (!info)
1890                 goto error;
1891         info->segment = segment;
1892         info->bus = pdev->bus->number;
1893         info->devfn = pdev->devfn;
1894         info->dev = pdev;
1895         info->domain = domain;
1896         spin_lock_irqsave(&device_domain_lock, flags);
1897         /* somebody else was faster and already set up a domain */
1898         found = find_domain(pdev);
1899         if (found != NULL) {
1900                 spin_unlock_irqrestore(&device_domain_lock, flags);
1901                 if (found != domain) {
1902                         domain_exit(domain);
1903                         domain = found;
1904                 }
1905                 free_devinfo_mem(info);
1906                 return domain;
1907         }
1908         list_add(&info->link, &domain->devices);
1909         list_add(&info->global, &device_domain_list);
1910         pdev->dev.archdata.iommu = info;
1911         spin_unlock_irqrestore(&device_domain_lock, flags);
1912         return domain;
1913 error:
1914         /* recheck it here, maybe someone else set it meanwhile */
1915         return find_domain(pdev);
1916 }
1917
1918 static int iommu_identity_mapping;
1919 #define IDENTMAP_ALL            1
1920 #define IDENTMAP_GFX            2
1921 #define IDENTMAP_AZALIA         4
1922
1923 static int iommu_domain_identity_map(struct dmar_domain *domain,
1924                                      unsigned long long start,
1925                                      unsigned long long end)
1926 {
1927         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1928         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1929
1930         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1931                           dma_to_mm_pfn(last_vpfn))) {
1932                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1933                 return -ENOMEM;
1934         }
1935
1936         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1937                  start, end, domain->id);
1938         /*
1939          * RMRR range might have overlap with physical memory range,
1940          * clear it first
1941          */
1942         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1943
1944         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1945                                   last_vpfn - first_vpfn + 1,
1946                                   DMA_PTE_READ|DMA_PTE_WRITE);
1947 }
1948
1949 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1950                                       unsigned long long start,
1951                                       unsigned long long end)
1952 {
1953         struct dmar_domain *domain;
1954         int ret;
1955
1956         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1957         if (!domain)
1958                 return -ENOMEM;
1959
1960         /* For _hardware_ passthrough, don't bother. But for software
1961            passthrough, we do it anyway -- it may indicate a memory
1962            range which is reserved in E820 and so didn't get set
1963            up in si_domain to start with */
1964         if (domain == si_domain && hw_pass_through) {
1965                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1966                        pci_name(pdev), start, end);
1967                 return 0;
1968         }
1969
1970         printk(KERN_INFO
1971                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1972                pci_name(pdev), start, end);
1973
1974         if (end < start) {
1975                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1976                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1977                         dmi_get_system_info(DMI_BIOS_VENDOR),
1978                         dmi_get_system_info(DMI_BIOS_VERSION),
1979                         dmi_get_system_info(DMI_PRODUCT_VERSION));
1980                 ret = -EIO;
1981                 goto error;
1982         }
1983
1984         if (end >> agaw_to_width(domain->agaw)) {
1985                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1986                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1987                      agaw_to_width(domain->agaw),
1988                      dmi_get_system_info(DMI_BIOS_VENDOR),
1989                      dmi_get_system_info(DMI_BIOS_VERSION),
1990                      dmi_get_system_info(DMI_PRODUCT_VERSION));
1991                 ret = -EIO;
1992                 goto error;
1993         }
1994
1995         ret = iommu_domain_identity_map(domain, start, end);
1996         if (ret)
1997                 goto error;
1998
1999         /* context entry init */
2000         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2001         if (ret)
2002                 goto error;
2003
2004         return 0;
2005
2006  error:
2007         domain_exit(domain);
2008         return ret;
2009 }
2010
2011 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2012         struct pci_dev *pdev)
2013 {
2014         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2015                 return 0;
2016         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2017                 rmrr->end_address + 1);
2018 }
2019
2020 #ifdef CONFIG_DMAR_FLOPPY_WA
2021 static inline void iommu_prepare_isa(void)
2022 {
2023         struct pci_dev *pdev;
2024         int ret;
2025
2026         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2027         if (!pdev)
2028                 return;
2029
2030         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2031         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2032
2033         if (ret)
2034                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2035                        "floppy might not work\n");
2036
2037 }
2038 #else
2039 static inline void iommu_prepare_isa(void)
2040 {
2041         return;
2042 }
2043 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2044
2045 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2046
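/* Identity-map one range of physical memory into the static identity domain */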
2047 static int __init si_domain_work_fn(unsigned long start_pfn,
2048                                     unsigned long end_pfn, void *datax)
2049 {
2050         int *ret = datax;
2051
2052         *ret = iommu_domain_identity_map(si_domain,
2053                                          (uint64_t)start_pfn << PAGE_SHIFT,
2054                                          (uint64_t)end_pfn << PAGE_SHIFT);
2055         return *ret;
2056
2057 }
2058
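/*
 * Set up the static identity (si) domain used for identity-mapped devices:
 * attach it to every active iommu and, unless hardware pass-through is in
 * use, map all usable physical memory 1:1 into it.
 */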
2059 static int __init si_domain_init(int hw)
2060 {
2061         struct dmar_drhd_unit *drhd;
2062         struct intel_iommu *iommu;
2063         int nid, ret = 0;
2064
2065         si_domain = alloc_domain();
2066         if (!si_domain)
2067                 return -EFAULT;
2068
2069         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2070
2071         for_each_active_iommu(iommu, drhd) {
2072                 ret = iommu_attach_domain(si_domain, iommu);
2073                 if (ret) {
2074                         domain_exit(si_domain);
2075                         return -EFAULT;
2076                 }
2077         }
2078
2079         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2080                 domain_exit(si_domain);
2081                 return -EFAULT;
2082         }
2083
2084         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2085
2086         if (hw)
2087                 return 0;
2088
2089         for_each_online_node(nid) {
2090                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2091                 if (ret)
2092                         return ret;
2093         }
2094
2095         return 0;
2096 }
2097
2098 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2099                                           struct pci_dev *pdev);
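/* Return 1 if the device is currently attached to the static identity domain */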
2100 static int identity_mapping(struct pci_dev *pdev)
2101 {
2102         struct device_domain_info *info;
2103
2104         if (likely(!iommu_identity_mapping))
2105                 return 0;
2106
2108         list_for_each_entry(info, &si_domain->devices, link)
2109                 if (info->dev == pdev)
2110                         return 1;
2111         return 0;
2112 }
2113
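/*
 * Attach a device to the given domain: set up its context mapping and
 * record its device_domain_info on the domain and global lists.
 */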
2114 static int domain_add_dev_info(struct dmar_domain *domain,
2115                                struct pci_dev *pdev,
2116                                int translation)
2117 {
2118         struct device_domain_info *info;
2119         unsigned long flags;
2120         int ret;
2121
2122         info = alloc_devinfo_mem();
2123         if (!info)
2124                 return -ENOMEM;
2125
2126         ret = domain_context_mapping(domain, pdev, translation);
2127         if (ret) {
2128                 free_devinfo_mem(info);
2129                 return ret;
2130         }
2131
2132         info->segment = pci_domain_nr(pdev->bus);
2133         info->bus = pdev->bus->number;
2134         info->devfn = pdev->devfn;
2135         info->dev = pdev;
2136         info->domain = domain;
2137
2138         spin_lock_irqsave(&device_domain_lock, flags);
2139         list_add(&info->link, &domain->devices);
2140         list_add(&info->global, &device_domain_list);
2141         pdev->dev.archdata.iommu = info;
2142         spin_unlock_irqrestore(&device_domain_lock, flags);
2143
2144         return 0;
2145 }
2146
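/* Decide whether a device should be put into the 1:1 identity-mapped domain */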
2147 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2148 {
2149         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2150                 return 1;
2151
2152         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2153                 return 1;
2154
2155         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2156                 return 0;
2157
2158         /*
2159          * We want to start off with all devices in the 1:1 domain, and
2160          * take them out later if we find they can't access all of memory.
2161          *
2162          * However, we can't do this for PCI devices behind bridges,
2163          * because all PCI devices behind the same bridge will end up
2164          * with the same source-id on their transactions.
2165          *
2166          * Practically speaking, we can't change things around for these
2167          * devices at run-time, because we can't be sure there'll be no
2168          * DMA transactions in flight for any of their siblings.
2169          * 
2170          * So PCI devices (unless they're on the root bus) as well as
2171          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2172          * the 1:1 domain, just in _case_ one of their siblings turns out
2173          * not to be able to map all of memory.
2174          */
2175         if (!pci_is_pcie(pdev)) {
2176                 if (!pci_is_root_bus(pdev->bus))
2177                         return 0;
2178                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2179                         return 0;
2180         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2181                 return 0;
2182
2183         /* 
2184          * At boot time, we don't yet know if devices will be 64-bit capable.
2185          * Assume that they will -- if they turn out not to be, then we can 
2186          * take them out of the 1:1 domain later.
2187          */
2188         if (!startup)
2189                 return pdev->dma_mask > DMA_BIT_MASK(32);
2190
2191         return 1;
2192 }
2193
2194 static int __init iommu_prepare_static_identity_mapping(int hw)
2195 {
2196         struct pci_dev *pdev = NULL;
2197         int ret;
2198
2199         ret = si_domain_init(hw);
2200         if (ret)
2201                 return -EFAULT;
2202
2203         for_each_pci_dev(pdev) {
2204                 if (iommu_should_identity_map(pdev, 1)) {
2205                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2206                                hw ? "hardware" : "software", pci_name(pdev));
2207
2208                         ret = domain_add_dev_info(si_domain, pdev,
2209                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2210                                                      CONTEXT_TT_MULTI_LEVEL);
2211                         if (ret)
2212                                 return ret;
2213                 }
2214         }
2215
2216         return 0;
2217 }
2218
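/*
 * Boot-time initialization of all DMAR units: allocate per-iommu state and
 * root/context tables, choose register-based or queued invalidation, set up
 * identity/RMRR/ISA mappings, and finally enable translation.
 */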
2219 int __init init_dmars(void)
2220 {
2221         struct dmar_drhd_unit *drhd;
2222         struct dmar_rmrr_unit *rmrr;
2223         struct pci_dev *pdev;
2224         struct intel_iommu *iommu;
2225         int i, ret;
2226
2227         /*
2228          * for each drhd
2229          *    allocate root
2230          *    initialize and program root entry to not present
2231          * endfor
2232          */
2233         for_each_drhd_unit(drhd) {
2234                 g_num_of_iommus++;
2235                 /*
2236                  * lock not needed as this is only incremented in the single
2237                  * threaded kernel __init code path; all other accesses are
2238                  * read only
2239                  */
2240         }
2241
2242         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2243                         GFP_KERNEL);
2244         if (!g_iommus) {
2245                 printk(KERN_ERR "Allocating global iommu array failed\n");
2246                 ret = -ENOMEM;
2247                 goto error;
2248         }
2249
2250         deferred_flush = kzalloc(g_num_of_iommus *
2251                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2252         if (!deferred_flush) {
2253                 ret = -ENOMEM;
2254                 goto error;
2255         }
2256
2257         for_each_drhd_unit(drhd) {
2258                 if (drhd->ignored)
2259                         continue;
2260
2261                 iommu = drhd->iommu;
2262                 g_iommus[iommu->seq_id] = iommu;
2263
2264                 ret = iommu_init_domains(iommu);
2265                 if (ret)
2266                         goto error;
2267
2268                 /*
2269                  * TBD:
2270                  * we could share the same root & context tables
2271                  * among all IOMMUs. Needs to be split out later.
2272                  */
2273                 ret = iommu_alloc_root_entry(iommu);
2274                 if (ret) {
2275                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2276                         goto error;
2277                 }
2278                 if (!ecap_pass_through(iommu->ecap))
2279                         hw_pass_through = 0;
2280         }
2281
2282         /*
2283          * Start from a sane iommu hardware state.
2284          */
2285         for_each_drhd_unit(drhd) {
2286                 if (drhd->ignored)
2287                         continue;
2288
2289                 iommu = drhd->iommu;
2290
2291                 /*
2292                  * If queued invalidation was already initialized by us
2293                  * (for example, while enabling interrupt-remapping) then
2294                  * things are already rolling from a sane state.
2295                  */
2296                 if (iommu->qi)
2297                         continue;
2298
2299                 /*
2300                  * Clear any previous faults.
2301                  */
2302                 dmar_fault(-1, iommu);
2303                 /*
2304                  * Disable queued invalidation if supported and already enabled
2305                  * before OS handover.
2306                  */
2307                 dmar_disable_qi(iommu);
2308         }
2309
2310         for_each_drhd_unit(drhd) {
2311                 if (drhd->ignored)
2312                         continue;
2313
2314                 iommu = drhd->iommu;
2315
2316                 if (dmar_enable_qi(iommu)) {
2317                         /*
2318                          * Queued Invalidate not enabled, use Register Based
2319                          * Invalidate
2320                          */
2321                         iommu->flush.flush_context = __iommu_flush_context;
2322                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2323                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2324                                "invalidation\n",
2325                                 iommu->seq_id,
2326                                (unsigned long long)drhd->reg_base_addr);
2327                 } else {
2328                         iommu->flush.flush_context = qi_flush_context;
2329                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2330                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2331                                "invalidation\n",
2332                                 iommu->seq_id,
2333                                (unsigned long long)drhd->reg_base_addr);
2334                 }
2335         }
2336
2337         if (iommu_pass_through)
2338                 iommu_identity_mapping |= IDENTMAP_ALL;
2339
2340 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2341         iommu_identity_mapping |= IDENTMAP_GFX;
2342 #endif
2343
2344         check_tylersburg_isoch();
2345
2346         /*
2347          * If identity mapping is requested (pass-through or one of the
2348          * IDENTMAP_* workarounds), set up the static identity domain and
2349          * add eligible devices to it; rmrr, gfx and isa mappings follow below.
2350          */
2351         if (iommu_identity_mapping) {
2352                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2353                 if (ret) {
2354                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2355                         goto error;
2356                 }
2357         }
2358         /*
2359          * For each rmrr
2360          *   for each dev attached to rmrr
2361          *   do
2362          *     locate drhd for dev, alloc domain for dev
2363          *     allocate free domain
2364          *     allocate page table entries for rmrr
2365          *     if context not allocated for bus
2366          *           allocate and init context
2367          *           set present in root table for this bus
2368          *     init context with domain, translation etc
2369          *    endfor
2370          * endfor
2371          */
2372         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2373         for_each_rmrr_units(rmrr) {
2374                 for (i = 0; i < rmrr->devices_cnt; i++) {
2375                         pdev = rmrr->devices[i];
2376                         /*
2377                          * some BIOSes list non-existent devices in the
2378                          * DMAR table.
2379                          */
2380                         if (!pdev)
2381                                 continue;
2382                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2383                         if (ret)
2384                                 printk(KERN_ERR
2385                                        "IOMMU: mapping reserved region failed\n");
2386                 }
2387         }
2388
2389         iommu_prepare_isa();
2390
2391         /*
2392          * for each drhd
2393          *   enable fault log
2394          *   global invalidate context cache
2395          *   global invalidate iotlb
2396          *   enable translation
2397          */
2398         for_each_drhd_unit(drhd) {
2399                 if (drhd->ignored)
2400                         continue;
2401                 iommu = drhd->iommu;
2402
2403                 iommu_flush_write_buffer(iommu);
2404
2405                 ret = dmar_set_interrupt(iommu);
2406                 if (ret)
2407                         goto error;
2408
2409                 iommu_set_root_entry(iommu);
2410
2411                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2412                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2413
2414                 ret = iommu_enable_translation(iommu);
2415                 if (ret)
2416                         goto error;
2417
2418                 iommu_disable_protect_mem_regions(iommu);
2419         }
2420
2421         return 0;
2422 error:
2423         for_each_drhd_unit(drhd) {
2424                 if (drhd->ignored)
2425                         continue;
2426                 iommu = drhd->iommu;
2427                 free_iommu(iommu);
2428         }
2429         kfree(g_iommus);
2430         return ret;
2431 }
2432
2433 /* This takes a number of _MM_ pages, not VTD pages */
2434 static struct iova *intel_alloc_iova(struct device *dev,
2435                                      struct dmar_domain *domain,
2436                                      unsigned long nrpages, uint64_t dma_mask)
2437 {
2438         struct pci_dev *pdev = to_pci_dev(dev);
2439         struct iova *iova = NULL;
2440
2441         /* Restrict dma_mask to the width that the iommu can handle */
2442         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2443
2444         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2445                 /*
2446                  * First try to allocate an io virtual address in
2447                  * DMA_BIT_MASK(32) and if that fails then try allocating
2448                  * from higher range
2449                  */
2450                 iova = alloc_iova(&domain->iovad, nrpages,
2451                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2452                 if (iova)
2453                         return iova;
2454         }
2455         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2456         if (unlikely(!iova)) {
2457                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2458                        nrpages, pci_name(pdev));
2459                 return NULL;
2460         }
2461
2462         return iova;
2463 }
2464
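/*
 * Get the device's domain, allocating it and setting up its context
 * mapping if that hasn't been done yet.
 */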
2465 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2466 {
2467         struct dmar_domain *domain;
2468         int ret;
2469
2470         domain = get_domain_for_dev(pdev,
2471                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2472         if (!domain) {
2473                 printk(KERN_ERR
2474                         "Allocating domain for %s failed", pci_name(pdev));
2475                 return NULL;
2476         }
2477
2478         /* make sure context mapping is ok */
2479         if (unlikely(!domain_context_mapped(pdev))) {
2480                 ret = domain_context_mapping(domain, pdev,
2481                                              CONTEXT_TT_MULTI_LEVEL);
2482                 if (ret) {
2483                         printk(KERN_ERR
2484                                 "Domain context map for %s failed\n",
2485                                 pci_name(pdev));
2486                         return NULL;
2487                 }
2488         }
2489
2490         return domain;
2491 }
2492
2493 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2494 {
2495         struct device_domain_info *info;
2496
2497         /* No lock here, assumes no domain exit in normal case */
2498         info = dev->dev.archdata.iommu;
2499         if (likely(info))
2500                 return info->domain;
2501
2502         return __get_valid_domain_for_dev(dev);
2503 }
2504
2505 static int iommu_dummy(struct pci_dev *pdev)
2506 {
2507         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2508 }
2509
2510 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2511 static int iommu_no_mapping(struct device *dev)
2512 {
2513         struct pci_dev *pdev;
2514         int found;
2515
2516         if (unlikely(dev->bus != &pci_bus_type))
2517                 return 1;
2518
2519         pdev = to_pci_dev(dev);
2520         if (iommu_dummy(pdev))
2521                 return 1;
2522
2523         if (!iommu_identity_mapping)
2524                 return 0;
2525
2526         found = identity_mapping(pdev);
2527         if (found) {
2528                 if (iommu_should_identity_map(pdev, 0))
2529                         return 1;
2530                 else {
2531                         /*
2532                          * The 32 bit device is removed from si_domain and
2533                          * falls back to non-identity mapping.
2534                          */
2535                         domain_remove_one_dev_info(si_domain, pdev);
2536                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2537                                pci_name(pdev));
2538                         return 0;
2539                 }
2540         } else {
2541                 /*
2542                  * When a 64 bit DMA device is detached from a vm, the
2543                  * device is put back into si_domain for identity mapping.
2544                  */
2545                 if (iommu_should_identity_map(pdev, 0)) {
2546                         int ret;
2547                         ret = domain_add_dev_info(si_domain, pdev,
2548                                                   hw_pass_through ?
2549                                                   CONTEXT_TT_PASS_THROUGH :
2550                                                   CONTEXT_TT_MULTI_LEVEL);
2551                         if (!ret) {
2552                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2553                                        pci_name(pdev));
2554                                 return 1;
2555                         }
2556                 }
2557         }
2558
2559         return 0;
2560 }
2561
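/*
 * Map a physically contiguous buffer for DMA: allocate an IOVA range below
 * the device's dma_mask, set up the page-table entries and flush the IOTLB
 * (in caching mode) or the write buffer. Returns the bus address, or 0 on
 * failure. Devices that bypass the IOMMU get the physical address back
 * unchanged.
 */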
2562 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2563                                      size_t size, int dir, u64 dma_mask)
2564 {
2565         struct pci_dev *pdev = to_pci_dev(hwdev);
2566         struct dmar_domain *domain;
2567         phys_addr_t start_paddr;
2568         struct iova *iova;
2569         int prot = 0;
2570         int ret;
2571         struct intel_iommu *iommu;
2572         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2573
2574         BUG_ON(dir == DMA_NONE);
2575
2576         if (iommu_no_mapping(hwdev))
2577                 return paddr;
2578
2579         domain = get_valid_domain_for_dev(pdev);
2580         if (!domain)
2581                 return 0;
2582
2583         iommu = domain_get_iommu(domain);
2584         size = aligned_nrpages(paddr, size);
2585
2586         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2587                                 pdev->dma_mask);
2588         if (!iova)
2589                 goto error;
2590
2591         /*
2592          * Check if DMAR supports zero-length reads on write only
2593          * mappings..
2594          */
2595         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2596                         !cap_zlr(iommu->cap))
2597                 prot |= DMA_PTE_READ;
2598         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2599                 prot |= DMA_PTE_WRITE;
2600         /*
2601          * paddr to (paddr + size) might span a partial page, so map the
2602          * whole page.  Note: if two parts of one page are mapped separately,
2603          * we might have two guest_addrs mapping to the same host paddr,
2604          * but this is not a big problem
2605          */
2606         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2607                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2608         if (ret)
2609                 goto error;
2610
2611         /* it's a non-present to present mapping. Only flush if caching mode */
2612         if (cap_caching_mode(iommu->cap))
2613                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2614         else
2615                 iommu_flush_write_buffer(iommu);
2616
2617         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2618         start_paddr += paddr & ~PAGE_MASK;
2619         return start_paddr;
2620
2621 error:
2622         if (iova)
2623                 __free_iova(&domain->iovad, iova);
2624         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2625                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2626         return 0;
2627 }
2628
2629 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2630                                  unsigned long offset, size_t size,
2631                                  enum dma_data_direction dir,
2632                                  struct dma_attrs *attrs)
2633 {
2634         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2635                                   dir, to_pci_dev(dev)->dma_mask);
2636 }
2637
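/*
 * Flush all deferred IOTLB invalidations and free the queued IOVAs.
 * Callers hold async_umap_flush_lock.
 */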
2638 static void flush_unmaps(void)
2639 {
2640         int i, j;
2641
2642         timer_on = 0;
2643
2644         /* just flush them all */
2645         for (i = 0; i < g_num_of_iommus; i++) {
2646                 struct intel_iommu *iommu = g_iommus[i];
2647                 if (!iommu)
2648                         continue;
2649
2650                 if (!deferred_flush[i].next)
2651                         continue;
2652
2653                 /* In caching mode, global flushes make emulation expensive */
2654                 if (!cap_caching_mode(iommu->cap))
2655                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2656                                          DMA_TLB_GLOBAL_FLUSH);
2657                 for (j = 0; j < deferred_flush[i].next; j++) {
2658                         unsigned long mask;
2659                         struct iova *iova = deferred_flush[i].iova[j];
2660                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2661
2662                         /* On real hardware multiple invalidations are expensive */
2663                         if (cap_caching_mode(iommu->cap))
2664                                 iommu_flush_iotlb_psi(iommu, domain->id,
2665                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2666                         else {
2667                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2668                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2669                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2670                         }
2671                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2672                 }
2673                 deferred_flush[i].next = 0;
2674         }
2675
2676         list_size = 0;
2677 }
2678
2679 static void flush_unmaps_timeout(unsigned long data)
2680 {
2681         unsigned long flags;
2682
2683         spin_lock_irqsave(&async_umap_flush_lock, flags);
2684         flush_unmaps();
2685         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2686 }
2687
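/*
 * Queue an IOVA for deferred (lazy) invalidation; the timer or the
 * high-water mark triggers the actual flush in flush_unmaps().
 */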
2688 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2689 {
2690         unsigned long flags;
2691         int next, iommu_id;
2692         struct intel_iommu *iommu;
2693
2694         spin_lock_irqsave(&async_umap_flush_lock, flags);
2695         if (list_size == HIGH_WATER_MARK)
2696                 flush_unmaps();
2697
2698         iommu = domain_get_iommu(dom);
2699         iommu_id = iommu->seq_id;
2700
2701         next = deferred_flush[iommu_id].next;
2702         deferred_flush[iommu_id].domain[next] = dom;
2703         deferred_flush[iommu_id].iova[next] = iova;
2704         deferred_flush[iommu_id].next++;
2705
2706         if (!timer_on) {
2707                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2708                 timer_on = 1;
2709         }
2710         list_size++;
2711         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2712 }
2713
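/*
 * Tear down a DMA mapping: clear the PTEs and free the page tables, then
 * either flush the IOTLB immediately (strict mode) or queue the IOVA for
 * deferred invalidation.
 */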
2714 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2715                              size_t size, enum dma_data_direction dir,
2716                              struct dma_attrs *attrs)
2717 {
2718         struct pci_dev *pdev = to_pci_dev(dev);
2719         struct dmar_domain *domain;
2720         unsigned long start_pfn, last_pfn;
2721         struct iova *iova;
2722         struct intel_iommu *iommu;
2723
2724         if (iommu_no_mapping(dev))
2725                 return;
2726
2727         domain = find_domain(pdev);
2728         BUG_ON(!domain);
2729
2730         iommu = domain_get_iommu(domain);
2731
2732         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2733         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2734                       (unsigned long long)dev_addr))
2735                 return;
2736
2737         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2738         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2739
2740         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2741                  pci_name(pdev), start_pfn, last_pfn);
2742
2743         /*  clear the whole page */
2744         dma_pte_clear_range(domain, start_pfn, last_pfn);
2745
2746         /* free page tables */
2747         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2748
2749         if (intel_iommu_strict) {
2750                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2751                                       last_pfn - start_pfn + 1, 0);
2752                 /* free iova */
2753                 __free_iova(&domain->iovad, iova);
2754         } else {
2755                 add_unmap(domain, iova);
2756                 /*
2757                  * queue up the release of the unmap to save the 1/6th of the
2758                  * cpu used up by the iotlb flush operation...
2759                  */
2760         }
2761 }
2762
2763 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2764                                   dma_addr_t *dma_handle, gfp_t flags)
2765 {
2766         void *vaddr;
2767         int order;
2768
2769         size = PAGE_ALIGN(size);
2770         order = get_order(size);
2771
2772         if (!iommu_no_mapping(hwdev))
2773                 flags &= ~(GFP_DMA | GFP_DMA32);
2774         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2775                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2776                         flags |= GFP_DMA;
2777                 else
2778                         flags |= GFP_DMA32;
2779         }
2780
2781         vaddr = (void *)__get_free_pages(flags, order);
2782         if (!vaddr)
2783                 return NULL;
2784         memset(vaddr, 0, size);
2785
2786         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2787                                          DMA_BIDIRECTIONAL,
2788                                          hwdev->coherent_dma_mask);
2789         if (*dma_handle)
2790                 return vaddr;
2791         free_pages((unsigned long)vaddr, order);
2792         return NULL;
2793 }
2794
2795 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2796                                 dma_addr_t dma_handle)
2797 {
2798         int order;
2799
2800         size = PAGE_ALIGN(size);
2801         order = get_order(size);
2802
2803         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2804         free_pages((unsigned long)vaddr, order);
2805 }
2806
2807 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2808                            int nelems, enum dma_data_direction dir,
2809                            struct dma_attrs *attrs)
2810 {
2811         struct pci_dev *pdev = to_pci_dev(hwdev);
2812         struct dmar_domain *domain;
2813         unsigned long start_pfn, last_pfn;
2814         struct iova *iova;
2815         struct intel_iommu *iommu;
2816
2817         if (iommu_no_mapping(hwdev))
2818                 return;
2819
2820         domain = find_domain(pdev);
2821         BUG_ON(!domain);
2822
2823         iommu = domain_get_iommu(domain);
2824
2825         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2826         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2827                       (unsigned long long)sglist[0].dma_address))
2828                 return;
2829
2830         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2831         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2832
2833         /*  clear the whole page */
2834         dma_pte_clear_range(domain, start_pfn, last_pfn);
2835
2836         /* free page tables */
2837         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2838
2839         if (intel_iommu_strict) {
2840                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2841                                       last_pfn - start_pfn + 1, 0);
2842                 /* free iova */
2843                 __free_iova(&domain->iovad, iova);
2844         } else {
2845                 add_unmap(domain, iova);
2846                 /*
2847                  * queue up the release of the unmap to save the 1/6th of the
2848                  * cpu used up by the iotlb flush operation...
2849                  */
2850         }
2851 }
2852
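/*
 * For devices that bypass the IOMMU, just fill in bus addresses straight
 * from the physical pages in the scatterlist.
 */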
2853 static int intel_nontranslate_map_sg(struct device *hddev,
2854         struct scatterlist *sglist, int nelems, int dir)
2855 {
2856         int i;
2857         struct scatterlist *sg;
2858
2859         for_each_sg(sglist, sg, nelems, i) {
2860                 BUG_ON(!sg_page(sg));
2861                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2862                 sg->dma_length = sg->length;
2863         }
2864         return nelems;
2865 }
2866
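/*
 * Map a scatterlist: allocate one IOVA range large enough for the whole
 * list and map each segment into it contiguously.
 */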
2867 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2868                         enum dma_data_direction dir, struct dma_attrs *attrs)
2869 {
2870         int i;
2871         struct pci_dev *pdev = to_pci_dev(hwdev);
2872         struct dmar_domain *domain;
2873         size_t size = 0;
2874         int prot = 0;
2875         struct iova *iova = NULL;
2876         int ret;
2877         struct scatterlist *sg;
2878         unsigned long start_vpfn;
2879         struct intel_iommu *iommu;
2880
2881         BUG_ON(dir == DMA_NONE);
2882         if (iommu_no_mapping(hwdev))
2883                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2884
2885         domain = get_valid_domain_for_dev(pdev);
2886         if (!domain)
2887                 return 0;
2888
2889         iommu = domain_get_iommu(domain);
2890
2891         for_each_sg(sglist, sg, nelems, i)
2892                 size += aligned_nrpages(sg->offset, sg->length);
2893
2894         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2895                                 pdev->dma_mask);
2896         if (!iova) {
2897                 sglist->dma_length = 0;
2898                 return 0;
2899         }
2900
2901         /*
2902          * Check if DMAR supports zero-length reads on write only
2903          * mappings..
2904          */
2905         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2906                         !cap_zlr(iommu->cap))
2907                 prot |= DMA_PTE_READ;
2908         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2909                 prot |= DMA_PTE_WRITE;
2910
2911         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2912
2913         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2914         if (unlikely(ret)) {
2915                 /*  clear the page */
2916                 dma_pte_clear_range(domain, start_vpfn,
2917                                     start_vpfn + size - 1);
2918                 /* free page tables */
2919                 dma_pte_free_pagetable(domain, start_vpfn,
2920                                        start_vpfn + size - 1);
2921                 /* free iova */
2922                 __free_iova(&domain->iovad, iova);
2923                 return 0;
2924         }
2925
2926         /* it's a non-present to present mapping. Only flush if caching mode */
2927         if (cap_caching_mode(iommu->cap))
2928                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
2929         else
2930                 iommu_flush_write_buffer(iommu);
2931
2932         return nelems;
2933 }
2934
2935 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2936 {
2937         return !dma_addr;
2938 }
2939
2940 struct dma_map_ops intel_dma_ops = {
2941         .alloc_coherent = intel_alloc_coherent,
2942         .free_coherent = intel_free_coherent,
2943         .map_sg = intel_map_sg,
2944         .unmap_sg = intel_unmap_sg,
2945         .map_page = intel_map_page,
2946         .unmap_page = intel_unmap_page,
2947         .mapping_error = intel_mapping_error,
2948 };
2949
2950 static inline int iommu_domain_cache_init(void)
2951 {
2952         int ret = 0;
2953
2954         iommu_domain_cache = kmem_cache_create("iommu_domain",
2955                                          sizeof(struct dmar_domain),
2956                                          0,
2957                                          SLAB_HWCACHE_ALIGN,
2959                                          NULL);
2960         if (!iommu_domain_cache) {
2961                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2962                 ret = -ENOMEM;
2963         }
2964
2965         return ret;
2966 }
2967
2968 static inline int iommu_devinfo_cache_init(void)
2969 {
2970         int ret = 0;
2971
2972         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2973                                          sizeof(struct device_domain_info),
2974                                          0,
2975                                          SLAB_HWCACHE_ALIGN,
2976                                          NULL);
2977         if (!iommu_devinfo_cache) {
2978                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2979                 ret = -ENOMEM;
2980         }
2981
2982         return ret;
2983 }
2984
2985 static inline int iommu_iova_cache_init(void)
2986 {
2987         int ret = 0;
2988
2989         iommu_iova_cache = kmem_cache_create("iommu_iova",
2990                                          sizeof(struct iova),
2991                                          0,
2992                                          SLAB_HWCACHE_ALIGN,
2993                                          NULL);
2994         if (!iommu_iova_cache) {
2995                 printk(KERN_ERR "Couldn't create iova cache\n");
2996                 ret = -ENOMEM;
2997         }
2998
2999         return ret;
3000 }
3001
3002 static int __init iommu_init_mempool(void)
3003 {
3004         int ret;
3005         ret = iommu_iova_cache_init();
3006         if (ret)
3007                 return ret;
3008
3009         ret = iommu_domain_cache_init();
3010         if (ret)
3011                 goto domain_error;
3012
3013         ret = iommu_devinfo_cache_init();
3014         if (!ret)
3015                 return ret;
3016
3017         kmem_cache_destroy(iommu_domain_cache);
3018 domain_error:
3019         kmem_cache_destroy(iommu_iova_cache);
3020
3021         return -ENOMEM;
3022 }
3023
3024 static void __init iommu_exit_mempool(void)
3025 {
3026         kmem_cache_destroy(iommu_devinfo_cache);
3027         kmem_cache_destroy(iommu_domain_cache);
3028         kmem_cache_destroy(iommu_iova_cache);
3029
3030 }
3031
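/*
 * Mark DRHD units that cover no PCI devices as ignored.  When gfx mapping
 * is disabled (dmar_map_gfx cleared), also ignore units that cover nothing
 * but graphics devices and give those devices a dummy identity so the DMA
 * API bypasses the IOMMU for them.
 */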
3032 static void __init init_no_remapping_devices(void)
3033 {
3034         struct dmar_drhd_unit *drhd;
3035
3036         for_each_drhd_unit(drhd) {
3037                 if (!drhd->include_all) {
3038                         int i;
3039                         for (i = 0; i < drhd->devices_cnt; i++)
3040                                 if (drhd->devices[i] != NULL)
3041                                         break;
3042                         /* ignore DMAR unit if no PCI devices exist */
3043                         if (i == drhd->devices_cnt)
3044                                 drhd->ignored = 1;
3045                 }
3046         }
3047
3048         if (dmar_map_gfx)
3049                 return;
3050
3051         for_each_drhd_unit(drhd) {
3052                 int i;
3053                 if (drhd->ignored || drhd->include_all)
3054                         continue;
3055
3056                 for (i = 0; i < drhd->devices_cnt; i++)
3057                         if (drhd->devices[i] &&
3058                                 !IS_GFX_DEVICE(drhd->devices[i]))
3059                                 break;
3060
3061                 if (i < drhd->devices_cnt)
3062                         continue;
3063
3064                 /* bypass IOMMU if it is just for gfx devices */
3065                 drhd->ignored = 1;
3066                 for (i = 0; i < drhd->devices_cnt; i++) {
3067                         if (!drhd->devices[i])
3068                                 continue;
3069                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3070                 }
3071         }
3072 }
3073
3074 #ifdef CONFIG_SUSPEND
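/*
 * Bring the remapping hardware back up after suspend: re-enable queued
 * invalidation where it was in use, reload the root entry, flush the
 * context and IOTLB caches and turn translation back on.
 */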
3075 static int init_iommu_hw(void)
3076 {
3077         struct dmar_drhd_unit *drhd;
3078         struct intel_iommu *iommu = NULL;
3079
3080         for_each_active_iommu(iommu, drhd)
3081                 if (iommu->qi)
3082                         dmar_reenable_qi(iommu);
3083
3084         for_each_active_iommu(iommu, drhd) {
3085                 iommu_flush_write_buffer(iommu);
3086
3087                 iommu_set_root_entry(iommu);
3088
3089                 iommu->flush.flush_context(iommu, 0, 0, 0,
3090                                            DMA_CCMD_GLOBAL_INVL);
3091                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3092                                          DMA_TLB_GLOBAL_FLUSH);
3093                 iommu_enable_translation(iommu);
3094                 iommu_disable_protect_mem_regions(iommu);
3095         }
3096
3097         return 0;
3098 }
3099
3100 static void iommu_flush_all(void)
3101 {
3102         struct dmar_drhd_unit *drhd;
3103         struct intel_iommu *iommu;
3104
3105         for_each_active_iommu(iommu, drhd) {
3106                 iommu->flush.flush_context(iommu, 0, 0, 0,
3107                                            DMA_CCMD_GLOBAL_INVL);
3108                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3109                                          DMA_TLB_GLOBAL_FLUSH);
3110         }
3111 }
3112
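/*
 * Save the fault-event control/data/address registers of every active
 * IOMMU so the fault reporting setup can be restored on resume.
 */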
3113 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
3114 {
3115         struct dmar_drhd_unit *drhd;
3116         struct intel_iommu *iommu = NULL;
3117         unsigned long flag;
3118
3119         for_each_active_iommu(iommu, drhd) {
3120                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3121                                                  GFP_ATOMIC);
3122                 if (!iommu->iommu_state)
3123                         goto nomem;
3124         }
3125
3126         iommu_flush_all();
3127
3128         for_each_active_iommu(iommu, drhd) {
3129                 iommu_disable_translation(iommu);
3130
3131                 spin_lock_irqsave(&iommu->register_lock, flag);
3132
3133                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3134                         readl(iommu->reg + DMAR_FECTL_REG);
3135                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3136                         readl(iommu->reg + DMAR_FEDATA_REG);
3137                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3138                         readl(iommu->reg + DMAR_FEADDR_REG);
3139                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3140                         readl(iommu->reg + DMAR_FEUADDR_REG);
3141
3142                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3143         }
3144         return 0;
3145
3146 nomem:
3147         for_each_active_iommu(iommu, drhd)
3148                 kfree(iommu->iommu_state);
3149
3150         return -ENOMEM;
3151 }
3152
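/*
 * Re-initialise the hardware and then restore the fault-event registers
 * saved at suspend time before releasing the save areas.
 */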
3153 static int iommu_resume(struct sys_device *dev)
3154 {
3155         struct dmar_drhd_unit *drhd;
3156         struct intel_iommu *iommu = NULL;
3157         unsigned long flag;
3158
3159         if (init_iommu_hw()) {
3160                 WARN(1, "IOMMU setup failed, DMAR cannot resume!\n");
3161                 return -EIO;
3162         }
3163
3164         for_each_active_iommu(iommu, drhd) {
3165
3166                 spin_lock_irqsave(&iommu->register_lock, flag);
3167
3168                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3169                         iommu->reg + DMAR_FECTL_REG);
3170                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3171                         iommu->reg + DMAR_FEDATA_REG);
3172                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3173                         iommu->reg + DMAR_FEADDR_REG);
3174                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3175                         iommu->reg + DMAR_FEUADDR_REG);
3176
3177                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3178         }
3179
3180         for_each_active_iommu(iommu, drhd)
3181                 kfree(iommu->iommu_state);
3182
3183         return 0;
3184 }
3185
3186 static struct sysdev_class iommu_sysclass = {
3187         .name           = "iommu",
3188         .resume         = iommu_resume,
3189         .suspend        = iommu_suspend,
3190 };
3191
3192 static struct sys_device device_iommu = {
3193         .cls    = &iommu_sysclass,
3194 };
3195
3196 static int __init init_iommu_sysfs(void)
3197 {
3198         int error;
3199
3200         error = sysdev_class_register(&iommu_sysclass);
3201         if (error)
3202                 return error;
3203
3204         error = sysdev_register(&device_iommu);
3205         if (error)
3206                 sysdev_class_unregister(&iommu_sysclass);
3207
3208         return error;
3209 }
3210
3211 #else
3212 static int __init init_iommu_sysfs(void)
3213 {
3214         return 0;
3215 }
3216 #endif  /* CONFIG_SUSPEND */
3217
3218 /*
3219  * Here we only respond to a device being unbound from its driver.
3220  *
3221  * A newly added device is not attached to its DMAR domain here yet; that
3222  * happens when the device is first mapped to an iova.
3223  */
3224 static int device_notifier(struct notifier_block *nb,
3225                                   unsigned long action, void *data)
3226 {
3227         struct device *dev = data;
3228         struct pci_dev *pdev = to_pci_dev(dev);
3229         struct dmar_domain *domain;
3230
3231         if (iommu_no_mapping(dev))
3232                 return 0;
3233
3234         domain = find_domain(pdev);
3235         if (!domain)
3236                 return 0;
3237
3238         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through)
3239                 domain_remove_one_dev_info(domain, pdev);
3240
3241         return 0;
3242 }
3243
3244 static struct notifier_block device_nb = {
3245         .notifier_call = device_notifier,
3246 };
3247
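/*
 * Main entry point: parse the DMAR table, initialise the remapping
 * hardware and, on success, install intel_dma_ops as the DMA API backend,
 * register the IOMMU API ops and hook up the bus notifier above.
 */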
3248 int __init intel_iommu_init(void)
3249 {
3250         int ret = 0;
3251         int force_on = 0;
3252
3253         /* VT-d is required for a TXT/tboot launch, so enforce that */
3254         force_on = tboot_force_iommu();
3255
3256         if (dmar_table_init()) {
3257                 if (force_on)
3258                         panic("tboot: Failed to initialize DMAR table\n");
3259                 return  -ENODEV;
3260         }
3261
3262         if (dmar_dev_scope_init()) {
3263                 if (force_on)
3264                         panic("tboot: Failed to initialize DMAR device scope\n");
3265                 return  -ENODEV;
3266         }
3267
3268         /*
3269          * Check the need for DMA-remapping initialization now.
3270          * Above initialization will also be used by Interrupt-remapping.
3271          */
3272         if (no_iommu || dmar_disabled)
3273                 return -ENODEV;
3274
3275         iommu_init_mempool();
3276         dmar_init_reserved_ranges();
3277
3278         init_no_remapping_devices();
3279
3280         ret = init_dmars();
3281         if (ret) {
3282                 if (force_on)
3283                         panic("tboot: Failed to initialize DMARs\n");
3284                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3285                 put_iova_domain(&reserved_iova_list);
3286                 iommu_exit_mempool();
3287                 return ret;
3288         }
3289         printk(KERN_INFO
3290         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3291
3292         init_timer(&unmap_timer);
3293 #ifdef CONFIG_SWIOTLB
3294         swiotlb = 0;
3295 #endif
3296         dma_ops = &intel_dma_ops;
3297
3298         init_iommu_sysfs();
3299
3300         register_iommu(&intel_iommu_ops);
3301
3302         bus_register_notifier(&pci_bus_type, &device_nb);
3303
3304         return 0;
3305 }
3306
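/*
 * When a device sits behind PCIe-to-PCI bridges, context entries were also
 * installed for those bridges; this tears the bridge entries down again.
 */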
3307 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3308                                            struct pci_dev *pdev)
3309 {
3310         struct pci_dev *tmp, *parent;
3311
3312         if (!iommu || !pdev)
3313                 return;
3314
3315         /* dependent device detach */
3316         tmp = pci_find_upstream_pcie_bridge(pdev);
3317         /* Secondary interface's bus number and devfn 0 */
3318         if (tmp) {
3319                 parent = pdev->bus->self;
3320                 while (parent != tmp) {
3321                         iommu_detach_dev(iommu, parent->bus->number,
3322                                          parent->devfn);
3323                         parent = parent->bus->self;
3324                 }
3325                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3326                         iommu_detach_dev(iommu,
3327                                 tmp->subordinate->number, 0);
3328                 else /* this is a legacy PCI bridge */
3329                         iommu_detach_dev(iommu, tmp->bus->number,
3330                                          tmp->devfn);
3331         }
3332 }
3333
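/*
 * Detach a single device from its domain.  If no other device in the
 * domain sits behind the same IOMMU, that IOMMU is also dropped from the
 * domain's bitmap and the domain capabilities are recomputed.
 */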
3334 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3335                                           struct pci_dev *pdev)
3336 {
3337         struct device_domain_info *info;
3338         struct intel_iommu *iommu;
3339         unsigned long flags;
3340         int found = 0;
3341         struct list_head *entry, *tmp;
3342
3343         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3344                                 pdev->devfn);
3345         if (!iommu)
3346                 return;
3347
3348         spin_lock_irqsave(&device_domain_lock, flags);
3349         list_for_each_safe(entry, tmp, &domain->devices) {
3350                 info = list_entry(entry, struct device_domain_info, link);
3351                 /* No need to compare PCI domain; it has to be the same */
3352                 if (info->bus == pdev->bus->number &&
3353                     info->devfn == pdev->devfn) {
3354                         list_del(&info->link);
3355                         list_del(&info->global);
3356                         if (info->dev)
3357                                 info->dev->dev.archdata.iommu = NULL;
3358                         spin_unlock_irqrestore(&device_domain_lock, flags);
3359
3360                         iommu_disable_dev_iotlb(info);
3361                         iommu_detach_dev(iommu, info->bus, info->devfn);
3362                         iommu_detach_dependent_devices(iommu, pdev);
3363                         free_devinfo_mem(info);
3364
3365                         spin_lock_irqsave(&device_domain_lock, flags);
3366
3367                         if (found)
3368                                 break;
3369                         else
3370                                 continue;
3371                 }
3372
3373                 /* If there are no other devices under the same iommu
3374                  * owned by this domain, clear this iommu in iommu_bmp,
3375                  * and update the iommu count and coherency.
3376                  */
3377                 if (iommu == device_to_iommu(info->segment, info->bus,
3378                                             info->devfn))
3379                         found = 1;
3380         }
3381
3382         if (found == 0) {
3383                 unsigned long tmp_flags;
3384                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3385                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3386                 domain->iommu_count--;
3387                 domain_update_iommu_cap(domain);
3388                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3389         }
3390
3391         spin_unlock_irqrestore(&device_domain_lock, flags);
3392 }
3393
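/*
 * Detach every device from a virtual machine domain, dropping the
 * per-IOMMU reference counts and capabilities as the list is drained.
 */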
3394 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3395 {
3396         struct device_domain_info *info;
3397         struct intel_iommu *iommu;
3398         unsigned long flags1, flags2;
3399
3400         spin_lock_irqsave(&device_domain_lock, flags1);
3401         while (!list_empty(&domain->devices)) {
3402                 info = list_entry(domain->devices.next,
3403                         struct device_domain_info, link);
3404                 list_del(&info->link);
3405                 list_del(&info->global);
3406                 if (info->dev)
3407                         info->dev->dev.archdata.iommu = NULL;
3408
3409                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3410
3411                 iommu_disable_dev_iotlb(info);
3412                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3413                 iommu_detach_dev(iommu, info->bus, info->devfn);
3414                 iommu_detach_dependent_devices(iommu, info->dev);
3415
3416                 /* clear this iommu in iommu_bmp, update iommu count
3417                  * and capabilities
3418                  */
3419                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3420                 if (test_and_clear_bit(iommu->seq_id,
3421                                        &domain->iommu_bmp)) {
3422                         domain->iommu_count--;
3423                         domain_update_iommu_cap(domain);
3424                 }
3425                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3426
3427                 free_devinfo_mem(info);
3428                 spin_lock_irqsave(&device_domain_lock, flags1);
3429         }
3430         spin_unlock_irqrestore(&device_domain_lock, flags1);
3431 }
3432
3433 /* domain id for virtual machine, it won't be set in context */
3434 static unsigned long vm_domid;
3435
3436 static struct dmar_domain *iommu_alloc_vm_domain(void)
3437 {
3438         struct dmar_domain *domain;
3439
3440         domain = alloc_domain_mem();
3441         if (!domain)
3442                 return NULL;
3443
3444         domain->id = vm_domid++;
3445         domain->nid = -1;
3446         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3447         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3448
3449         return domain;
3450 }
3451
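/*
 * Minimal setup for an externally managed (VM) domain: reserve the special
 * IOVA ranges, derive the AGAW from the requested guest width and allocate
 * the top-level page directory.
 */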
3452 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3453 {
3454         int adjust_width;
3455
3456         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3457         spin_lock_init(&domain->iommu_lock);
3458
3459         domain_reserve_special_ranges(domain);
3460
3461         /* calculate AGAW */
3462         domain->gaw = guest_width;
3463         adjust_width = guestwidth_to_adjustwidth(guest_width);
3464         domain->agaw = width_to_agaw(adjust_width);
3465
3466         INIT_LIST_HEAD(&domain->devices);
3467
3468         domain->iommu_count = 0;
3469         domain->iommu_coherency = 0;
3470         domain->iommu_snooping = 0;
3471         domain->max_addr = 0;
3472         domain->nid = -1;
3473
3474         /* always allocate the top pgd */
3475         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3476         if (!domain->pgd)
3477                 return -ENOMEM;
3478         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3479         return 0;
3480 }
3481
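/*
 * Release the domain-id slot this VM domain occupies on every hardware
 * unit that has it instantiated.
 */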
3482 static void iommu_free_vm_domain(struct dmar_domain *domain)
3483 {
3484         unsigned long flags;
3485         struct dmar_drhd_unit *drhd;
3486         struct intel_iommu *iommu;
3487         unsigned long i;
3488         unsigned long ndomains;
3489
3490         for_each_drhd_unit(drhd) {
3491                 if (drhd->ignored)
3492                         continue;
3493                 iommu = drhd->iommu;
3494
3495                 ndomains = cap_ndoms(iommu->cap);
3496                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3497                         if (iommu->domains[i] == domain) {
3498                                 spin_lock_irqsave(&iommu->lock, flags);
3499                                 clear_bit(i, iommu->domain_ids);
3500                                 iommu->domains[i] = NULL;
3501                                 spin_unlock_irqrestore(&iommu->lock, flags);
3502                                 break;
3503                         }
3504                 }
3505         }
3506 }
3507
3508 static void vm_domain_exit(struct dmar_domain *domain)
3509 {
3510         /* Domain 0 is reserved, so don't process it */
3511         if (!domain)
3512                 return;
3513
3514         vm_domain_remove_all_dev_info(domain);
3515         /* destroy iovas */
3516         put_iova_domain(&domain->iovad);
3517
3518         /* clear ptes */
3519         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3520
3521         /* free page tables */
3522         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3523
3524         iommu_free_vm_domain(domain);
3525         free_domain_mem(domain);
3526 }
3527
3528 static int intel_iommu_domain_init(struct iommu_domain *domain)
3529 {
3530         struct dmar_domain *dmar_domain;
3531
3532         dmar_domain = iommu_alloc_vm_domain();
3533         if (!dmar_domain) {
3534                 printk(KERN_ERR
3535                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3536                 return -ENOMEM;
3537         }
3538         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3539                 printk(KERN_ERR
3540                         "intel_iommu_domain_init() failed\n");
3541                 vm_domain_exit(dmar_domain);
3542                 return -ENOMEM;
3543         }
3544         domain->priv = dmar_domain;
3545
3546         return 0;
3547 }
3548
3549 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3550 {
3551         struct dmar_domain *dmar_domain = domain->priv;
3552
3553         domain->priv = NULL;
3554         vm_domain_exit(dmar_domain);
3555 }
3556
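/*
 * IOMMU API attach callback.  Any stale context mapping for the device is
 * torn down first, and the domain's address width is trimmed to what this
 * particular IOMMU can actually address.
 */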
3557 static int intel_iommu_attach_device(struct iommu_domain *domain,
3558                                      struct device *dev)
3559 {
3560         struct dmar_domain *dmar_domain = domain->priv;
3561         struct pci_dev *pdev = to_pci_dev(dev);
3562         struct intel_iommu *iommu;
3563         int addr_width;
3564
3565         /* normally pdev is not mapped */
3566         if (unlikely(domain_context_mapped(pdev))) {
3567                 struct dmar_domain *old_domain;
3568
3569                 old_domain = find_domain(pdev);
3570                 if (old_domain) {
3571                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3572                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3573                                 domain_remove_one_dev_info(old_domain, pdev);
3574                         else
3575                                 domain_remove_dev_info(old_domain);
3576                 }
3577         }
3578
3579         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3580                                 pdev->devfn);
3581         if (!iommu)
3582                 return -ENODEV;
3583
3584         /* check if this iommu agaw is sufficient for max mapped address */
3585         addr_width = agaw_to_width(iommu->agaw);
3586         if (addr_width > cap_mgaw(iommu->cap))
3587                 addr_width = cap_mgaw(iommu->cap);
3588
3589         if (dmar_domain->max_addr > (1LL << addr_width)) {
3590                 printk(KERN_ERR "%s: iommu width (%d) is not "
3591                        "sufficient for the mapped address (%llx)\n",
3592                        __func__, addr_width, dmar_domain->max_addr);
3593                 return -EFAULT;
3594         }
3595         dmar_domain->gaw = addr_width;
3596
3597         /*
3598          * Knock out extra levels of page tables if necessary
3599          */
3600         while (iommu->agaw < dmar_domain->agaw) {
3601                 struct dma_pte *pte;
3602
3603                 pte = dmar_domain->pgd;
3604                 if (dma_pte_present(pte)) {
3605                         free_pgtable_page(dmar_domain->pgd);
3606                         dmar_domain->pgd = (struct dma_pte *)dma_pte_addr(pte);
3607                 }
3608                 dmar_domain->agaw--;
3609         }
3610
3611         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3612 }
3613
3614 static void intel_iommu_detach_device(struct iommu_domain *domain,
3615                                       struct device *dev)
3616 {
3617         struct dmar_domain *dmar_domain = domain->priv;
3618         struct pci_dev *pdev = to_pci_dev(dev);
3619
3620         domain_remove_one_dev_info(dmar_domain, pdev);
3621 }
3622
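/*
 * The IOMMU API map/unmap callbacks work on power-of-two page counts: an
 * order N request covers (PAGE_SIZE << N) bytes starting at the given iova.
 */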
3623 static int intel_iommu_map(struct iommu_domain *domain,
3624                            unsigned long iova, phys_addr_t hpa,
3625                            int gfp_order, int iommu_prot)
3626 {
3627         struct dmar_domain *dmar_domain = domain->priv;
3628         u64 max_addr;
3629         int prot = 0;
3630         size_t size;
3631         int ret;
3632
3633         if (iommu_prot & IOMMU_READ)
3634                 prot |= DMA_PTE_READ;
3635         if (iommu_prot & IOMMU_WRITE)
3636                 prot |= DMA_PTE_WRITE;
3637         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3638                 prot |= DMA_PTE_SNP;
3639
3640         size     = PAGE_SIZE << gfp_order;
3641         max_addr = iova + size;
3642         if (dmar_domain->max_addr < max_addr) {
3643                 u64 end;
3644
3645                 /* check if minimum agaw is sufficient for mapped address */
3646                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3647                 if (end < max_addr) {
3648                         printk(KERN_ERR "%s: iommu width (%d) is not "
3649                                "sufficient for the mapped address (%llx)\n",
3650                                __func__, dmar_domain->gaw, max_addr);
3651                         return -EFAULT;
3652                 }
3653                 dmar_domain->max_addr = max_addr;
3654         }
3655         /* Round up size to next multiple of PAGE_SIZE, if it and
3656            the low bits of hpa would take us onto the next page */
3657         size = aligned_nrpages(hpa, size);
3658         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3659                                  hpa >> VTD_PAGE_SHIFT, size, prot);
3660         return ret;
3661 }
3662
3663 static int intel_iommu_unmap(struct iommu_domain *domain,
3664                              unsigned long iova, int gfp_order)
3665 {
3666         struct dmar_domain *dmar_domain = domain->priv;
3667         size_t size = PAGE_SIZE << gfp_order;
3668
3669         dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3670                             (iova + size - 1) >> VTD_PAGE_SHIFT);
3671
3672         if (dmar_domain->max_addr == iova + size)
3673                 dmar_domain->max_addr = iova;
3674
3675         return gfp_order;
3676 }
3677
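/*
 * Walk the domain page table and return the host physical address recorded
 * for an iova, or 0 if nothing is mapped there.
 */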
3678 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3679                                             unsigned long iova)
3680 {
3681         struct dmar_domain *dmar_domain = domain->priv;
3682         struct dma_pte *pte;
3683         u64 phys = 0;
3684
3685         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3686         if (pte)
3687                 phys = dma_pte_addr(pte);
3688
3689         return phys;
3690 }
3691
3692 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3693                                       unsigned long cap)
3694 {
3695         struct dmar_domain *dmar_domain = domain->priv;
3696
3697         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3698                 return dmar_domain->iommu_snooping;
3699
3700         return 0;
3701 }
3702
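/*
 * These callbacks are reached through the generic IOMMU API.  A rough
 * caller-side sketch, illustrative only, using the API names as exported
 * by this kernel generation (e.g. KVM device assignment):
 *
 *	struct iommu_domain *dom = iommu_domain_alloc();
 *
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map(dom, iova, hpa, 0, IOMMU_READ | IOMMU_WRITE);
 *	...
 *	iommu_unmap(dom, iova, 0);
 *	iommu_detach_device(dom, &pdev->dev);
 *	iommu_domain_free(dom);
 */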
3703 static struct iommu_ops intel_iommu_ops = {
3704         .domain_init    = intel_iommu_domain_init,
3705         .domain_destroy = intel_iommu_domain_destroy,
3706         .attach_dev     = intel_iommu_attach_device,
3707         .detach_dev     = intel_iommu_detach_device,
3708         .map            = intel_iommu_map,
3709         .unmap          = intel_iommu_unmap,
3710         .iova_to_phys   = intel_iommu_iova_to_phys,
3711         .domain_has_cap = intel_iommu_domain_has_cap,
3712 };
3713
3714 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3715 {
3716         /*
3717          * Mobile 4 Series Chipset neglects to set RWBF capability,
3718          * but needs it:
3719          */
3720         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3721         rwbf_quirk = 1;
3722 }
3723
3724 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3725
3726 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3727    ISOCH DMAR unit for the Azalia sound device, but not give it any
3728    TLB entries, which causes it to deadlock. Check for that.  We do
3729    this in a function called from init_dmars(), instead of in a PCI
3730    quirk, because we don't want to print the obnoxious "BIOS broken"
3731    message if VT-d is actually disabled.
3732 */
3733 static void __init check_tylersburg_isoch(void)
3734 {
3735         struct pci_dev *pdev;
3736         uint32_t vtisochctrl;
3737
3738         /* If there's no Azalia in the system anyway, forget it. */
3739         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3740         if (!pdev)
3741                 return;
3742         pci_dev_put(pdev);
3743
3744         /* System Management Registers. Might be hidden, in which case
3745            we can't do the sanity check. But that's OK, because the
3746            known-broken BIOSes _don't_ actually hide it, so far. */
3747         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3748         if (!pdev)
3749                 return;
3750
3751         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3752                 pci_dev_put(pdev);
3753                 return;
3754         }
3755
3756         pci_dev_put(pdev);
3757
3758         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3759         if (vtisochctrl & 1)
3760                 return;
3761
3762         /* Drop all bits other than the number of TLB entries */
3763         vtisochctrl &= 0x1c;
3764
3765         /* If we have the recommended number of TLB entries (16), fine. */
3766         if (vtisochctrl == 0x10)
3767                 return;
3768
3769         /* Zero TLB entries? You get to ride the short bus to school. */
3770         if (!vtisochctrl) {
3771                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3772                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3773                      dmi_get_system_info(DMI_BIOS_VENDOR),
3774                      dmi_get_system_info(DMI_BIOS_VERSION),
3775                      dmi_get_system_info(DMI_PRODUCT_VERSION));
3776                 iommu_identity_mapping |= IDENTMAP_AZALIA;
3777                 return;
3778         }
3779
3780         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3781                vtisochctrl);
3782 }