2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
43 #define ROOT_SIZE VTD_PAGE_SIZE
44 #define CONTEXT_SIZE VTD_PAGE_SIZE
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
49 #define IOAPIC_RANGE_START (0xfee00000)
50 #define IOAPIC_RANGE_END (0xfeefffff)
51 #define IOVA_START_ADDR (0x1000)
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
57 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK)
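/*
 * Editor's worked example (not part of the original source): with the usual
 * 4 KiB pages (PAGE_SHIFT == 12), IOVA_PFN() just converts a byte address
 * into a page frame number, e.g.
 *	IOVA_PFN(DMA_32BIT_MASK) == 0xffffffff >> 12 == 0x000fffff
 * so DMA_32BIT_PFN is the highest page frame reachable through a 32-bit
 * DMA mask, and DMA_64BIT_PFN the highest one for a full 64-bit mask.
 */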
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
64 static int rwbf_quirk;
69 * 12-63: Context Ptr (12 - (haw-1))
76 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
77 static inline bool root_present(struct root_entry *root)
79 return (root->val & 1);
81 static inline void set_root_present(struct root_entry *root)
85 static inline void set_root_value(struct root_entry *root, unsigned long value)
87 root->val |= value & VTD_PAGE_MASK;
90 static inline struct context_entry *
91 get_context_addr_from_root(struct root_entry *root)
93 return (struct context_entry *)
94 (root_present(root)?phys_to_virt(
95 root->val & VTD_PAGE_MASK) :
102 * 1: fault processing disable
103 * 2-3: translation type
104 * 12-63: address space root
110 struct context_entry {
115 static inline bool context_present(struct context_entry *context)
117 return (context->lo & 1);
119 static inline void context_set_present(struct context_entry *context)
124 static inline void context_set_fault_enable(struct context_entry *context)
126 context->lo &= (((u64)-1) << 2) | 1;
129 #define CONTEXT_TT_MULTI_LEVEL 0
131 static inline void context_set_translation_type(struct context_entry *context,
134 context->lo &= (((u64)-1) << 4) | 3;
135 context->lo |= (value & 3) << 2;
138 static inline void context_set_address_root(struct context_entry *context,
141 context->lo |= value & VTD_PAGE_MASK;
144 static inline void context_set_address_width(struct context_entry *context,
147 context->hi |= value & 7;
150 static inline void context_set_domain_id(struct context_entry *context,
153 context->hi |= (value & ((1 << 16) - 1)) << 8;
156 static inline void context_clear_entry(struct context_entry *context)
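/*
 * Editor's sketch of how the helpers above pack a context entry (worked
 * example only, not from the original source).  Assuming a page-table root
 * at physical address 0x12345000, address width field 2 and domain id 5,
 * the entry ends up roughly as:
 *	lo = 0x12345000				(bits 12-63: address space root)
 *	   | (CONTEXT_TT_MULTI_LEVEL << 2)	(bits 2-3: translation type)
 *	   | 1					(bit 0: present)
 *	hi = (5 << 8) | 2			(bits 8-23: domain id, bits 0-2: AW)
 * with bit 1 (fault processing disable) cleared by
 * context_set_fault_enable().
 */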
169 * 12-63: Host physical address
175 static inline void dma_clear_pte(struct dma_pte *pte)
180 static inline void dma_set_pte_readable(struct dma_pte *pte)
182 pte->val |= DMA_PTE_READ;
185 static inline void dma_set_pte_writable(struct dma_pte *pte)
187 pte->val |= DMA_PTE_WRITE;
190 static inline void dma_set_pte_snp(struct dma_pte *pte)
192 pte->val |= DMA_PTE_SNP;
195 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
197 pte->val = (pte->val & ~3) | (prot & 3);
200 static inline u64 dma_pte_addr(struct dma_pte *pte)
202 return (pte->val & VTD_PAGE_MASK);
205 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
207 pte->val |= (addr & VTD_PAGE_MASK);
210 static inline bool dma_pte_present(struct dma_pte *pte)
212 return (pte->val & 3) != 0;
215 /* devices under the same p2p bridge are owned in one domain */
216 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
218 /* domain represents a virtual machine; more than one device
219 * across iommus may be owned by one domain, e.g. a kvm guest.
221 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
224 int id; /* domain id */
225 unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/
227 struct list_head devices; /* all devices' list */
228 struct iova_domain iovad; /* iova's that belong to this domain */
230 struct dma_pte *pgd; /* virtual address */
231 spinlock_t mapping_lock; /* page table lock */
232 int gaw; /* max guest address width */
234 /* adjusted guest address width, 0 is level 2 30-bit */
237 int flags; /* flags to find out type of domain */
239 int iommu_coherency;/* indicate coherency of iommu access */
240 int iommu_snooping; /* indicate snooping control feature*/
241 int iommu_count; /* reference count of iommu */
242 spinlock_t iommu_lock; /* protect iommu set in domain */
243 u64 max_addr; /* maximum mapped address */
246 /* PCI domain-device relationship */
247 struct device_domain_info {
248 struct list_head link; /* link to domain siblings */
249 struct list_head global; /* link to global list */
250 u8 bus; /* PCI bus number */
251 u8 devfn; /* PCI devfn number */
252 struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
253 struct dmar_domain *domain; /* pointer to domain */
256 static void flush_unmaps_timeout(unsigned long data);
258 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
260 #define HIGH_WATER_MARK 250
261 struct deferred_flush_tables {
263 struct iova *iova[HIGH_WATER_MARK];
264 struct dmar_domain *domain[HIGH_WATER_MARK];
267 static struct deferred_flush_tables *deferred_flush;
269 /* bitmap for indexing intel_iommus */
270 static int g_num_of_iommus;
272 static DEFINE_SPINLOCK(async_umap_flush_lock);
273 static LIST_HEAD(unmaps_to_do);
276 static long list_size;
278 static void domain_remove_dev_info(struct dmar_domain *domain);
280 #ifdef CONFIG_DMAR_DEFAULT_ON
281 int dmar_disabled = 0;
283 int dmar_disabled = 1;
284 #endif /*CONFIG_DMAR_DEFAULT_ON*/
286 static int __initdata dmar_map_gfx = 1;
287 static int dmar_forcedac;
288 static int intel_iommu_strict;
290 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
291 static DEFINE_SPINLOCK(device_domain_lock);
292 static LIST_HEAD(device_domain_list);
294 static struct iommu_ops intel_iommu_ops;
296 static int __init intel_iommu_setup(char *str)
301 if (!strncmp(str, "on", 2)) {
303 printk(KERN_INFO "Intel-IOMMU: enabled\n");
304 } else if (!strncmp(str, "off", 3)) {
306 printk(KERN_INFO "Intel-IOMMU: disabled\n");
307 } else if (!strncmp(str, "igfx_off", 8)) {
310 "Intel-IOMMU: disable GFX device mapping\n");
311 } else if (!strncmp(str, "forcedac", 8)) {
313 "Intel-IOMMU: Forcing DAC for PCI devices\n");
315 } else if (!strncmp(str, "strict", 6)) {
317 "Intel-IOMMU: disable batched IOTLB flush\n");
318 intel_iommu_strict = 1;
321 str += strcspn(str, ",");
327 __setup("intel_iommu=", intel_iommu_setup);
329 static struct kmem_cache *iommu_domain_cache;
330 static struct kmem_cache *iommu_devinfo_cache;
331 static struct kmem_cache *iommu_iova_cache;
333 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
338 /* trying to avoid low memory issues */
339 flags = current->flags & PF_MEMALLOC;
340 current->flags |= PF_MEMALLOC;
341 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
342 current->flags &= (~PF_MEMALLOC | flags);
347 static inline void *alloc_pgtable_page(void)
352 /* trying to avoid low memory issues */
353 flags = current->flags & PF_MEMALLOC;
354 current->flags |= PF_MEMALLOC;
355 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
356 current->flags &= (~PF_MEMALLOC | flags);
360 static inline void free_pgtable_page(void *vaddr)
362 free_page((unsigned long)vaddr);
365 static inline void *alloc_domain_mem(void)
367 return iommu_kmem_cache_alloc(iommu_domain_cache);
370 static void free_domain_mem(void *vaddr)
372 kmem_cache_free(iommu_domain_cache, vaddr);
375 static inline void * alloc_devinfo_mem(void)
377 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
380 static inline void free_devinfo_mem(void *vaddr)
382 kmem_cache_free(iommu_devinfo_cache, vaddr);
385 struct iova *alloc_iova_mem(void)
387 return iommu_kmem_cache_alloc(iommu_iova_cache);
390 void free_iova_mem(struct iova *iova)
392 kmem_cache_free(iommu_iova_cache, iova);
396 static inline int width_to_agaw(int width);
398 /* calculate agaw for each iommu.
399 * "SAGAW" may be different across iommus, use a default agaw, and
400 * fall back to a smaller supported agaw for iommus that don't support the default agaw.
402 int iommu_calculate_agaw(struct intel_iommu *iommu)
407 sagaw = cap_sagaw(iommu->cap);
408 for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
410 if (test_bit(agaw, &sagaw))
417 /* in the native case, each domain is related to only one iommu */
418 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
422 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
424 iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
425 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
428 return g_iommus[iommu_id];
431 static void domain_update_iommu_coherency(struct dmar_domain *domain)
435 domain->iommu_coherency = 1;
437 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
438 for (; i < g_num_of_iommus; ) {
439 if (!ecap_coherent(g_iommus[i]->ecap)) {
440 domain->iommu_coherency = 0;
443 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
447 static void domain_update_iommu_snooping(struct dmar_domain *domain)
451 domain->iommu_snooping = 1;
453 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
454 for (; i < g_num_of_iommus; ) {
455 if (!ecap_sc_support(g_iommus[i]->ecap)) {
456 domain->iommu_snooping = 0;
459 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
463 /* Some capabilities may be different across iommus */
464 static void domain_update_iommu_cap(struct dmar_domain *domain)
466 domain_update_iommu_coherency(domain);
467 domain_update_iommu_snooping(domain);
470 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
472 struct dmar_drhd_unit *drhd = NULL;
475 for_each_drhd_unit(drhd) {
479 for (i = 0; i < drhd->devices_cnt; i++)
480 if (drhd->devices[i] &&
481 drhd->devices[i]->bus->number == bus &&
482 drhd->devices[i]->devfn == devfn)
485 if (drhd->include_all)
492 static void domain_flush_cache(struct dmar_domain *domain,
493 void *addr, int size)
495 if (!domain->iommu_coherency)
496 clflush_cache_range(addr, size);
499 /* Gets context entry for a given bus and devfn */
500 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
503 struct root_entry *root;
504 struct context_entry *context;
505 unsigned long phy_addr;
508 spin_lock_irqsave(&iommu->lock, flags);
509 root = &iommu->root_entry[bus];
510 context = get_context_addr_from_root(root);
512 context = (struct context_entry *)alloc_pgtable_page();
514 spin_unlock_irqrestore(&iommu->lock, flags);
517 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
518 phy_addr = virt_to_phys((void *)context);
519 set_root_value(root, phy_addr);
520 set_root_present(root);
521 __iommu_flush_cache(iommu, root, sizeof(*root));
523 spin_unlock_irqrestore(&iommu->lock, flags);
524 return &context[devfn];
527 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
529 struct root_entry *root;
530 struct context_entry *context;
534 spin_lock_irqsave(&iommu->lock, flags);
535 root = &iommu->root_entry[bus];
536 context = get_context_addr_from_root(root);
541 ret = context_present(&context[devfn]);
543 spin_unlock_irqrestore(&iommu->lock, flags);
547 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
549 struct root_entry *root;
550 struct context_entry *context;
553 spin_lock_irqsave(&iommu->lock, flags);
554 root = &iommu->root_entry[bus];
555 context = get_context_addr_from_root(root);
557 context_clear_entry(&context[devfn]);
558 __iommu_flush_cache(iommu, &context[devfn], \
561 spin_unlock_irqrestore(&iommu->lock, flags);
564 static void free_context_table(struct intel_iommu *iommu)
566 struct root_entry *root;
569 struct context_entry *context;
571 spin_lock_irqsave(&iommu->lock, flags);
572 if (!iommu->root_entry) {
575 for (i = 0; i < ROOT_ENTRY_NR; i++) {
576 root = &iommu->root_entry[i];
577 context = get_context_addr_from_root(root);
579 free_pgtable_page(context);
581 free_pgtable_page(iommu->root_entry);
582 iommu->root_entry = NULL;
584 spin_unlock_irqrestore(&iommu->lock, flags);
587 /* page table handling */
588 #define LEVEL_STRIDE (9)
589 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
591 static inline int agaw_to_level(int agaw)
596 static inline int agaw_to_width(int agaw)
598 return 30 + agaw * LEVEL_STRIDE;
602 static inline int width_to_agaw(int width)
604 return (width - 30) / LEVEL_STRIDE;
607 static inline unsigned int level_to_offset_bits(int level)
609 return (12 + (level - 1) * LEVEL_STRIDE);
612 static inline int address_level_offset(u64 addr, int level)
614 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
617 static inline u64 level_mask(int level)
619 return ((u64)-1 << level_to_offset_bits(level));
622 static inline u64 level_size(int level)
624 return ((u64)1 << level_to_offset_bits(level));
627 static inline u64 align_to_level(u64 addr, int level)
629 return ((addr + level_size(level) - 1) & level_mask(level));
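/*
 * Editor's worked example for the AGAW/level arithmetic above (not part of
 * the original source), assuming agaw_to_level() returns agaw + 2 as in the
 * reference implementation: for the default 48-bit domain address width,
 *	width_to_agaw(48)	== (48 - 30) / 9 == 2
 *	agaw_to_width(2)	== 30 + 2 * 9	 == 48
 *	agaw_to_level(2)	== 4		 (a 4-level page table)
 *	level_to_offset_bits(1)	== 12,  level_to_offset_bits(4) == 39
 * so each level decodes LEVEL_STRIDE (9) bits of the IOVA above the 4 KiB
 * page offset.
 */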
632 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
634 int addr_width = agaw_to_width(domain->agaw);
635 struct dma_pte *parent, *pte = NULL;
636 int level = agaw_to_level(domain->agaw);
640 BUG_ON(!domain->pgd);
642 addr &= (((u64)1) << addr_width) - 1;
643 parent = domain->pgd;
645 spin_lock_irqsave(&domain->mapping_lock, flags);
649 offset = address_level_offset(addr, level);
650 pte = &parent[offset];
654 if (!dma_pte_present(pte)) {
655 tmp_page = alloc_pgtable_page();
658 spin_unlock_irqrestore(&domain->mapping_lock,
662 domain_flush_cache(domain, tmp_page, PAGE_SIZE);
663 dma_set_pte_addr(pte, virt_to_phys(tmp_page));
665 * higher level tables always set r/w; the last level page
666 * table controls read/write
668 dma_set_pte_readable(pte);
669 dma_set_pte_writable(pte);
670 domain_flush_cache(domain, pte, sizeof(*pte));
672 parent = phys_to_virt(dma_pte_addr(pte));
676 spin_unlock_irqrestore(&domain->mapping_lock, flags);
680 /* return address's pte at specific level */
681 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
684 struct dma_pte *parent, *pte = NULL;
685 int total = agaw_to_level(domain->agaw);
688 parent = domain->pgd;
689 while (level <= total) {
690 offset = address_level_offset(addr, total);
691 pte = &parent[offset];
695 if (!dma_pte_present(pte))
697 parent = phys_to_virt(dma_pte_addr(pte));
703 /* clear one page's page table */
704 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
706 struct dma_pte *pte = NULL;
708 /* get last level pte */
709 pte = dma_addr_level_pte(domain, addr, 1);
713 domain_flush_cache(domain, pte, sizeof(*pte));
717 /* clear last level pte; a tlb flush should follow */
718 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
720 int addr_width = agaw_to_width(domain->agaw);
723 start &= (((u64)1) << addr_width) - 1;
724 end &= (((u64)1) << addr_width) - 1;
725 /* in case it's a partial page */
726 start = PAGE_ALIGN(start);
728 npages = (end - start) / VTD_PAGE_SIZE;
730 /* we don't need lock here, nobody else touches the iova range */
732 dma_pte_clear_one(domain, start);
733 start += VTD_PAGE_SIZE;
737 /* free page table pages. last level pte should already be cleared */
738 static void dma_pte_free_pagetable(struct dmar_domain *domain,
741 int addr_width = agaw_to_width(domain->agaw);
743 int total = agaw_to_level(domain->agaw);
747 start &= (((u64)1) << addr_width) - 1;
748 end &= (((u64)1) << addr_width) - 1;
750 /* we don't need lock here, nobody else touches the iova range */
752 while (level <= total) {
753 tmp = align_to_level(start, level);
754 if (tmp >= end || (tmp + level_size(level) > end))
758 pte = dma_addr_level_pte(domain, tmp, level);
761 phys_to_virt(dma_pte_addr(pte)));
763 domain_flush_cache(domain, pte, sizeof(*pte));
765 tmp += level_size(level);
770 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
771 free_pgtable_page(domain->pgd);
777 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
779 struct root_entry *root;
782 root = (struct root_entry *)alloc_pgtable_page();
786 __iommu_flush_cache(iommu, root, ROOT_SIZE);
788 spin_lock_irqsave(&iommu->lock, flags);
789 iommu->root_entry = root;
790 spin_unlock_irqrestore(&iommu->lock, flags);
795 static void iommu_set_root_entry(struct intel_iommu *iommu)
801 addr = iommu->root_entry;
803 spin_lock_irqsave(&iommu->register_lock, flag);
804 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
806 cmd = iommu->gcmd | DMA_GCMD_SRTP;
807 writel(cmd, iommu->reg + DMAR_GCMD_REG);
809 /* Make sure hardware completes it */
810 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
811 readl, (sts & DMA_GSTS_RTPS), sts);
813 spin_unlock_irqrestore(&iommu->register_lock, flag);
816 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
821 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
823 val = iommu->gcmd | DMA_GCMD_WBF;
825 spin_lock_irqsave(&iommu->register_lock, flag);
826 writel(val, iommu->reg + DMAR_GCMD_REG);
828 /* Make sure hardware completes it */
829 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
830 readl, (!(val & DMA_GSTS_WBFS)), val);
832 spin_unlock_irqrestore(&iommu->register_lock, flag);
835 /* return value determines whether we need a write buffer flush */
836 static int __iommu_flush_context(struct intel_iommu *iommu,
837 u16 did, u16 source_id, u8 function_mask, u64 type,
838 int non_present_entry_flush)
844 * In the non-present entry flush case: if the hardware doesn't cache
845 * non-present entries we do nothing, and if the hardware does cache
846 * non-present entries, we flush the entries of domain 0 (domain id 0 is
847 * used to tag any non-present entries)
849 if (non_present_entry_flush) {
850 if (!cap_caching_mode(iommu->cap))
857 case DMA_CCMD_GLOBAL_INVL:
858 val = DMA_CCMD_GLOBAL_INVL;
860 case DMA_CCMD_DOMAIN_INVL:
861 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
863 case DMA_CCMD_DEVICE_INVL:
864 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
865 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
872 spin_lock_irqsave(&iommu->register_lock, flag);
873 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
875 /* Make sure hardware completes it */
876 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
877 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
879 spin_unlock_irqrestore(&iommu->register_lock, flag);
881 /* flush context entry will implicitly flush write buffer */
885 /* return value determines whether we need a write buffer flush */
886 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
887 u64 addr, unsigned int size_order, u64 type,
888 int non_present_entry_flush)
890 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
891 u64 val = 0, val_iva = 0;
895 * In the non-present entry flush case: if the hardware doesn't cache
896 * non-present entries we do nothing, and if the hardware does cache
897 * non-present entries, we flush the entries of domain 0 (domain id 0 is
898 * used to tag any non-present entries)
900 if (non_present_entry_flush) {
901 if (!cap_caching_mode(iommu->cap))
908 case DMA_TLB_GLOBAL_FLUSH:
909 /* global flush doesn't need to set IVA_REG */
910 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
912 case DMA_TLB_DSI_FLUSH:
913 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
915 case DMA_TLB_PSI_FLUSH:
916 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
917 /* Note: always flush non-leaf currently */
918 val_iva = size_order | addr;
923 /* Note: set drain read/write */
926 * This is probably just to be extra safe; it looks like we could
927 * ignore it without any impact.
929 if (cap_read_drain(iommu->cap))
930 val |= DMA_TLB_READ_DRAIN;
932 if (cap_write_drain(iommu->cap))
933 val |= DMA_TLB_WRITE_DRAIN;
935 spin_lock_irqsave(&iommu->register_lock, flag);
936 /* Note: Only uses first TLB reg currently */
938 dmar_writeq(iommu->reg + tlb_offset, val_iva);
939 dmar_writeq(iommu->reg + tlb_offset + 8, val);
941 /* Make sure hardware completes it */
942 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
943 dmar_readq, (!(val & DMA_TLB_IVT)), val);
945 spin_unlock_irqrestore(&iommu->register_lock, flag);
947 /* check IOTLB invalidation granularity */
948 if (DMA_TLB_IAIG(val) == 0)
949 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
950 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
951 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
952 (unsigned long long)DMA_TLB_IIRG(type),
953 (unsigned long long)DMA_TLB_IAIG(val));
954 /* flush iotlb entry will implicitly flush write buffer */
958 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
959 u64 addr, unsigned int pages, int non_present_entry_flush)
963 BUG_ON(addr & (~VTD_PAGE_MASK));
966 /* Fallback to domain selective flush if no PSI support */
967 if (!cap_pgsel_inv(iommu->cap))
968 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
970 non_present_entry_flush);
973 * PSI requires the number of pages to be 2 ^ x, and the base address to be
974 * naturally aligned to that size
976 mask = ilog2(__roundup_pow_of_two(pages));
977 /* Fallback to domain selective flush if size is too big */
978 if (mask > cap_max_amask_val(iommu->cap))
979 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
980 DMA_TLB_DSI_FLUSH, non_present_entry_flush);
982 return iommu->flush.flush_iotlb(iommu, did, addr, mask,
984 non_present_entry_flush);
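/*
 * Editor's worked example for the PSI mask computed above (not part of the
 * original source): a request to flush 5 pages gives
 *	mask = ilog2(__roundup_pow_of_two(5)) = ilog2(8) = 3
 * i.e. the hardware is asked to invalidate a naturally aligned block of
 * 2^3 = 8 pages covering the requested range.  If mask exceeded
 * cap_max_amask_val(), the code above falls back to a domain-selective
 * flush instead.
 */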
987 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
992 spin_lock_irqsave(&iommu->register_lock, flags);
993 pmen = readl(iommu->reg + DMAR_PMEN_REG);
994 pmen &= ~DMA_PMEN_EPM;
995 writel(pmen, iommu->reg + DMAR_PMEN_REG);
997 /* wait for the protected region status bit to clear */
998 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
999 readl, !(pmen & DMA_PMEN_PRS), pmen);
1001 spin_unlock_irqrestore(&iommu->register_lock, flags);
1004 static int iommu_enable_translation(struct intel_iommu *iommu)
1007 unsigned long flags;
1009 spin_lock_irqsave(&iommu->register_lock, flags);
1010 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
1012 /* Make sure hardware completes it */
1013 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1014 readl, (sts & DMA_GSTS_TES), sts);
1016 iommu->gcmd |= DMA_GCMD_TE;
1017 spin_unlock_irqrestore(&iommu->register_lock, flags);
1021 static int iommu_disable_translation(struct intel_iommu *iommu)
1026 spin_lock_irqsave(&iommu->register_lock, flag);
1027 iommu->gcmd &= ~DMA_GCMD_TE;
1028 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1030 /* Make sure hardware completes it */
1031 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1032 readl, (!(sts & DMA_GSTS_TES)), sts);
1034 spin_unlock_irqrestore(&iommu->register_lock, flag);
1038 /* iommu interrupt handling. Most of this is MSI-like. */
1040 static const char *fault_reason_strings[] =
1043 "Present bit in root entry is clear",
1044 "Present bit in context entry is clear",
1045 "Invalid context entry",
1046 "Access beyond MGAW",
1047 "PTE Write access is not set",
1048 "PTE Read access is not set",
1049 "Next page table ptr is invalid",
1050 "Root table address invalid",
1051 "Context table ptr is invalid",
1052 "non-zero reserved fields in RTP",
1053 "non-zero reserved fields in CTP",
1054 "non-zero reserved fields in PTE",
1056 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
1058 const char *dmar_get_fault_reason(u8 fault_reason)
1060 if (fault_reason > MAX_FAULT_REASON_IDX)
1063 return fault_reason_strings[fault_reason];
1066 void dmar_msi_unmask(unsigned int irq)
1068 struct intel_iommu *iommu = get_irq_data(irq);
1072 spin_lock_irqsave(&iommu->register_lock, flag);
1073 writel(0, iommu->reg + DMAR_FECTL_REG);
1074 /* Read a reg to force flush the post write */
1075 readl(iommu->reg + DMAR_FECTL_REG);
1076 spin_unlock_irqrestore(&iommu->register_lock, flag);
1079 void dmar_msi_mask(unsigned int irq)
1082 struct intel_iommu *iommu = get_irq_data(irq);
1085 spin_lock_irqsave(&iommu->register_lock, flag);
1086 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1087 /* Read a reg to force flush the post write */
1088 readl(iommu->reg + DMAR_FECTL_REG);
1089 spin_unlock_irqrestore(&iommu->register_lock, flag);
1092 void dmar_msi_write(int irq, struct msi_msg *msg)
1094 struct intel_iommu *iommu = get_irq_data(irq);
1097 spin_lock_irqsave(&iommu->register_lock, flag);
1098 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1099 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1100 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1101 spin_unlock_irqrestore(&iommu->register_lock, flag);
1104 void dmar_msi_read(int irq, struct msi_msg *msg)
1106 struct intel_iommu *iommu = get_irq_data(irq);
1109 spin_lock_irqsave(&iommu->register_lock, flag);
1110 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1111 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1112 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1113 spin_unlock_irqrestore(&iommu->register_lock, flag);
1116 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1117 u8 fault_reason, u16 source_id, unsigned long long addr)
1121 reason = dmar_get_fault_reason(fault_reason);
1124 "DMAR:[%s] Request device [%02x:%02x.%d] "
1125 "fault addr %llx \n"
1126 "DMAR:[fault reason %02d] %s\n",
1127 (type ? "DMA Read" : "DMA Write"),
1128 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1129 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1133 #define PRIMARY_FAULT_REG_LEN (16)
1134 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1136 struct intel_iommu *iommu = dev_id;
1137 int reg, fault_index;
1141 spin_lock_irqsave(&iommu->register_lock, flag);
1142 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1144 /* TBD: ignore advanced fault log currently */
1145 if (!(fault_status & DMA_FSTS_PPF))
1146 goto clear_overflow;
1148 fault_index = dma_fsts_fault_record_index(fault_status);
1149 reg = cap_fault_reg_offset(iommu->cap);
1157 /* highest 32 bits */
1158 data = readl(iommu->reg + reg +
1159 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1160 if (!(data & DMA_FRCD_F))
1163 fault_reason = dma_frcd_fault_reason(data);
1164 type = dma_frcd_type(data);
1166 data = readl(iommu->reg + reg +
1167 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1168 source_id = dma_frcd_source_id(data);
1170 guest_addr = dmar_readq(iommu->reg + reg +
1171 fault_index * PRIMARY_FAULT_REG_LEN);
1172 guest_addr = dma_frcd_page_addr(guest_addr);
1173 /* clear the fault */
1174 writel(DMA_FRCD_F, iommu->reg + reg +
1175 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1177 spin_unlock_irqrestore(&iommu->register_lock, flag);
1179 iommu_page_fault_do_one(iommu, type, fault_reason,
1180 source_id, guest_addr);
1183 if (fault_index > cap_num_fault_regs(iommu->cap))
1185 spin_lock_irqsave(&iommu->register_lock, flag);
1188 /* clear primary fault overflow */
1189 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1190 if (fault_status & DMA_FSTS_PFO)
1191 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1193 spin_unlock_irqrestore(&iommu->register_lock, flag);
1197 int dmar_set_interrupt(struct intel_iommu *iommu)
1203 printk(KERN_ERR "IOMMU: no free vectors\n");
1207 set_irq_data(irq, iommu);
1210 ret = arch_setup_dmar_msi(irq);
1212 set_irq_data(irq, NULL);
1218 /* Force fault register is cleared */
1219 iommu_page_fault(irq, iommu);
1221 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1223 printk(KERN_ERR "IOMMU: can't request irq\n");
1227 static int iommu_init_domains(struct intel_iommu *iommu)
1229 unsigned long ndomains;
1230 unsigned long nlongs;
1232 ndomains = cap_ndoms(iommu->cap);
1233 pr_debug("Number of Domains supported <%ld>\n", ndomains);
1234 nlongs = BITS_TO_LONGS(ndomains);
1236 /* TBD: there might be 64K domains,
1237 * consider a different allocation scheme for future chips
1239 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1240 if (!iommu->domain_ids) {
1241 printk(KERN_ERR "Allocating domain id array failed\n");
1244 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1246 if (!iommu->domains) {
1247 printk(KERN_ERR "Allocating domain array failed\n");
1248 kfree(iommu->domain_ids);
1252 spin_lock_init(&iommu->lock);
1255 * if Caching mode is set, then invalid translations are tagged
1256 * with domain id 0. Hence we need to pre-allocate it.
1258 if (cap_caching_mode(iommu->cap))
1259 set_bit(0, iommu->domain_ids);
1264 static void domain_exit(struct dmar_domain *domain);
1265 static void vm_domain_exit(struct dmar_domain *domain);
1267 void free_dmar_iommu(struct intel_iommu *iommu)
1269 struct dmar_domain *domain;
1271 unsigned long flags;
1273 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1274 for (; i < cap_ndoms(iommu->cap); ) {
1275 domain = iommu->domains[i];
1276 clear_bit(i, iommu->domain_ids);
1278 spin_lock_irqsave(&domain->iommu_lock, flags);
1279 if (--domain->iommu_count == 0) {
1280 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1281 vm_domain_exit(domain);
1283 domain_exit(domain);
1285 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1287 i = find_next_bit(iommu->domain_ids,
1288 cap_ndoms(iommu->cap), i+1);
1291 if (iommu->gcmd & DMA_GCMD_TE)
1292 iommu_disable_translation(iommu);
1295 set_irq_data(iommu->irq, NULL);
1296 /* This will mask the irq */
1297 free_irq(iommu->irq, iommu);
1298 destroy_irq(iommu->irq);
1301 kfree(iommu->domains);
1302 kfree(iommu->domain_ids);
1304 g_iommus[iommu->seq_id] = NULL;
1306 /* if all iommus are freed, free g_iommus */
1307 for (i = 0; i < g_num_of_iommus; i++) {
1312 if (i == g_num_of_iommus)
1315 /* free context mapping */
1316 free_context_table(iommu);
1319 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1322 unsigned long ndomains;
1323 struct dmar_domain *domain;
1324 unsigned long flags;
1326 domain = alloc_domain_mem();
1330 ndomains = cap_ndoms(iommu->cap);
1332 spin_lock_irqsave(&iommu->lock, flags);
1333 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1334 if (num >= ndomains) {
1335 spin_unlock_irqrestore(&iommu->lock, flags);
1336 free_domain_mem(domain);
1337 printk(KERN_ERR "IOMMU: no free domain ids\n");
1341 set_bit(num, iommu->domain_ids);
1343 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1344 set_bit(iommu->seq_id, &domain->iommu_bmp);
1346 iommu->domains[num] = domain;
1347 spin_unlock_irqrestore(&iommu->lock, flags);
1352 static void iommu_free_domain(struct dmar_domain *domain)
1354 unsigned long flags;
1355 struct intel_iommu *iommu;
1357 iommu = domain_get_iommu(domain);
1359 spin_lock_irqsave(&iommu->lock, flags);
1360 clear_bit(domain->id, iommu->domain_ids);
1361 spin_unlock_irqrestore(&iommu->lock, flags);
1364 static struct iova_domain reserved_iova_list;
1365 static struct lock_class_key reserved_alloc_key;
1366 static struct lock_class_key reserved_rbtree_key;
1368 static void dmar_init_reserved_ranges(void)
1370 struct pci_dev *pdev = NULL;
1375 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1377 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1378 &reserved_alloc_key);
1379 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1380 &reserved_rbtree_key);
1382 /* IOAPIC ranges shouldn't be accessed by DMA */
1383 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1384 IOVA_PFN(IOAPIC_RANGE_END));
1386 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1388 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1389 for_each_pci_dev(pdev) {
1392 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1393 r = &pdev->resource[i];
1394 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1398 size = r->end - addr;
1399 size = PAGE_ALIGN(size);
1400 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1401 IOVA_PFN(size + addr) - 1);
1403 printk(KERN_ERR "Reserve iova failed\n");
1409 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1411 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1414 static inline int guestwidth_to_adjustwidth(int gaw)
1417 int r = (gaw - 12) % 9;
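/*
 * Editor's worked example (not part of the original source), assuming the
 * function rounds the guest width up to the next value of the form
 * 12 + 9 * n (30, 39, 48, 57) as the computation of r suggests:
 *	gaw = 48: r = (48 - 12) % 9 = 0, so the adjusted width stays 48
 *	gaw = 40: r = (40 - 12) % 9 = 1, so it is rounded up to 40 + 9 - 1 = 48
 *	gaw = 35: r = (35 - 12) % 9 = 5, so it is rounded up to 39
 */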
1428 static int domain_init(struct dmar_domain *domain, int guest_width)
1430 struct intel_iommu *iommu;
1431 int adjust_width, agaw;
1432 unsigned long sagaw;
1434 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1435 spin_lock_init(&domain->mapping_lock);
1436 spin_lock_init(&domain->iommu_lock);
1438 domain_reserve_special_ranges(domain);
1440 /* calculate AGAW */
1441 iommu = domain_get_iommu(domain);
1442 if (guest_width > cap_mgaw(iommu->cap))
1443 guest_width = cap_mgaw(iommu->cap);
1444 domain->gaw = guest_width;
1445 adjust_width = guestwidth_to_adjustwidth(guest_width);
1446 agaw = width_to_agaw(adjust_width);
1447 sagaw = cap_sagaw(iommu->cap);
1448 if (!test_bit(agaw, &sagaw)) {
1449 /* hardware doesn't support it, choose a bigger one */
1450 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1451 agaw = find_next_bit(&sagaw, 5, agaw);
1455 domain->agaw = agaw;
1456 INIT_LIST_HEAD(&domain->devices);
1458 if (ecap_coherent(iommu->ecap))
1459 domain->iommu_coherency = 1;
1461 domain->iommu_coherency = 0;
1463 if (ecap_sc_support(iommu->ecap))
1464 domain->iommu_snooping = 1;
1466 domain->iommu_snooping = 0;
1468 domain->iommu_count = 1;
1470 /* always allocate the top pgd */
1471 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1474 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1478 static void domain_exit(struct dmar_domain *domain)
1482 /* Domain 0 is reserved, so don't process it */
1486 domain_remove_dev_info(domain);
1488 put_iova_domain(&domain->iovad);
1489 end = DOMAIN_MAX_ADDR(domain->gaw);
1490 end = end & (~PAGE_MASK);
1493 dma_pte_clear_range(domain, 0, end);
1495 /* free page tables */
1496 dma_pte_free_pagetable(domain, 0, end);
1498 iommu_free_domain(domain);
1499 free_domain_mem(domain);
1502 static int domain_context_mapping_one(struct dmar_domain *domain,
1505 struct context_entry *context;
1506 unsigned long flags;
1507 struct intel_iommu *iommu;
1508 struct dma_pte *pgd;
1510 unsigned long ndomains;
1514 pr_debug("Set context mapping for %02x:%02x.%d\n",
1515 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1516 BUG_ON(!domain->pgd);
1518 iommu = device_to_iommu(bus, devfn);
1522 context = device_to_context_entry(iommu, bus, devfn);
1525 spin_lock_irqsave(&iommu->lock, flags);
1526 if (context_present(context)) {
1527 spin_unlock_irqrestore(&iommu->lock, flags);
1534 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1537 /* find an available domain id for this device in iommu */
1538 ndomains = cap_ndoms(iommu->cap);
1539 num = find_first_bit(iommu->domain_ids, ndomains);
1540 for (; num < ndomains; ) {
1541 if (iommu->domains[num] == domain) {
1546 num = find_next_bit(iommu->domain_ids,
1547 cap_ndoms(iommu->cap), num+1);
1551 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1552 if (num >= ndomains) {
1553 spin_unlock_irqrestore(&iommu->lock, flags);
1554 printk(KERN_ERR "IOMMU: no free domain ids\n");
1558 set_bit(num, iommu->domain_ids);
1559 iommu->domains[num] = domain;
1563 /* Skip top levels of page tables for
1564 * iommus which have a smaller agaw than the default.
1566 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1567 pgd = phys_to_virt(dma_pte_addr(pgd));
1568 if (!dma_pte_present(pgd)) {
1569 spin_unlock_irqrestore(&iommu->lock, flags);
1575 context_set_domain_id(context, id);
1576 context_set_address_width(context, iommu->agaw);
1577 context_set_address_root(context, virt_to_phys(pgd));
1578 context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1579 context_set_fault_enable(context);
1580 context_set_present(context);
1581 domain_flush_cache(domain, context, sizeof(*context));
1583 /* it's a non-present to present mapping */
1584 if (iommu->flush.flush_context(iommu, domain->id,
1585 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1586 DMA_CCMD_DEVICE_INVL, 1))
1587 iommu_flush_write_buffer(iommu);
1589 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1591 spin_unlock_irqrestore(&iommu->lock, flags);
1593 spin_lock_irqsave(&domain->iommu_lock, flags);
1594 if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1595 domain->iommu_count++;
1596 domain_update_iommu_cap(domain);
1598 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1603 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1606 struct pci_dev *tmp, *parent;
1608 ret = domain_context_mapping_one(domain, pdev->bus->number,
1613 /* dependent device mapping */
1614 tmp = pci_find_upstream_pcie_bridge(pdev);
1617 /* Secondary interface's bus number and devfn 0 */
1618 parent = pdev->bus->self;
1619 while (parent != tmp) {
1620 ret = domain_context_mapping_one(domain, parent->bus->number,
1624 parent = parent->bus->self;
1626 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1627 return domain_context_mapping_one(domain,
1628 tmp->subordinate->number, 0);
1629 else /* this is a legacy PCI bridge */
1630 return domain_context_mapping_one(domain,
1631 tmp->bus->number, tmp->devfn);
1634 static int domain_context_mapped(struct pci_dev *pdev)
1637 struct pci_dev *tmp, *parent;
1638 struct intel_iommu *iommu;
1640 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1644 ret = device_context_mapped(iommu,
1645 pdev->bus->number, pdev->devfn);
1648 /* dependent device mapping */
1649 tmp = pci_find_upstream_pcie_bridge(pdev);
1652 /* Secondary interface's bus number and devfn 0 */
1653 parent = pdev->bus->self;
1654 while (parent != tmp) {
1655 ret = device_context_mapped(iommu, parent->bus->number,
1659 parent = parent->bus->self;
1662 return device_context_mapped(iommu,
1663 tmp->subordinate->number, 0);
1665 return device_context_mapped(iommu,
1666 tmp->bus->number, tmp->devfn);
1670 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1671 u64 hpa, size_t size, int prot)
1673 u64 start_pfn, end_pfn;
1674 struct dma_pte *pte;
1676 int addr_width = agaw_to_width(domain->agaw);
1678 hpa &= (((u64)1) << addr_width) - 1;
1680 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1683 start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1684 end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1686 while (start_pfn < end_pfn) {
1687 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1690 /* We don't need lock here, nobody else
1691 * touches the iova range
1693 BUG_ON(dma_pte_addr(pte));
1694 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1695 dma_set_pte_prot(pte, prot);
1696 if (prot & DMA_PTE_SNP)
1697 dma_set_pte_snp(pte);
1698 domain_flush_cache(domain, pte, sizeof(*pte));
1705 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1710 clear_context_table(iommu, bus, devfn);
1711 iommu->flush.flush_context(iommu, 0, 0, 0,
1712 DMA_CCMD_GLOBAL_INVL, 0);
1713 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1714 DMA_TLB_GLOBAL_FLUSH, 0);
1717 static void domain_remove_dev_info(struct dmar_domain *domain)
1719 struct device_domain_info *info;
1720 unsigned long flags;
1721 struct intel_iommu *iommu;
1723 spin_lock_irqsave(&device_domain_lock, flags);
1724 while (!list_empty(&domain->devices)) {
1725 info = list_entry(domain->devices.next,
1726 struct device_domain_info, link);
1727 list_del(&info->link);
1728 list_del(&info->global);
1730 info->dev->dev.archdata.iommu = NULL;
1731 spin_unlock_irqrestore(&device_domain_lock, flags);
1733 iommu = device_to_iommu(info->bus, info->devfn);
1734 iommu_detach_dev(iommu, info->bus, info->devfn);
1735 free_devinfo_mem(info);
1737 spin_lock_irqsave(&device_domain_lock, flags);
1739 spin_unlock_irqrestore(&device_domain_lock, flags);
1744 * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1746 static struct dmar_domain *
1747 find_domain(struct pci_dev *pdev)
1749 struct device_domain_info *info;
1751 /* No lock here, assumes no domain exit in normal case */
1752 info = pdev->dev.archdata.iommu;
1754 return info->domain;
1758 /* domain is initialized */
1759 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1761 struct dmar_domain *domain, *found = NULL;
1762 struct intel_iommu *iommu;
1763 struct dmar_drhd_unit *drhd;
1764 struct device_domain_info *info, *tmp;
1765 struct pci_dev *dev_tmp;
1766 unsigned long flags;
1767 int bus = 0, devfn = 0;
1769 domain = find_domain(pdev);
1773 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1775 if (dev_tmp->is_pcie) {
1776 bus = dev_tmp->subordinate->number;
1779 bus = dev_tmp->bus->number;
1780 devfn = dev_tmp->devfn;
1782 spin_lock_irqsave(&device_domain_lock, flags);
1783 list_for_each_entry(info, &device_domain_list, global) {
1784 if (info->bus == bus && info->devfn == devfn) {
1785 found = info->domain;
1789 spin_unlock_irqrestore(&device_domain_lock, flags);
1790 /* pcie-pci bridge already has a domain, use it */
1797 /* Allocate new domain for the device */
1798 drhd = dmar_find_matched_drhd_unit(pdev);
1800 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1804 iommu = drhd->iommu;
1806 domain = iommu_alloc_domain(iommu);
1810 if (domain_init(domain, gaw)) {
1811 domain_exit(domain);
1815 /* register pcie-to-pci device */
1817 info = alloc_devinfo_mem();
1819 domain_exit(domain);
1823 info->devfn = devfn;
1825 info->domain = domain;
1826 /* This domain is shared by devices under p2p bridge */
1827 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1829 /* pcie-to-pci bridge already has a domain, use it */
1831 spin_lock_irqsave(&device_domain_lock, flags);
1832 list_for_each_entry(tmp, &device_domain_list, global) {
1833 if (tmp->bus == bus && tmp->devfn == devfn) {
1834 found = tmp->domain;
1839 free_devinfo_mem(info);
1840 domain_exit(domain);
1843 list_add(&info->link, &domain->devices);
1844 list_add(&info->global, &device_domain_list);
1846 spin_unlock_irqrestore(&device_domain_lock, flags);
1850 info = alloc_devinfo_mem();
1853 info->bus = pdev->bus->number;
1854 info->devfn = pdev->devfn;
1856 info->domain = domain;
1857 spin_lock_irqsave(&device_domain_lock, flags);
1858 /* somebody is fast */
1859 found = find_domain(pdev);
1860 if (found != NULL) {
1861 spin_unlock_irqrestore(&device_domain_lock, flags);
1862 if (found != domain) {
1863 domain_exit(domain);
1866 free_devinfo_mem(info);
1869 list_add(&info->link, &domain->devices);
1870 list_add(&info->global, &device_domain_list);
1871 pdev->dev.archdata.iommu = info;
1872 spin_unlock_irqrestore(&device_domain_lock, flags);
1875 /* recheck it here, maybe others set it */
1876 return find_domain(pdev);
1879 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1880 unsigned long long start,
1881 unsigned long long end)
1883 struct dmar_domain *domain;
1885 unsigned long long base;
1889 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1890 pci_name(pdev), start, end);
1891 /* page table init */
1892 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1896 /* The address might not be aligned */
1897 base = start & PAGE_MASK;
1899 size = PAGE_ALIGN(size);
1900 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1901 IOVA_PFN(base + size) - 1)) {
1902 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1907 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1908 size, base, pci_name(pdev));
1910 * RMRR range might have overlap with physical memory range,
1913 dma_pte_clear_range(domain, base, base + size);
1915 ret = domain_page_mapping(domain, base, base, size,
1916 DMA_PTE_READ|DMA_PTE_WRITE);
1920 /* context entry init */
1921 ret = domain_context_mapping(domain, pdev);
1925 domain_exit(domain);
1930 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1931 struct pci_dev *pdev)
1933 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1935 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1936 rmrr->end_address + 1);
1939 #ifdef CONFIG_DMAR_GFX_WA
1940 struct iommu_prepare_data {
1941 struct pci_dev *pdev;
1945 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1946 unsigned long end_pfn, void *datax)
1948 struct iommu_prepare_data *data;
1950 data = (struct iommu_prepare_data *)datax;
1952 data->ret = iommu_prepare_identity_map(data->pdev,
1953 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1958 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1961 struct iommu_prepare_data data;
1966 for_each_online_node(nid) {
1967 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1974 static void __init iommu_prepare_gfx_mapping(void)
1976 struct pci_dev *pdev = NULL;
1979 for_each_pci_dev(pdev) {
1980 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1981 !IS_GFX_DEVICE(pdev))
1983 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1985 ret = iommu_prepare_with_active_regions(pdev);
1987 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1990 #else /* !CONFIG_DMAR_GFX_WA */
1991 static inline void iommu_prepare_gfx_mapping(void)
1997 #ifdef CONFIG_DMAR_FLOPPY_WA
1998 static inline void iommu_prepare_isa(void)
2000 struct pci_dev *pdev;
2003 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2007 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
2008 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2011 printk("IOMMU: Failed to create 0-16M identity map, "
2012 "floppy might not work\n");
2016 static inline void iommu_prepare_isa(void)
2020 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2022 static int __init init_dmars(void)
2024 struct dmar_drhd_unit *drhd;
2025 struct dmar_rmrr_unit *rmrr;
2026 struct pci_dev *pdev;
2027 struct intel_iommu *iommu;
2028 int i, ret, unit = 0;
2033 * initialize and program root entry to not present
2036 for_each_drhd_unit(drhd) {
2039 * lock not needed as this is only incremented in the single
2040 * threaded kernel __init code path; all other accesses are read-only
2045 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2048 printk(KERN_ERR "Allocating global iommu array failed\n");
2053 deferred_flush = kzalloc(g_num_of_iommus *
2054 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2055 if (!deferred_flush) {
2061 for_each_drhd_unit(drhd) {
2065 iommu = drhd->iommu;
2066 g_iommus[iommu->seq_id] = iommu;
2068 ret = iommu_init_domains(iommu);
2074 * we could share the same root & context tables
2075 * among all IOMMUs; need to split it later.
2077 ret = iommu_alloc_root_entry(iommu);
2079 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2084 for_each_drhd_unit(drhd) {
2088 iommu = drhd->iommu;
2089 if (dmar_enable_qi(iommu)) {
2091 * Queued Invalidation not enabled; use Register Based Invalidation
2094 iommu->flush.flush_context = __iommu_flush_context;
2095 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2096 printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2098 (unsigned long long)drhd->reg_base_addr);
2100 iommu->flush.flush_context = qi_flush_context;
2101 iommu->flush.flush_iotlb = qi_flush_iotlb;
2102 printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2104 (unsigned long long)drhd->reg_base_addr);
2110 * for each dev attached to rmrr
2112 * locate drhd for dev, alloc domain for dev
2113 * allocate free domain
2114 * allocate page table entries for rmrr
2115 * if context not allocated for bus
2116 * allocate and init context
2117 * set present in root table for this bus
2118 * init context with domain, translation etc
2122 for_each_rmrr_units(rmrr) {
2123 for (i = 0; i < rmrr->devices_cnt; i++) {
2124 pdev = rmrr->devices[i];
2125 /* some BIOSes list non-existent devices in the DMAR table */
2128 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2131 "IOMMU: mapping reserved region failed\n");
2135 iommu_prepare_gfx_mapping();
2137 iommu_prepare_isa();
2142 * global invalidate context cache
2143 * global invalidate iotlb
2144 * enable translation
2146 for_each_drhd_unit(drhd) {
2149 iommu = drhd->iommu;
2150 sprintf(iommu->name, "dmar%d", unit++);
2152 iommu_flush_write_buffer(iommu);
2154 ret = dmar_set_interrupt(iommu);
2158 iommu_set_root_entry(iommu);
2160 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2162 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2164 iommu_disable_protect_mem_regions(iommu);
2166 ret = iommu_enable_translation(iommu);
2173 for_each_drhd_unit(drhd) {
2176 iommu = drhd->iommu;
2183 static inline u64 aligned_size(u64 host_addr, size_t size)
2186 addr = (host_addr & (~PAGE_MASK)) + size;
2187 return PAGE_ALIGN(addr);
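/*
 * Editor's worked example for aligned_size() (not part of the original
 * source): a 0x100-byte buffer at host address 0x1234 uses
 *	(0x1234 & ~PAGE_MASK) + 0x100 = 0x234 + 0x100 = 0x334
 * bytes of its first page, so PAGE_ALIGN() rounds the mapping to one full
 * 4 KiB page; the same buffer starting at 0xff0 straddles a page boundary
 * and aligned_size() returns 0x2000 (two pages).
 */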
2191 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2195 /* Make sure it's in range */
2196 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2197 if (!size || (IOVA_START_ADDR + size > end))
2200 piova = alloc_iova(&domain->iovad,
2201 size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2205 static struct iova *
2206 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2207 size_t size, u64 dma_mask)
2209 struct pci_dev *pdev = to_pci_dev(dev);
2210 struct iova *iova = NULL;
2212 if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2213 iova = iommu_alloc_iova(domain, size, dma_mask);
2216 * First try to allocate an io virtual address in
2217 * DMA_32BIT_MASK, and if that fails then try allocating from the full dma_mask range
2220 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2222 iova = iommu_alloc_iova(domain, size, dma_mask);
2226 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
2233 static struct dmar_domain *
2234 get_valid_domain_for_dev(struct pci_dev *pdev)
2236 struct dmar_domain *domain;
2239 domain = get_domain_for_dev(pdev,
2240 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2243 "Allocating domain for %s failed", pci_name(pdev));
2247 /* make sure context mapping is ok */
2248 if (unlikely(!domain_context_mapped(pdev))) {
2249 ret = domain_context_mapping(domain, pdev);
2252 "Domain context map for %s failed",
2261 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2262 size_t size, int dir, u64 dma_mask)
2264 struct pci_dev *pdev = to_pci_dev(hwdev);
2265 struct dmar_domain *domain;
2266 phys_addr_t start_paddr;
2270 struct intel_iommu *iommu;
2272 BUG_ON(dir == DMA_NONE);
2273 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2276 domain = get_valid_domain_for_dev(pdev);
2280 iommu = domain_get_iommu(domain);
2281 size = aligned_size((u64)paddr, size);
2283 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2287 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2290 * Check if DMAR supports zero-length reads on write-only mappings
2293 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2294 !cap_zlr(iommu->cap))
2295 prot |= DMA_PTE_READ;
2296 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2297 prot |= DMA_PTE_WRITE;
2299 * The range paddr .. paddr + size may cover partial pages; we should map
2300 * whole pages. Note: if two parts of one page are mapped separately, we
2301 * might end up with two guest_addrs mapping to the same host paddr, but
2302 * this is not a big problem
2304 ret = domain_page_mapping(domain, start_paddr,
2305 ((u64)paddr) & PAGE_MASK, size, prot);
2309 /* it's a non-present to present mapping */
2310 ret = iommu_flush_iotlb_psi(iommu, domain->id,
2311 start_paddr, size >> VTD_PAGE_SHIFT, 1);
2313 iommu_flush_write_buffer(iommu);
2315 return start_paddr + ((u64)paddr & (~PAGE_MASK));
2319 __free_iova(&domain->iovad, iova);
2320 printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2321 pci_name(pdev), size, (unsigned long long)paddr, dir);
2325 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2326 size_t size, int dir)
2328 return __intel_map_single(hwdev, paddr, size, dir,
2329 to_pci_dev(hwdev)->dma_mask);
2332 static void flush_unmaps(void)
2338 /* just flush them all */
2339 for (i = 0; i < g_num_of_iommus; i++) {
2340 struct intel_iommu *iommu = g_iommus[i];
2344 if (deferred_flush[i].next) {
2345 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2346 DMA_TLB_GLOBAL_FLUSH, 0);
2347 for (j = 0; j < deferred_flush[i].next; j++) {
2348 __free_iova(&deferred_flush[i].domain[j]->iovad,
2349 deferred_flush[i].iova[j]);
2351 deferred_flush[i].next = 0;
2358 static void flush_unmaps_timeout(unsigned long data)
2360 unsigned long flags;
2362 spin_lock_irqsave(&async_umap_flush_lock, flags);
2364 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2367 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2369 unsigned long flags;
2371 struct intel_iommu *iommu;
2373 spin_lock_irqsave(&async_umap_flush_lock, flags);
2374 if (list_size == HIGH_WATER_MARK)
2377 iommu = domain_get_iommu(dom);
2378 iommu_id = iommu->seq_id;
2380 next = deferred_flush[iommu_id].next;
2381 deferred_flush[iommu_id].domain[next] = dom;
2382 deferred_flush[iommu_id].iova[next] = iova;
2383 deferred_flush[iommu_id].next++;
2386 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2390 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
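/*
 * Editor's note on the deferred-flush path above (not part of the original
 * source): add_unmap() batches IOVA releases instead of flushing the IOTLB
 * on every unmap.  Entries accumulate per iommu in deferred_flush[]; a
 * flush is forced either by the 10 ms unmap_timer armed above or once
 * HIGH_WATER_MARK (250) entries are pending, at which point flush_unmaps()
 * issues one global IOTLB flush per iommu and frees all queued IOVAs.
 */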
2393 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2396 struct pci_dev *pdev = to_pci_dev(dev);
2397 struct dmar_domain *domain;
2398 unsigned long start_addr;
2400 struct intel_iommu *iommu;
2402 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2404 domain = find_domain(pdev);
2407 iommu = domain_get_iommu(domain);
2409 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2413 start_addr = iova->pfn_lo << PAGE_SHIFT;
2414 size = aligned_size((u64)dev_addr, size);
2416 pr_debug("Device %s unmapping: %zx@%llx\n",
2417 pci_name(pdev), size, (unsigned long long)start_addr);
2419 /* clear the whole page */
2420 dma_pte_clear_range(domain, start_addr, start_addr + size);
2421 /* free page tables */
2422 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2423 if (intel_iommu_strict) {
2424 if (iommu_flush_iotlb_psi(iommu,
2425 domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2426 iommu_flush_write_buffer(iommu);
2428 __free_iova(&domain->iovad, iova);
2430 add_unmap(domain, iova);
2432 * queue up the release of the unmap to save the 1/6th of the
2433 * cpu used up by the iotlb flush operation...
2438 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2439 dma_addr_t *dma_handle, gfp_t flags)
2444 size = PAGE_ALIGN(size);
2445 order = get_order(size);
2446 flags &= ~(GFP_DMA | GFP_DMA32);
2448 vaddr = (void *)__get_free_pages(flags, order);
2451 memset(vaddr, 0, size);
2453 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2455 hwdev->coherent_dma_mask);
2458 free_pages((unsigned long)vaddr, order);
2462 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2463 dma_addr_t dma_handle)
2467 size = PAGE_ALIGN(size);
2468 order = get_order(size);
2470 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2471 free_pages((unsigned long)vaddr, order);
2474 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2475 int nelems, int dir)
2478 struct pci_dev *pdev = to_pci_dev(hwdev);
2479 struct dmar_domain *domain;
2480 unsigned long start_addr;
2484 struct scatterlist *sg;
2485 struct intel_iommu *iommu;
2487 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2490 domain = find_domain(pdev);
2493 iommu = domain_get_iommu(domain);
2495 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2498 for_each_sg(sglist, sg, nelems, i) {
2499 addr = page_to_phys(sg_page(sg)) + sg->offset;
2500 size += aligned_size((u64)addr, sg->length);
2503 start_addr = iova->pfn_lo << PAGE_SHIFT;
2505 /* clear the whole page */
2506 dma_pte_clear_range(domain, start_addr, start_addr + size);
2507 /* free page tables */
2508 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2510 if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2511 size >> VTD_PAGE_SHIFT, 0))
2512 iommu_flush_write_buffer(iommu);
2515 __free_iova(&domain->iovad, iova);
2518 static int intel_nontranslate_map_sg(struct device *hddev,
2519 struct scatterlist *sglist, int nelems, int dir)
2522 struct scatterlist *sg;
2524 for_each_sg(sglist, sg, nelems, i) {
2525 BUG_ON(!sg_page(sg));
2526 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2527 sg->dma_length = sg->length;
int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
		 int dir)
{
	phys_addr_t addr;
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	size_t size = 0;
	int prot = 0;
	size_t offset = 0;
	struct iova *iova = NULL;
	int ret;
	struct scatterlist *sg;
	unsigned long start_addr;
	struct intel_iommu *iommu;

	BUG_ON(dir == DMA_NONE);
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);

	for_each_sg(sglist, sg, nelems, i) {
		addr = page_to_phys(sg_page(sg)) + sg->offset;
		size += aligned_size((u64)addr, sg->length);
	}

	iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
	if (!iova) {
		sglist->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write-only
	 * mappings.
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
	    !cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_addr = iova->pfn_lo << PAGE_SHIFT;
	offset = 0;
	for_each_sg(sglist, sg, nelems, i) {
		addr = page_to_phys(sg_page(sg)) + sg->offset;
		size = aligned_size((u64)addr, sg->length);
		ret = domain_page_mapping(domain, start_addr + offset,
					  ((u64)addr) & PAGE_MASK,
					  size, prot);
		if (ret) {
			/* clear the page */
			dma_pte_clear_range(domain, start_addr,
					    start_addr + offset);
			/* free page tables */
			dma_pte_free_pagetable(domain, start_addr,
					       start_addr + offset);
			/* free iova */
			__free_iova(&domain->iovad, iova);
			return 0;
		}
		sg->dma_address = start_addr + offset +
				  ((u64)addr & (~PAGE_MASK));
		sg->dma_length = sg->length;
		offset += size;
	}

	/* it's a non-present to present mapping */
	if (iommu_flush_iotlb_psi(iommu, domain->id,
				  start_addr, offset >> VTD_PAGE_SHIFT, 1))
		iommu_flush_write_buffer(iommu);
	return nelems;
}

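/*
 * DMA API glue.  intel_iommu_init() points the global dma_ops at this
 * table, so the generic dma_map_single()/dma_map_sg() helpers end up in
 * the intel_* routines above.  Illustrative call path only (the wrapper
 * lives in the arch DMA mapping code, not in this file):
 *
 *	dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE)
 *		-> dma_ops->map_single == intel_map_single()
 */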
static struct dma_mapping_ops intel_dma_ops = {
	.alloc_coherent = intel_alloc_coherent,
	.free_coherent = intel_free_coherent,
	.map_single = intel_map_single,
	.unmap_single = intel_unmap_single,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
};

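/*
 * Slab caches for the three objects this driver allocates most often:
 * dmar_domain, device_domain_info and iova descriptors.
 */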
static inline int iommu_domain_cache_init(void)
{
	int ret = 0;

	iommu_domain_cache = kmem_cache_create("iommu_domain",
					       sizeof(struct dmar_domain), 0,
					       SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_domain_cache) {
		printk(KERN_ERR "Couldn't create iommu_domain cache\n");
		ret = -ENOMEM;
	}
	return ret;
}

static inline int iommu_devinfo_cache_init(void)
{
	int ret = 0;

	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
						sizeof(struct device_domain_info),
						0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_devinfo_cache) {
		printk(KERN_ERR "Couldn't create devinfo cache\n");
		ret = -ENOMEM;
	}
	return ret;
}

static inline int iommu_iova_cache_init(void)
{
	int ret = 0;

	iommu_iova_cache = kmem_cache_create("iommu_iova",
					     sizeof(struct iova), 0,
					     SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_iova_cache) {
		printk(KERN_ERR "Couldn't create iova cache\n");
		ret = -ENOMEM;
	}
	return ret;
}

static int __init iommu_init_mempool(void)
{
	int ret;

	ret = iommu_iova_cache_init();
	if (ret)
		return ret;

	ret = iommu_domain_cache_init();
	if (ret)
		goto domain_error;

	ret = iommu_devinfo_cache_init();
	if (!ret)
		return ret;

	kmem_cache_destroy(iommu_domain_cache);
domain_error:
	kmem_cache_destroy(iommu_iova_cache);

	return -ENOMEM;
}

static void __init iommu_exit_mempool(void)
{
	kmem_cache_destroy(iommu_devinfo_cache);
	kmem_cache_destroy(iommu_domain_cache);
	kmem_cache_destroy(iommu_iova_cache);
}

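/*
 * Two passes over the DRHD units: first ignore any unit that ended up with
 * no PCI devices under it, then, when gfx mapping is disabled
 * (dmar_map_gfx == 0), ignore units that cover nothing but graphics
 * devices and mark those devices to bypass translation.
 */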
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			int i;
			for (i = 0; i < drhd->devices_cnt; i++)
				if (drhd->devices[i] != NULL)
					break;
			/* ignore DMAR unit if no pci devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	if (dmar_map_gfx)
		return;

	for_each_drhd_unit(drhd) {
		int i;
		if (drhd->ignored || drhd->include_all)
			continue;

		for (i = 0; i < drhd->devices_cnt; i++)
			if (drhd->devices[i] &&
			    !IS_GFX_DEVICE(drhd->devices[i]))
				break;

		if (i < drhd->devices_cnt)
			continue;

		/* bypass IOMMU if it is just for gfx devices */
		drhd->ignored = 1;
		for (i = 0; i < drhd->devices_cnt; i++) {
			if (!drhd->devices[i])
				continue;
			drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
		}
	}
}

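/*
 * Main initialisation entry point for DMA remapping.  It is skipped when
 * no_iommu, swiotlb or dmar_disabled forces it off; the DMAR table parsing
 * it performs first is also shared with interrupt remapping.
 */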
int __init intel_iommu_init(void)
{
	int ret = 0;

	if (dmar_table_init())
		return -ENODEV;

	if (dmar_dev_scope_init())
		return -ENODEV;

	/*
	 * Check the need for DMA-remapping initialization now.
	 * Above initialization will also be used by Interrupt-remapping.
	 */
	if (no_iommu || swiotlb || dmar_disabled)
		return -ENODEV;

	iommu_init_mempool();
	dmar_init_reserved_ranges();

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		printk(KERN_ERR "IOMMU: dmar init failed\n");
		put_iova_domain(&reserved_iova_list);
		iommu_exit_mempool();
		return ret;
	}
	printk(KERN_INFO
	       "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");

	init_timer(&unmap_timer);
	force_iommu = 1;
	dma_ops = &intel_dma_ops;

	register_iommu(&intel_iommu_ops);

	return 0;
}

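/*
 * Everything below implements "virtual machine" domains: dmar_domains
 * flagged DOMAIN_FLAG_VIRTUAL_MACHINE that are managed through the generic
 * iommu_ops interface (typically by KVM device assignment) rather than by
 * the DMA API path above.
 */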
static int vm_domain_add_dev_info(struct dmar_domain *domain,
				  struct pci_dev *pdev)
{
	struct device_domain_info *info;
	unsigned long flags;

	info = alloc_devinfo_mem();
	if (!info)
		return -ENOMEM;

	info->bus = pdev->bus->number;
	info->devfn = pdev->devfn;
	info->dev = pdev;
	info->domain = domain;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	pdev->dev.archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}

static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
					   struct pci_dev *pdev)
{
	struct pci_dev *tmp, *parent;

	if (!iommu || !pdev)
		return;

	/* dependent device detach */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	/* Secondary interface's bus number and devfn 0 */
	if (tmp) {
		parent = pdev->bus->self;
		while (parent != tmp) {
			iommu_detach_dev(iommu, parent->bus->number,
					 parent->devfn);
			parent = parent->bus->self;
		}
		if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
			iommu_detach_dev(iommu,
					 tmp->subordinate->number, 0);
		else /* this is a legacy PCI bridge */
			iommu_detach_dev(iommu,
					 tmp->bus->number, tmp->devfn);
	}
}

static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
					  struct pci_dev *pdev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	int found = 0;
	struct list_head *entry, *tmp;

	iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
	if (!iommu)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_safe(entry, tmp, &domain->devices) {
		info = list_entry(entry, struct device_domain_info, link);
		if (info->bus == pdev->bus->number &&
		    info->devfn == pdev->devfn) {
			list_del(&info->link);
			list_del(&info->global);
			if (info->dev)
				info->dev->dev.archdata.iommu = NULL;
			spin_unlock_irqrestore(&device_domain_lock, flags);

			iommu_detach_dev(iommu, info->bus, info->devfn);
			iommu_detach_dependent_devices(iommu, pdev);
			free_devinfo_mem(info);

			spin_lock_irqsave(&device_domain_lock, flags);

			if (found)
				break;
			else
				continue;
		}

		/*
		 * If there are no other devices under the same iommu owned
		 * by this domain, clear this iommu in iommu_bmp and update
		 * the iommu count and coherency.
		 */
		if (device_to_iommu(info->bus, info->devfn) == iommu)
			found = 1;
	}

	if (found == 0) {
		unsigned long tmp_flags;
		spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
		clear_bit(iommu->seq_id, &domain->iommu_bmp);
		domain->iommu_count--;
		domain_update_iommu_cap(domain);
		spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
	}

	spin_unlock_irqrestore(&device_domain_lock, flags);
}

static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags1, flags2;

	spin_lock_irqsave(&device_domain_lock, flags1);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
				  struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		if (info->dev)
			info->dev->dev.archdata.iommu = NULL;

		spin_unlock_irqrestore(&device_domain_lock, flags1);

		iommu = device_to_iommu(info->bus, info->devfn);
		iommu_detach_dev(iommu, info->bus, info->devfn);
		iommu_detach_dependent_devices(iommu, info->dev);

		/* clear this iommu in iommu_bmp, update iommu count
		 * and capabilities
		 */
		spin_lock_irqsave(&domain->iommu_lock, flags2);
		if (test_and_clear_bit(iommu->seq_id,
				       &domain->iommu_bmp)) {
			domain->iommu_count--;
			domain_update_iommu_cap(domain);
		}
		spin_unlock_irqrestore(&domain->iommu_lock, flags2);

		free_devinfo_mem(info);
		spin_lock_irqsave(&device_domain_lock, flags1);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags1);
}

/* domain id for virtual machine domains; it is never set in a context entry */
static unsigned long vm_domid;

static int vm_domain_min_agaw(struct dmar_domain *domain)
{
	int i;
	int min_agaw = domain->agaw;

	i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
	for (; i < g_num_of_iommus; ) {
		if (min_agaw > g_iommus[i]->agaw)
			min_agaw = g_iommus[i]->agaw;

		i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
	}

	return min_agaw;
}

static struct dmar_domain *iommu_alloc_vm_domain(void)
{
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	domain->id = vm_domid++;
	memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
	domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;

	return domain;
}

static int vm_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	spin_lock_init(&domain->mapping_lock);
	spin_lock_init(&domain->iommu_lock);

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	INIT_LIST_HEAD(&domain->devices);

	domain->iommu_count = 0;
	domain->iommu_coherency = 0;
	domain->max_addr = 0;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page();
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}

static void iommu_free_vm_domain(struct dmar_domain *domain)
{
	unsigned long flags;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	unsigned long i;
	unsigned long ndomains;

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;

		ndomains = cap_ndoms(iommu->cap);
		i = find_first_bit(iommu->domain_ids, ndomains);
		for (; i < ndomains; ) {
			if (iommu->domains[i] == domain) {
				spin_lock_irqsave(&iommu->lock, flags);
				clear_bit(i, iommu->domain_ids);
				iommu->domains[i] = NULL;
				spin_unlock_irqrestore(&iommu->lock, flags);
				break;
			}
			i = find_next_bit(iommu->domain_ids, ndomains, i+1);
		}
	}
}

static void vm_domain_exit(struct dmar_domain *domain)
{
	u64 end;

	/* Domain 0 is reserved, so don't process it */
	if (!domain)
		return;

	vm_domain_remove_all_dev_info(domain);
	/* destroy iovas */
	put_iova_domain(&domain->iovad);
	end = DOMAIN_MAX_ADDR(domain->gaw);
	end = end & (~VTD_PAGE_MASK);

	/* clear ptes */
	dma_pte_clear_range(domain, 0, end);

	/* free page tables */
	dma_pte_free_pagetable(domain, 0, end);

	iommu_free_vm_domain(domain);
	free_domain_mem(domain);
}

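/*
 * Generic IOMMU API callbacks.  A caller outside this file would use them
 * roughly as follows (illustrative only; the wrappers live in the generic
 * iommu layer, not here):
 *
 *	struct iommu_domain *dom = iommu_domain_alloc();
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map_range(dom, iova, hpa, size, IOMMU_READ | IOMMU_WRITE);
 *	...
 *	iommu_unmap_range(dom, iova, size);
 *	iommu_detach_device(dom, &pdev->dev);
 *	iommu_domain_free(dom);
 */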
static int intel_iommu_domain_init(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain;

	dmar_domain = iommu_alloc_vm_domain();
	if (!dmar_domain) {
		printk(KERN_ERR
		       "intel_iommu_domain_init: dmar_domain == NULL\n");
		return -ENOMEM;
	}
	if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		printk(KERN_ERR
		       "intel_iommu_domain_init() failed\n");
		vm_domain_exit(dmar_domain);
		return -ENOMEM;
	}
	domain->priv = dmar_domain;

	return 0;
}

static void intel_iommu_domain_destroy(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = domain->priv;

	domain->priv = NULL;
	vm_domain_exit(dmar_domain);
}

static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct intel_iommu *iommu;
	int addr_width;
	u64 end;
	int ret;

	/* normally pdev is not mapped */
	if (unlikely(domain_context_mapped(pdev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(pdev);
		if (old_domain) {
			if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
				vm_domain_remove_one_dev_info(old_domain, pdev);
			else
				domain_remove_dev_info(old_domain);
		}
	}

	iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	end = DOMAIN_MAX_ADDR(addr_width);
	end = end & VTD_PAGE_MASK;
	if (end < dmar_domain->max_addr) {
		printk(KERN_ERR "%s: iommu agaw (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, iommu->agaw, dmar_domain->max_addr);
		return -EFAULT;
	}

	ret = domain_context_mapping(dmar_domain, pdev);
	if (ret)
		return ret;

	ret = vm_domain_add_dev_info(dmar_domain, pdev);
	return ret;
}

static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);

	vm_domain_remove_one_dev_info(dmar_domain, pdev);
}

static int intel_iommu_map_range(struct iommu_domain *domain,
				 unsigned long iova, phys_addr_t hpa,
				 size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = domain->priv;
	u64 max_addr;
	int addr_width;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
	if (dmar_domain->max_addr < max_addr) {
		int min_agaw;
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		min_agaw = vm_domain_min_agaw(dmar_domain);
		addr_width = agaw_to_width(min_agaw);
		end = DOMAIN_MAX_ADDR(addr_width);
		end = end & VTD_PAGE_MASK;
		if (end < max_addr) {
			printk(KERN_ERR "%s: iommu agaw (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, min_agaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}

	ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
	return ret;
}

static void intel_iommu_unmap_range(struct iommu_domain *domain,
				    unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = domain->priv;
	dma_addr_t base;

	/* The address might not be aligned */
	base = iova & VTD_PAGE_MASK;
	size = VTD_PAGE_ALIGN(size);
	dma_pte_clear_range(dmar_domain, base, base + size);

	if (dmar_domain->max_addr == base + size)
		dmar_domain->max_addr = base;
}

static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    unsigned long iova)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct dma_pte *pte;
	u64 phys = 0;

	pte = addr_to_dma_pte(dmar_domain, iova);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}

static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
				      unsigned long cap)
{
	struct dmar_domain *dmar_domain = domain->priv;

	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return dmar_domain->iommu_snooping;

	return 0;
}

static struct iommu_ops intel_iommu_ops = {
	.domain_init = intel_iommu_domain_init,
	.domain_destroy = intel_iommu_domain_destroy,
	.attach_dev = intel_iommu_attach_device,
	.detach_dev = intel_iommu_detach_device,
	.map = intel_iommu_map_range,
	.unmap = intel_iommu_unmap_range,
	.iova_to_phys = intel_iommu_iova_to_phys,
	.domain_has_cap = intel_iommu_domain_has_cap,
};

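/*
 * Chipset quirk: some parts do not advertise the RWBF (required
 * write-buffer flush) capability even though they need it.  Setting
 * rwbf_quirk makes the driver perform write-buffer flushes regardless of
 * what the capability register reports.
 */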
static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it:
	 */
	printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);