2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/slab.h>
27 #include <linux/irq.h>
28 #include <linux/interrupt.h>
29 #include <linux/sysdev.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
37 #include "intel-iommu.h"
38 #include <asm/proto.h> /* force_iommu in this header on x86-64 */
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
43 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
44 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
46 #define IOAPIC_RANGE_START (0xfee00000)
47 #define IOAPIC_RANGE_END (0xfeefffff)
48 #define IOVA_START_ADDR (0x1000)
50 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
52 #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) /* 10sec */
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
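/*
 * Example: with the default 48-bit domain width,
 * DOMAIN_MAX_ADDR(48) == 0x0000ffffffffffff, i.e. the highest
 * address a domain's page tables can translate.
 */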
57 static void flush_unmaps_timeout(unsigned long data);
59 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
61 static struct intel_iommu *g_iommus;
63 #define HIGH_WATER_MARK 250
64 struct deferred_flush_tables {
66 struct iova *iova[HIGH_WATER_MARK];
67 struct dmar_domain *domain[HIGH_WATER_MARK];
70 static struct deferred_flush_tables *deferred_flush;
72 /* bitmap for indexing intel_iommus */
73 static int g_num_of_iommus;
75 static int rwbf_quirk;
77 static DEFINE_SPINLOCK(async_umap_flush_lock);
78 static LIST_HEAD(unmaps_to_do);
81 static long list_size;
83 static void domain_remove_dev_info(struct dmar_domain *domain);
85 static int dmar_disabled;
86 static int __initdata dmar_map_gfx = 1;
87 static int dmar_forcedac;
88 static int intel_iommu_strict;
90 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
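/*
 * Sentinel stored in dev.archdata.iommu to mark devices whose DMA should
 * bypass the IOMMU entirely (used e.g. for DMAR units that only cover
 * graphics devices).
 */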
91 static DEFINE_SPINLOCK(device_domain_lock);
92 static LIST_HEAD(device_domain_list);
94 static int __init intel_iommu_setup(char *str)
99 if (!strncmp(str, "off", 3)) {
101 printk(KERN_INFO "Intel-IOMMU: disabled\n");
102 } else if (!strncmp(str, "igfx_off", 8)) {
105 "Intel-IOMMU: disable GFX device mapping\n");
106 } else if (!strncmp(str, "forcedac", 8)) {
108 "Intel-IOMMU: Forcing DAC for PCI devices\n");
110 } else if (!strncmp(str, "strict", 6)) {
112 "Intel-IOMMU: disable batched IOTLB flush\n");
113 intel_iommu_strict = 1;
116 str += strcspn(str, ",");
122 __setup("intel_iommu=", intel_iommu_setup);
124 static struct kmem_cache *iommu_domain_cache;
125 static struct kmem_cache *iommu_devinfo_cache;
126 static struct kmem_cache *iommu_iova_cache;
128 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
133 /* trying to avoid low memory issues */
134 flags = current->flags & PF_MEMALLOC;
135 current->flags |= PF_MEMALLOC;
136 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
137 current->flags &= (~PF_MEMALLOC | flags);
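/*
 * Note on the PF_MEMALLOC dance above: the flag is set so the GFP_ATOMIC
 * allocation may dip into the emergency reserves, then restored to its
 * previous state: if the bit was already set, (~PF_MEMALLOC | flags) is
 * all ones and nothing is cleared; otherwise the bit is cleared again.
 */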
142 static inline void *alloc_pgtable_page(void)
147 /* trying to avoid low memory issues */
148 flags = current->flags & PF_MEMALLOC;
149 current->flags |= PF_MEMALLOC;
150 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
151 current->flags &= (~PF_MEMALLOC | flags);
155 static inline void free_pgtable_page(void *vaddr)
157 free_page((unsigned long)vaddr);
160 static inline void *alloc_domain_mem(void)
162 return iommu_kmem_cache_alloc(iommu_domain_cache);
165 static inline void free_domain_mem(void *vaddr)
167 kmem_cache_free(iommu_domain_cache, vaddr);
170 static inline void * alloc_devinfo_mem(void)
172 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
175 static inline void free_devinfo_mem(void *vaddr)
177 kmem_cache_free(iommu_devinfo_cache, vaddr);
180 struct iova *alloc_iova_mem(void)
182 return iommu_kmem_cache_alloc(iommu_iova_cache);
185 void free_iova_mem(struct iova *iova)
187 kmem_cache_free(iommu_iova_cache, iova);
190 static inline void __iommu_flush_cache(
191 struct intel_iommu *iommu, void *addr, int size)
193 if (!ecap_coherent(iommu->ecap))
194 clflush_cache_range(addr, size);
197 /* Gets context entry for a given bus and devfn */
198 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
201 struct root_entry *root;
202 struct context_entry *context;
203 unsigned long phy_addr;
206 spin_lock_irqsave(&iommu->lock, flags);
207 root = &iommu->root_entry[bus];
208 context = get_context_addr_from_root(root);
210 context = (struct context_entry *)alloc_pgtable_page();
212 spin_unlock_irqrestore(&iommu->lock, flags);
215 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
216 phy_addr = virt_to_phys((void *)context);
217 set_root_value(root, phy_addr);
218 set_root_present(root);
219 __iommu_flush_cache(iommu, root, sizeof(*root));
221 spin_unlock_irqrestore(&iommu->lock, flags);
222 return &context[devfn];
225 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
227 struct root_entry *root;
228 struct context_entry *context;
232 spin_lock_irqsave(&iommu->lock, flags);
233 root = &iommu->root_entry[bus];
234 context = get_context_addr_from_root(root);
239 ret = context_present(context[devfn]);
241 spin_unlock_irqrestore(&iommu->lock, flags);
245 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
247 struct root_entry *root;
248 struct context_entry *context;
251 spin_lock_irqsave(&iommu->lock, flags);
252 root = &iommu->root_entry[bus];
253 context = get_context_addr_from_root(root);
255 context_clear_entry(context[devfn]);
256 __iommu_flush_cache(iommu, &context[devfn],
259 spin_unlock_irqrestore(&iommu->lock, flags);
262 static void free_context_table(struct intel_iommu *iommu)
264 struct root_entry *root;
267 struct context_entry *context;
269 spin_lock_irqsave(&iommu->lock, flags);
270 if (!iommu->root_entry) {
273 for (i = 0; i < ROOT_ENTRY_NR; i++) {
274 root = &iommu->root_entry[i];
275 context = get_context_addr_from_root(root);
277 free_pgtable_page(context);
279 free_pgtable_page(iommu->root_entry);
280 iommu->root_entry = NULL;
282 spin_unlock_irqrestore(&iommu->lock, flags);
285 /* page table handling */
286 #define LEVEL_STRIDE (9)
287 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
289 static inline int agaw_to_level(int agaw)
294 static inline int agaw_to_width(int agaw)
296 return 30 + agaw * LEVEL_STRIDE;
300 static inline int width_to_agaw(int width)
302 return (width - 30) / LEVEL_STRIDE;
305 static inline unsigned int level_to_offset_bits(int level)
307 return (12 + (level - 1) * LEVEL_STRIDE);
310 static inline int address_level_offset(u64 addr, int level)
312 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
315 static inline u64 level_mask(int level)
317 return ((u64)-1 << level_to_offset_bits(level));
320 static inline u64 level_size(int level)
322 return ((u64)1 << level_to_offset_bits(level));
325 static inline u64 align_to_level(u64 addr, int level)
327 return ((addr + level_size(level) - 1) & level_mask(level));
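/*
 * Page table geometry: each level decodes LEVEL_STRIDE (9) address bits,
 * so level 1 covers address bits 12..20, level 2 bits 21..29, and so on.
 * A 39-bit adjusted width is thus decoded by three levels, and
 * address_level_offset(addr, 3) extracts bits 30..38 as the top-level index.
 */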
330 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
332 int addr_width = agaw_to_width(domain->agaw);
333 struct dma_pte *parent, *pte = NULL;
334 int level = agaw_to_level(domain->agaw);
338 BUG_ON(!domain->pgd);
340 addr &= (((u64)1) << addr_width) - 1;
341 parent = domain->pgd;
343 spin_lock_irqsave(&domain->mapping_lock, flags);
347 offset = address_level_offset(addr, level);
348 pte = &parent[offset];
352 if (!dma_pte_present(*pte)) {
353 tmp_page = alloc_pgtable_page();
356 spin_unlock_irqrestore(&domain->mapping_lock,
360 __iommu_flush_cache(domain->iommu, tmp_page,
362 dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
364 * higher level tables always set r/w; the last level page
365 * table controls read/write
367 dma_set_pte_readable(*pte);
368 dma_set_pte_writable(*pte);
369 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
371 parent = phys_to_virt(dma_pte_addr(*pte));
375 spin_unlock_irqrestore(&domain->mapping_lock, flags);
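/*
 * addr_to_dma_pte() walks the page table top-down and allocates any missing
 * intermediate table on demand; intermediate entries get both read and write
 * set, so the effective protection is controlled by the last-level PTE alone.
 */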
379 /* return address's pte at specific level */
380 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
383 struct dma_pte *parent, *pte = NULL;
384 int total = agaw_to_level(domain->agaw);
387 parent = domain->pgd;
388 while (level <= total) {
389 offset = address_level_offset(addr, total);
390 pte = &parent[offset];
394 if (!dma_pte_present(*pte))
396 parent = phys_to_virt(dma_pte_addr(*pte));
402 /* clear one page's page table */
403 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
405 struct dma_pte *pte = NULL;
407 /* get last level pte */
408 pte = dma_addr_level_pte(domain, addr, 1);
412 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
416 /* clear last level pte, a tlb flush should be followed */
417 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
419 int addr_width = agaw_to_width(domain->agaw);
421 start &= (((u64)1) << addr_width) - 1;
422 end &= (((u64)1) << addr_width) - 1;
423 /* in case it's a partial page */
424 start = PAGE_ALIGN_4K(start);
427 /* we don't need a lock here; nobody else touches this iova range */
428 while (start < end) {
429 dma_pte_clear_one(domain, start);
430 start += PAGE_SIZE_4K;
434 /* free page table pages. last level pte should already be cleared */
435 static void dma_pte_free_pagetable(struct dmar_domain *domain,
438 int addr_width = agaw_to_width(domain->agaw);
440 int total = agaw_to_level(domain->agaw);
444 start &= (((u64)1) << addr_width) - 1;
445 end &= (((u64)1) << addr_width) - 1;
447 /* we don't need a lock here; nobody else touches this iova range */
449 while (level <= total) {
450 tmp = align_to_level(start, level);
451 if (tmp >= end || (tmp + level_size(level) > end))
455 pte = dma_addr_level_pte(domain, tmp, level);
458 phys_to_virt(dma_pte_addr(*pte)));
460 __iommu_flush_cache(domain->iommu,
463 tmp += level_size(level);
468 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
469 free_pgtable_page(domain->pgd);
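/*
 * The pgd is only released here, when the caller tears down the whole
 * address space; the level-by-level loop above only frees lower-level
 * tables that are completely covered by [start, end].
 */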
475 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
477 struct root_entry *root;
480 root = (struct root_entry *)alloc_pgtable_page();
484 __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
486 spin_lock_irqsave(&iommu->lock, flags);
487 iommu->root_entry = root;
488 spin_unlock_irqrestore(&iommu->lock, flags);
493 #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
495 cycles_t start_time = get_cycles();\
497 sts = op (iommu->reg + offset);\
500 if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
501 panic("DMAR hardware is malfunctioning\n");\
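/*
 * IOMMU_WAIT_OP() keeps re-reading the given register with `op' until
 * `cond' becomes true, and panics if the hardware has not responded within
 * DMAR_OPERATION_TIMEOUT (roughly ten seconds worth of TSC cycles).
 */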
506 static void iommu_set_root_entry(struct intel_iommu *iommu)
512 addr = iommu->root_entry;
514 spin_lock_irqsave(&iommu->register_lock, flag);
515 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
517 cmd = iommu->gcmd | DMA_GCMD_SRTP;
518 writel(cmd, iommu->reg + DMAR_GCMD_REG);
520 /* Make sure hardware completes it */
521 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
522 readl, (sts & DMA_GSTS_RTPS), sts);
524 spin_unlock_irqrestore(&iommu->register_lock, flag);
527 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
532 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
534 val = iommu->gcmd | DMA_GCMD_WBF;
536 spin_lock_irqsave(&iommu->register_lock, flag);
537 writel(val, iommu->reg + DMAR_GCMD_REG);
539 /* Make sure hardware completes it */
540 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
541 readl, (!(val & DMA_GSTS_WBFS)), val);
543 spin_unlock_irqrestore(&iommu->register_lock, flag);
546 /* return value determines whether we need a write buffer flush */
547 static int __iommu_flush_context(struct intel_iommu *iommu,
548 u16 did, u16 source_id, u8 function_mask, u64 type,
549 int non_present_entry_flush)
555 * In the non-present entry flush case, if the hardware doesn't cache
556 * non-present entries we do nothing; if it does cache them, we flush
557 * the entries of domain 0 (that domain id is used to cache any
558 * non-present entries)
560 if (non_present_entry_flush) {
561 if (!cap_caching_mode(iommu->cap))
568 case DMA_CCMD_GLOBAL_INVL:
569 val = DMA_CCMD_GLOBAL_INVL;
571 case DMA_CCMD_DOMAIN_INVL:
572 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
574 case DMA_CCMD_DEVICE_INVL:
575 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
576 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
583 spin_lock_irqsave(&iommu->register_lock, flag);
584 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
586 /* Make sure hardware completes it */
587 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
588 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
590 spin_unlock_irqrestore(&iommu->register_lock, flag);
592 /* flush context entry will implicitly flush write buffer */
596 static int inline iommu_flush_context_global(struct intel_iommu *iommu,
597 int non_present_entry_flush)
599 return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
600 non_present_entry_flush);
603 static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
604 int non_present_entry_flush)
606 return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
607 non_present_entry_flush);
610 static int inline iommu_flush_context_device(struct intel_iommu *iommu,
611 u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
613 return __iommu_flush_context(iommu, did, source_id, function_mask,
614 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
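/*
 * The three wrappers above request context-cache invalidation at global,
 * domain and device granularity; they all funnel into __iommu_flush_context(),
 * whose return value tells the caller whether a write-buffer flush is still
 * needed.
 */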
617 /* return value determines whether we need a write buffer flush */
618 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
619 u64 addr, unsigned int size_order, u64 type,
620 int non_present_entry_flush)
622 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
623 u64 val = 0, val_iva = 0;
627 * In the non-present entry flush case, if the hardware doesn't cache
628 * non-present entries we do nothing; if it does cache them, we flush
629 * the entries of domain 0 (that domain id is used to cache any
630 * non-present entries)
632 if (non_present_entry_flush) {
633 if (!cap_caching_mode(iommu->cap))
640 case DMA_TLB_GLOBAL_FLUSH:
641 /* global flush doesn't need to set IVA_REG */
642 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
644 case DMA_TLB_DSI_FLUSH:
645 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
647 case DMA_TLB_PSI_FLUSH:
648 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
649 /* Note: always flush non-leaf currently */
650 val_iva = size_order | addr;
655 /* Note: set drain read/write */
658 * This is probably just to be extra safe; it looks like we could
659 * ignore it without any impact.
661 if (cap_read_drain(iommu->cap))
662 val |= DMA_TLB_READ_DRAIN;
664 if (cap_write_drain(iommu->cap))
665 val |= DMA_TLB_WRITE_DRAIN;
667 spin_lock_irqsave(&iommu->register_lock, flag);
668 /* Note: Only uses first TLB reg currently */
670 dmar_writeq(iommu->reg + tlb_offset, val_iva);
671 dmar_writeq(iommu->reg + tlb_offset + 8, val);
673 /* Make sure hardware completes it */
674 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
675 dmar_readq, (!(val & DMA_TLB_IVT)), val);
677 spin_unlock_irqrestore(&iommu->register_lock, flag);
679 /* check IOTLB invalidation granularity */
680 if (DMA_TLB_IAIG(val) == 0)
681 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
682 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
683 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
684 DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
685 /* flush iotlb entry will implicitly flush write buffer */
689 static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
690 int non_present_entry_flush)
692 return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
693 non_present_entry_flush);
696 static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
697 int non_present_entry_flush)
699 return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
700 non_present_entry_flush);
703 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
704 u64 addr, unsigned int pages, int non_present_entry_flush)
708 BUG_ON(addr & (~PAGE_MASK_4K));
711 /* Fallback to domain selective flush if no PSI support */
712 if (!cap_pgsel_inv(iommu->cap))
713 return iommu_flush_iotlb_dsi(iommu, did,
714 non_present_entry_flush);
717 * PSI requires the number of pages to be a power of 2, and the base
718 * address to be naturally aligned to that size
720 mask = ilog2(__roundup_pow_of_two(pages));
721 /* Fallback to domain selective flush if size is too big */
722 if (mask > cap_max_amask_val(iommu->cap))
723 return iommu_flush_iotlb_dsi(iommu, did,
724 non_present_entry_flush);
726 return __iommu_flush_iotlb(iommu, did, addr, mask,
727 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
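/*
 * Worked example for the mask above: flushing 3 pages gives
 * mask = ilog2(__roundup_pow_of_two(3)) = 2, so an aligned 4-page region is
 * invalidated; if the mask exceeds cap_max_amask_val() we fall back to a
 * domain-selective flush instead.
 */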
730 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
735 spin_lock_irqsave(&iommu->register_lock, flags);
736 pmen = readl(iommu->reg + DMAR_PMEN_REG);
737 pmen &= ~DMA_PMEN_EPM;
738 writel(pmen, iommu->reg + DMAR_PMEN_REG);
740 /* wait for the protected region status bit to clear */
741 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
742 readl, !(pmen & DMA_PMEN_PRS), pmen);
744 spin_unlock_irqrestore(&iommu->register_lock, flags);
747 static int iommu_enable_translation(struct intel_iommu *iommu)
752 spin_lock_irqsave(&iommu->register_lock, flags);
753 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
755 /* Make sure hardware completes it */
756 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
757 readl, (sts & DMA_GSTS_TES), sts);
759 iommu->gcmd |= DMA_GCMD_TE;
760 spin_unlock_irqrestore(&iommu->register_lock, flags);
764 static int iommu_disable_translation(struct intel_iommu *iommu)
769 spin_lock_irqsave(&iommu->register_lock, flag);
770 iommu->gcmd &= ~DMA_GCMD_TE;
771 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
773 /* Make sure hardware completes it */
774 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
775 readl, (!(sts & DMA_GSTS_TES)), sts);
777 spin_unlock_irqrestore(&iommu->register_lock, flag);
781 /* iommu interrupt handling. Most of it is MSI-like. */
783 static const char *fault_reason_strings[] =
786 "Present bit in root entry is clear",
787 "Present bit in context entry is clear",
788 "Invalid context entry",
789 "Access beyond MGAW",
790 "PTE Write access is not set",
791 "PTE Read access is not set",
792 "Next page table ptr is invalid",
793 "Root table address invalid",
794 "Context table ptr is invalid",
795 "non-zero reserved fields in RTP",
796 "non-zero reserved fields in CTP",
797 "non-zero reserved fields in PTE",
799 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
801 const char *dmar_get_fault_reason(u8 fault_reason)
803 if (fault_reason > MAX_FAULT_REASON_IDX)
806 return fault_reason_strings[fault_reason];
809 void dmar_msi_unmask(unsigned int irq)
811 struct intel_iommu *iommu = get_irq_data(irq);
815 spin_lock_irqsave(&iommu->register_lock, flag);
816 writel(0, iommu->reg + DMAR_FECTL_REG);
817 /* Read a reg to force flush the posted write */
818 readl(iommu->reg + DMAR_FECTL_REG);
819 spin_unlock_irqrestore(&iommu->register_lock, flag);
822 void dmar_msi_mask(unsigned int irq)
825 struct intel_iommu *iommu = get_irq_data(irq);
828 spin_lock_irqsave(&iommu->register_lock, flag);
829 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
830 /* Read a reg to force flush the posted write */
831 readl(iommu->reg + DMAR_FECTL_REG);
832 spin_unlock_irqrestore(&iommu->register_lock, flag);
835 void dmar_msi_write(int irq, struct msi_msg *msg)
837 struct intel_iommu *iommu = get_irq_data(irq);
840 spin_lock_irqsave(&iommu->register_lock, flag);
841 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
842 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
843 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
844 spin_unlock_irqrestore(&iommu->register_lock, flag);
847 void dmar_msi_read(int irq, struct msi_msg *msg)
849 struct intel_iommu *iommu = get_irq_data(irq);
852 spin_lock_irqsave(&iommu->register_lock, flag);
853 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
854 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
855 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
856 spin_unlock_irqrestore(&iommu->register_lock, flag);
859 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
860 u8 fault_reason, u16 source_id, u64 addr)
864 reason = dmar_get_fault_reason(fault_reason);
867 "DMAR:[%s] Request device [%02x:%02x.%d] "
869 "DMAR:[fault reason %02d] %s\n",
870 (type ? "DMA Read" : "DMA Write"),
871 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
872 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
876 #define PRIMARY_FAULT_REG_LEN (16)
877 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
879 struct intel_iommu *iommu = dev_id;
880 int reg, fault_index;
884 spin_lock_irqsave(&iommu->register_lock, flag);
885 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
887 /* TBD: ignore advanced fault log currently */
888 if (!(fault_status & DMA_FSTS_PPF))
891 fault_index = dma_fsts_fault_record_index(fault_status);
892 reg = cap_fault_reg_offset(iommu->cap);
900 /* highest 32 bits */
901 data = readl(iommu->reg + reg +
902 fault_index * PRIMARY_FAULT_REG_LEN + 12);
903 if (!(data & DMA_FRCD_F))
906 fault_reason = dma_frcd_fault_reason(data);
907 type = dma_frcd_type(data);
909 data = readl(iommu->reg + reg +
910 fault_index * PRIMARY_FAULT_REG_LEN + 8);
911 source_id = dma_frcd_source_id(data);
913 guest_addr = dmar_readq(iommu->reg + reg +
914 fault_index * PRIMARY_FAULT_REG_LEN);
915 guest_addr = dma_frcd_page_addr(guest_addr);
916 /* clear the fault */
917 writel(DMA_FRCD_F, iommu->reg + reg +
918 fault_index * PRIMARY_FAULT_REG_LEN + 12);
920 spin_unlock_irqrestore(&iommu->register_lock, flag);
922 iommu_page_fault_do_one(iommu, type, fault_reason,
923 source_id, guest_addr);
926 if (fault_index > cap_num_fault_regs(iommu->cap))
928 spin_lock_irqsave(&iommu->register_lock, flag);
931 /* clear primary fault overflow */
932 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
933 if (fault_status & DMA_FSTS_PFO)
934 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
936 spin_unlock_irqrestore(&iommu->register_lock, flag);
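/*
 * Each primary fault record is 16 bytes: the low 8 bytes hold the faulting
 * page address, bytes 8-11 the source-id, and bytes 12-15 the fault reason,
 * request type and the F bit, which is written back above to clear the record.
 */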
940 int dmar_set_interrupt(struct intel_iommu *iommu)
946 printk(KERN_ERR "IOMMU: no free vectors\n");
950 set_irq_data(irq, iommu);
953 ret = arch_setup_dmar_msi(irq);
955 set_irq_data(irq, NULL);
961 /* Make sure the fault registers start out cleared */
962 iommu_page_fault(irq, iommu);
964 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
966 printk(KERN_ERR "IOMMU: can't request irq\n");
970 static int iommu_init_domains(struct intel_iommu *iommu)
972 unsigned long ndomains;
973 unsigned long nlongs;
975 ndomains = cap_ndoms(iommu->cap);
976 pr_debug("Number of Domains supported <%ld>\n", ndomains);
977 nlongs = BITS_TO_LONGS(ndomains);
979 /* TBD: there might be 64K domains;
980 * consider a different allocation scheme for future chips
982 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
983 if (!iommu->domain_ids) {
984 printk(KERN_ERR "Allocating domain id array failed\n");
987 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
989 if (!iommu->domains) {
990 printk(KERN_ERR "Allocating domain array failed\n");
991 kfree(iommu->domain_ids);
996 * If Caching mode is set, then invalid translations are tagged
997 * with domain id 0. Hence we need to pre-allocate it.
999 if (cap_caching_mode(iommu->cap))
1000 set_bit(0, iommu->domain_ids);
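/*
 * domain_ids is a bitmap of allocated domain numbers and domains[] maps each
 * id back to its struct dmar_domain; with caching mode, id 0 stays reserved
 * for tagging non-present entries and is never handed out.
 */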
1003 static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
1004 struct dmar_drhd_unit *drhd)
1010 iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
1012 printk(KERN_ERR "IOMMU: can't map the region\n");
1015 iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
1016 iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
1018 /* the registers might be more than one page */
1019 map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
1020 cap_max_fault_reg_offset(iommu->cap));
1021 map_size = PAGE_ALIGN_4K(map_size);
1022 if (map_size > PAGE_SIZE_4K) {
1023 iounmap(iommu->reg);
1024 iommu->reg = ioremap(drhd->reg_base_addr, map_size);
1026 printk(KERN_ERR "IOMMU: can't map the region\n");
1031 ver = readl(iommu->reg + DMAR_VER_REG);
1032 pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
1033 drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
1034 iommu->cap, iommu->ecap);
1035 ret = iommu_init_domains(iommu);
1038 spin_lock_init(&iommu->lock);
1039 spin_lock_init(&iommu->register_lock);
1041 drhd->iommu = iommu;
1044 iounmap(iommu->reg);
1050 static void domain_exit(struct dmar_domain *domain);
1051 static void free_iommu(struct intel_iommu *iommu)
1053 struct dmar_domain *domain;
1059 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1060 for (; i < cap_ndoms(iommu->cap); ) {
1061 domain = iommu->domains[i];
1062 clear_bit(i, iommu->domain_ids);
1063 domain_exit(domain);
1064 i = find_next_bit(iommu->domain_ids,
1065 cap_ndoms(iommu->cap), i+1);
1068 if (iommu->gcmd & DMA_GCMD_TE)
1069 iommu_disable_translation(iommu);
1072 set_irq_data(iommu->irq, NULL);
1073 /* This will mask the irq */
1074 free_irq(iommu->irq, iommu);
1075 destroy_irq(iommu->irq);
1078 kfree(iommu->domains);
1079 kfree(iommu->domain_ids);
1081 /* free context mapping */
1082 free_context_table(iommu);
1085 iounmap(iommu->reg);
1089 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1092 unsigned long ndomains;
1093 struct dmar_domain *domain;
1094 unsigned long flags;
1096 domain = alloc_domain_mem();
1100 ndomains = cap_ndoms(iommu->cap);
1102 spin_lock_irqsave(&iommu->lock, flags);
1103 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1104 if (num >= ndomains) {
1105 spin_unlock_irqrestore(&iommu->lock, flags);
1106 free_domain_mem(domain);
1107 printk(KERN_ERR "IOMMU: no free domain ids\n");
1111 set_bit(num, iommu->domain_ids);
1113 domain->iommu = iommu;
1114 iommu->domains[num] = domain;
1115 spin_unlock_irqrestore(&iommu->lock, flags);
1120 static void iommu_free_domain(struct dmar_domain *domain)
1122 unsigned long flags;
1124 spin_lock_irqsave(&domain->iommu->lock, flags);
1125 clear_bit(domain->id, domain->iommu->domain_ids);
1126 spin_unlock_irqrestore(&domain->iommu->lock, flags);
1129 static struct iova_domain reserved_iova_list;
1130 static struct lock_class_key reserved_alloc_key;
1131 static struct lock_class_key reserved_rbtree_key;
1133 static void dmar_init_reserved_ranges(void)
1135 struct pci_dev *pdev = NULL;
1140 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1142 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1143 &reserved_alloc_key);
1144 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1145 &reserved_rbtree_key);
1147 /* IOAPIC ranges shouldn't be accessed by DMA */
1148 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1149 IOVA_PFN(IOAPIC_RANGE_END));
1151 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1153 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1154 for_each_pci_dev(pdev) {
1157 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1158 r = &pdev->resource[i];
1159 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1162 addr &= PAGE_MASK_4K;
1163 size = r->end - addr;
1164 size = PAGE_ALIGN_4K(size);
1165 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1166 IOVA_PFN(size + addr) - 1);
1168 printk(KERN_ERR "Reserve iova failed\n");
1174 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1176 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
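/*
 * Every new domain starts out with the globally reserved ranges (the IOAPIC
 * window and all PCI MMIO resources) copied into its own iova allocator, so
 * DMA addresses handed out later can never collide with those regions.
 */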
1179 static inline int guestwidth_to_adjustwidth(int gaw)
1182 int r = (gaw - 12) % 9;
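/*
 * The adjusted widths supported by the page-table code are 12 page-offset
 * bits plus a whole number of 9-bit levels (30, 39, 48, ...); `r' is the
 * remainder used to round the guest width up until (gaw - 12) is a
 * multiple of 9.
 */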
1193 static int domain_init(struct dmar_domain *domain, int guest_width)
1195 struct intel_iommu *iommu;
1196 int adjust_width, agaw;
1197 unsigned long sagaw;
1199 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1200 spin_lock_init(&domain->mapping_lock);
1202 domain_reserve_special_ranges(domain);
1204 /* calculate AGAW */
1205 iommu = domain->iommu;
1206 if (guest_width > cap_mgaw(iommu->cap))
1207 guest_width = cap_mgaw(iommu->cap);
1208 domain->gaw = guest_width;
1209 adjust_width = guestwidth_to_adjustwidth(guest_width);
1210 agaw = width_to_agaw(adjust_width);
1211 sagaw = cap_sagaw(iommu->cap);
1212 if (!test_bit(agaw, &sagaw)) {
1213 /* hardware doesn't support it, choose a bigger one */
1214 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1215 agaw = find_next_bit(&sagaw, 5, agaw);
1219 domain->agaw = agaw;
1220 INIT_LIST_HEAD(&domain->devices);
1222 /* always allocate the top pgd */
1223 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1226 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1230 static void domain_exit(struct dmar_domain *domain)
1234 /* Domain 0 is reserved, so don't process it */
1238 domain_remove_dev_info(domain);
1240 put_iova_domain(&domain->iovad);
1241 end = DOMAIN_MAX_ADDR(domain->gaw);
1242 end = end & (~PAGE_MASK_4K);
1245 dma_pte_clear_range(domain, 0, end);
1247 /* free page tables */
1248 dma_pte_free_pagetable(domain, 0, end);
1250 iommu_free_domain(domain);
1251 free_domain_mem(domain);
1254 static int domain_context_mapping_one(struct dmar_domain *domain,
1257 struct context_entry *context;
1258 struct intel_iommu *iommu = domain->iommu;
1259 unsigned long flags;
1261 pr_debug("Set context mapping for %02x:%02x.%d\n",
1262 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1263 BUG_ON(!domain->pgd);
1264 context = device_to_context_entry(iommu, bus, devfn);
1267 spin_lock_irqsave(&iommu->lock, flags);
1268 if (context_present(*context)) {
1269 spin_unlock_irqrestore(&iommu->lock, flags);
1273 context_set_domain_id(*context, domain->id);
1274 context_set_address_width(*context, domain->agaw);
1275 context_set_address_root(*context, virt_to_phys(domain->pgd));
1276 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1277 context_set_fault_enable(*context);
1278 context_set_present(*context);
1279 __iommu_flush_cache(iommu, context, sizeof(*context));
1281 /* it's a non-present to present mapping */
1282 if (iommu_flush_context_device(iommu, domain->id,
1283 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1284 iommu_flush_write_buffer(iommu);
1286 iommu_flush_iotlb_dsi(iommu, 0, 0);
1287 spin_unlock_irqrestore(&iommu->lock, flags);
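/*
 * Non-present to present transition: if the hardware does not cache
 * non-present entries, the context flush is skipped and a write-buffer flush
 * suffices; otherwise the stale entries tagged with domain id 0 are purged
 * via the domain-selective IOTLB flush.
 */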
1292 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1295 struct pci_dev *tmp, *parent;
1297 ret = domain_context_mapping_one(domain, pdev->bus->number,
1302 /* dependent device mapping */
1303 tmp = pci_find_upstream_pcie_bridge(pdev);
1306 /* Secondary interface's bus number and devfn 0 */
1307 parent = pdev->bus->self;
1308 while (parent != tmp) {
1309 ret = domain_context_mapping_one(domain, parent->bus->number,
1313 parent = parent->bus->self;
1315 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1316 return domain_context_mapping_one(domain,
1317 tmp->subordinate->number, 0);
1318 else /* this is a legacy PCI bridge */
1319 return domain_context_mapping_one(domain,
1320 tmp->bus->number, tmp->devfn);
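/*
 * Devices behind a PCIe-to-PCI(-X) bridge get a context entry for the
 * bridge's secondary bus (devfn 0), and every bridge on the path is mapped
 * as well, since DMA from such devices typically carries the bridge's
 * source-id rather than the device's own.
 */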
1323 static int domain_context_mapped(struct dmar_domain *domain,
1324 struct pci_dev *pdev)
1327 struct pci_dev *tmp, *parent;
1329 ret = device_context_mapped(domain->iommu,
1330 pdev->bus->number, pdev->devfn);
1333 /* dependent device mapping */
1334 tmp = pci_find_upstream_pcie_bridge(pdev);
1337 /* Secondary interface's bus number and devfn 0 */
1338 parent = pdev->bus->self;
1339 while (parent != tmp) {
1340 ret = device_context_mapped(domain->iommu, parent->bus->number,
1344 parent = parent->bus->self;
1347 return device_context_mapped(domain->iommu,
1348 tmp->subordinate->number, 0);
1350 return device_context_mapped(domain->iommu,
1351 tmp->bus->number, tmp->devfn);
1355 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1356 u64 hpa, size_t size, int prot)
1358 u64 start_pfn, end_pfn;
1359 struct dma_pte *pte;
1362 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1364 iova &= PAGE_MASK_4K;
1365 start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1366 end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1368 while (start_pfn < end_pfn) {
1369 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1372 /* We don't need a lock here; nobody else
1373 * touches this iova range
1375 BUG_ON(dma_pte_addr(*pte));
1376 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1377 dma_set_pte_prot(*pte, prot);
1378 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
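/*
 * Each iteration fills in one 4K PTE and flushes it from the CPU cache;
 * the IOTLB (or write buffer) flush is left to the caller once the whole
 * range has been mapped.
 */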
1385 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1387 clear_context_table(domain->iommu, bus, devfn);
1388 iommu_flush_context_global(domain->iommu, 0);
1389 iommu_flush_iotlb_global(domain->iommu, 0);
1392 static void domain_remove_dev_info(struct dmar_domain *domain)
1394 struct device_domain_info *info;
1395 unsigned long flags;
1397 spin_lock_irqsave(&device_domain_lock, flags);
1398 while (!list_empty(&domain->devices)) {
1399 info = list_entry(domain->devices.next,
1400 struct device_domain_info, link);
1401 list_del(&info->link);
1402 list_del(&info->global);
1404 info->dev->dev.archdata.iommu = NULL;
1405 spin_unlock_irqrestore(&device_domain_lock, flags);
1407 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1408 free_devinfo_mem(info);
1410 spin_lock_irqsave(&device_domain_lock, flags);
1412 spin_unlock_irqrestore(&device_domain_lock, flags);
1417 * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1419 struct dmar_domain *
1420 find_domain(struct pci_dev *pdev)
1422 struct device_domain_info *info;
1424 /* No lock here; we assume no domain exits in the normal case */
1425 info = pdev->dev.archdata.iommu;
1427 return info->domain;
1431 static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
1432 struct pci_dev *dev)
1437 for (index = 0; index < cnt; index++)
1438 if (dev == devices[index])
1441 /* Check our parent */
1442 dev = dev->bus->self;
1448 static struct dmar_drhd_unit *
1449 dmar_find_matched_drhd_unit(struct pci_dev *dev)
1451 struct dmar_drhd_unit *drhd = NULL;
1453 list_for_each_entry(drhd, &dmar_drhd_units, list) {
1454 if (drhd->include_all || dmar_pci_device_match(drhd->devices,
1455 drhd->devices_cnt, dev))
1462 /* domain is initialized */
1463 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1465 struct dmar_domain *domain, *found = NULL;
1466 struct intel_iommu *iommu;
1467 struct dmar_drhd_unit *drhd;
1468 struct device_domain_info *info, *tmp;
1469 struct pci_dev *dev_tmp;
1470 unsigned long flags;
1471 int bus = 0, devfn = 0;
1473 domain = find_domain(pdev);
1477 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1479 if (dev_tmp->is_pcie) {
1480 bus = dev_tmp->subordinate->number;
1483 bus = dev_tmp->bus->number;
1484 devfn = dev_tmp->devfn;
1486 spin_lock_irqsave(&device_domain_lock, flags);
1487 list_for_each_entry(info, &device_domain_list, global) {
1488 if (info->bus == bus && info->devfn == devfn) {
1489 found = info->domain;
1493 spin_unlock_irqrestore(&device_domain_lock, flags);
1494 /* the pcie-pci bridge already has a domain, use it */
1501 /* Allocate new domain for the device */
1502 drhd = dmar_find_matched_drhd_unit(pdev);
1504 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1508 iommu = drhd->iommu;
1510 domain = iommu_alloc_domain(iommu);
1514 if (domain_init(domain, gaw)) {
1515 domain_exit(domain);
1519 /* register pcie-to-pci device */
1521 info = alloc_devinfo_mem();
1523 domain_exit(domain);
1527 info->devfn = devfn;
1529 info->domain = domain;
1530 /* This domain is shared by devices under p2p bridge */
1531 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1533 /* the pcie-to-pci bridge already has a domain, use it */
1535 spin_lock_irqsave(&device_domain_lock, flags);
1536 list_for_each_entry(tmp, &device_domain_list, global) {
1537 if (tmp->bus == bus && tmp->devfn == devfn) {
1538 found = tmp->domain;
1543 free_devinfo_mem(info);
1544 domain_exit(domain);
1547 list_add(&info->link, &domain->devices);
1548 list_add(&info->global, &device_domain_list);
1550 spin_unlock_irqrestore(&device_domain_lock, flags);
1554 info = alloc_devinfo_mem();
1557 info->bus = pdev->bus->number;
1558 info->devfn = pdev->devfn;
1560 info->domain = domain;
1561 spin_lock_irqsave(&device_domain_lock, flags);
1562 /* somebody else beat us to it */
1563 found = find_domain(pdev);
1564 if (found != NULL) {
1565 spin_unlock_irqrestore(&device_domain_lock, flags);
1566 if (found != domain) {
1567 domain_exit(domain);
1570 free_devinfo_mem(info);
1573 list_add(&info->link, &domain->devices);
1574 list_add(&info->global, &device_domain_list);
1575 pdev->dev.archdata.iommu = info;
1576 spin_unlock_irqrestore(&device_domain_lock, flags);
1579 /* recheck here; somebody else may have set it meanwhile */
1580 return find_domain(pdev);
1583 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1585 struct dmar_domain *domain;
1591 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1592 pci_name(pdev), start, end);
1593 /* page table init */
1594 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1598 /* The address might not be aligned */
1599 base = start & PAGE_MASK_4K;
1601 size = PAGE_ALIGN_4K(size);
1602 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1603 IOVA_PFN(base + size) - 1)) {
1604 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1609 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1610 size, base, pci_name(pdev));
1612 * RMRR range might have overlap with physical memory range,
1615 dma_pte_clear_range(domain, base, base + size);
1617 ret = domain_page_mapping(domain, base, base, size,
1618 DMA_PTE_READ|DMA_PTE_WRITE);
1622 /* context entry init */
1623 ret = domain_context_mapping(domain, pdev);
1627 domain_exit(domain);
1632 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1633 struct pci_dev *pdev)
1635 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1637 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1638 rmrr->end_address + 1);
1641 #ifdef CONFIG_DMAR_GFX_WA
1642 struct iommu_prepare_data {
1643 struct pci_dev *pdev;
1647 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1648 unsigned long end_pfn, void *datax)
1650 struct iommu_prepare_data *data;
1652 data = (struct iommu_prepare_data *)datax;
1654 data->ret = iommu_prepare_identity_map(data->pdev,
1655 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1660 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1663 struct iommu_prepare_data data;
1668 for_each_online_node(nid) {
1669 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1676 static void __init iommu_prepare_gfx_mapping(void)
1678 struct pci_dev *pdev = NULL;
1681 for_each_pci_dev(pdev) {
1682 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1683 !IS_GFX_DEVICE(pdev))
1685 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1687 ret = iommu_prepare_with_active_regions(pdev);
1689 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1694 #ifdef CONFIG_DMAR_FLOPPY_WA
1695 static inline void iommu_prepare_isa(void)
1697 struct pci_dev *pdev;
1700 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1704 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1705 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1708 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1709 "floppy might not work\n");
1713 static inline void iommu_prepare_isa(void)
1717 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1719 int __init init_dmars(void)
1721 struct dmar_drhd_unit *drhd;
1722 struct dmar_rmrr_unit *rmrr;
1723 struct pci_dev *pdev;
1724 struct intel_iommu *iommu;
1725 int i, ret, unit = 0;
1730 * initialize and program root entry to not present
1733 for_each_drhd_unit(drhd) {
1738 * no lock needed, as this is only incremented in the single
1739 * threaded kernel __init code path; all other accesses are reads
1744 g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
1750 deferred_flush = kzalloc(g_num_of_iommus *
1751 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1752 if (!deferred_flush) {
1758 for_each_drhd_unit(drhd) {
1761 iommu = alloc_iommu(&g_iommus[i], drhd);
1770 * we could share the same root & context tables
1771 * among all IOMMUs. Need to split it later.
1773 ret = iommu_alloc_root_entry(iommu);
1775 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1782 * for each dev attached to rmrr
1784 * locate drhd for dev, alloc domain for dev
1785 * allocate free domain
1786 * allocate page table entries for rmrr
1787 * if context not allocated for bus
1788 * allocate and init context
1789 * set present in root table for this bus
1790 * init context with domain, translation etc
1794 for_each_rmrr_units(rmrr) {
1795 for (i = 0; i < rmrr->devices_cnt; i++) {
1796 pdev = rmrr->devices[i];
1797 /* some BIOSes list non-existent devices in the DMAR table */
1800 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1803 "IOMMU: mapping reserved region failed\n");
1807 iommu_prepare_gfx_mapping();
1809 iommu_prepare_isa();
1814 * global invalidate context cache
1815 * global invalidate iotlb
1816 * enable translation
1818 for_each_drhd_unit(drhd) {
1821 iommu = drhd->iommu;
1822 sprintf(iommu->name, "dmar%d", unit++);
1824 iommu_flush_write_buffer(iommu);
1826 ret = dmar_set_interrupt(iommu);
1830 iommu_set_root_entry(iommu);
1832 iommu_flush_context_global(iommu, 0);
1833 iommu_flush_iotlb_global(iommu, 0);
1835 iommu_disable_protect_mem_regions(iommu);
1837 ret = iommu_enable_translation(iommu);
1844 for_each_drhd_unit(drhd) {
1847 iommu = drhd->iommu;
1854 static inline u64 aligned_size(u64 host_addr, size_t size)
1857 addr = (host_addr & (~PAGE_MASK_4K)) + size;
1858 return PAGE_ALIGN_4K(addr);
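/*
 * Example: aligned_size(0x1234, 0x100) returns PAGE_ALIGN_4K(0x234 + 0x100)
 * = 0x1000, i.e. the buffer's offset within its first page is included so
 * that whole 4K pages get mapped.
 */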
1862 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1866 /* Make sure it's in range */
1867 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1868 if (!size || (IOVA_START_ADDR + size > end))
1871 piova = alloc_iova(&domain->iovad,
1872 size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1876 static struct iova *
1877 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1880 struct pci_dev *pdev = to_pci_dev(dev);
1881 struct iova *iova = NULL;
1883 if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1884 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1887 * First try to allocate an io virtual address in
1888 * DMA_32BIT_MASK and if that fails then try allocating
1891 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1893 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1897 printk(KERN_ERR "Allocating iova for %s failed", pci_name(pdev));
1904 static struct dmar_domain *
1905 get_valid_domain_for_dev(struct pci_dev *pdev)
1907 struct dmar_domain *domain;
1910 domain = get_domain_for_dev(pdev,
1911 DEFAULT_DOMAIN_ADDRESS_WIDTH);
1914 "Allocating domain for %s failed", pci_name(pdev));
1918 /* make sure context mapping is ok */
1919 if (unlikely(!domain_context_mapped(domain, pdev))) {
1920 ret = domain_context_mapping(domain, pdev);
1923 "Domain context map for %s failed",
1933 intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
1935 struct pci_dev *pdev = to_pci_dev(hwdev);
1936 struct dmar_domain *domain;
1937 unsigned long start_paddr;
1942 BUG_ON(dir == DMA_NONE);
1943 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1946 domain = get_valid_domain_for_dev(pdev);
1950 size = aligned_size((u64)paddr, size);
1952 iova = __intel_alloc_iova(hwdev, domain, size);
1956 start_paddr = iova->pfn_lo << PAGE_SHIFT_4K;
1959 * Check if DMAR supports zero-length reads on write only
1962 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
1963 !cap_zlr(domain->iommu->cap))
1964 prot |= DMA_PTE_READ;
1965 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1966 prot |= DMA_PTE_WRITE;
1968 * paddr .. (paddr + size) might span a partial page, so we should map
1969 * the whole page. Note: if two parts of one page are mapped separately,
1970 * we might end up with two guest_addrs mapping to the same host paddr,
1971 * but this is not a big problem
1973 ret = domain_page_mapping(domain, start_paddr,
1974 ((u64)paddr) & PAGE_MASK_4K, size, prot);
1978 pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1979 pci_name(pdev), size, (u64)paddr,
1980 size, (u64)start_paddr, dir);
1982 /* it's a non-present to present mapping */
1983 ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1984 start_paddr, size >> PAGE_SHIFT_4K, 1);
1986 iommu_flush_write_buffer(domain->iommu);
1988 return (start_paddr + ((u64)paddr & (~PAGE_MASK_4K)));
1992 __free_iova(&domain->iovad, iova);
1993 printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
1994 pci_name(pdev), size, (u64)paddr, dir);
1998 static void flush_unmaps(void)
2004 /* just flush them all */
2005 for (i = 0; i < g_num_of_iommus; i++) {
2006 if (deferred_flush[i].next) {
2007 iommu_flush_iotlb_global(&g_iommus[i], 0);
2008 for (j = 0; j < deferred_flush[i].next; j++) {
2009 __free_iova(&deferred_flush[i].domain[j]->iovad,
2010 deferred_flush[i].iova[j]);
2012 deferred_flush[i].next = 0;
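/*
 * flush_unmaps() issues one global IOTLB flush per IOMMU and only then
 * returns the queued iovas to the allocator, so a stale translation can
 * never be reused before it has been purged from the TLB.
 */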
2019 static void flush_unmaps_timeout(unsigned long data)
2021 unsigned long flags;
2023 spin_lock_irqsave(&async_umap_flush_lock, flags);
2025 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2028 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2030 unsigned long flags;
2033 spin_lock_irqsave(&async_umap_flush_lock, flags);
2034 if (list_size == HIGH_WATER_MARK)
2037 iommu_id = dom->iommu - g_iommus;
2038 next = deferred_flush[iommu_id].next;
2039 deferred_flush[iommu_id].domain[next] = dom;
2040 deferred_flush[iommu_id].iova[next] = iova;
2041 deferred_flush[iommu_id].next++;
2044 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2048 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
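/*
 * Deferred unmapping: each iova is parked in the per-IOMMU deferred_flush
 * table and released either when HIGH_WATER_MARK entries have accumulated
 * or when the 10ms unmap_timer fires, so the expensive IOTLB flushes are
 * batched.
 */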
2051 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
2052 size_t size, int dir)
2054 struct pci_dev *pdev = to_pci_dev(dev);
2055 struct dmar_domain *domain;
2056 unsigned long start_addr;
2059 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2061 domain = find_domain(pdev);
2064 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2068 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2069 size = aligned_size((u64)dev_addr, size);
2071 pr_debug("Device %s unmapping: %lx@%llx\n",
2072 pci_name(pdev), size, (u64)start_addr);
2074 /* clear the whole page */
2075 dma_pte_clear_range(domain, start_addr, start_addr + size);
2076 /* free page tables */
2077 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2078 if (intel_iommu_strict) {
2079 if (iommu_flush_iotlb_psi(domain->iommu,
2080 domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
2081 iommu_flush_write_buffer(domain->iommu);
2083 __free_iova(&domain->iovad, iova);
2085 add_unmap(domain, iova);
2087 * queue up the release of the unmap to save roughly the 1/6th of
2088 * the cpu time otherwise spent on the iotlb flush operation...
2093 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
2094 dma_addr_t *dma_handle, gfp_t flags)
2099 size = PAGE_ALIGN_4K(size);
2100 order = get_order(size);
2101 flags &= ~(GFP_DMA | GFP_DMA32);
2103 vaddr = (void *)__get_free_pages(flags, order);
2106 memset(vaddr, 0, size);
2108 *dma_handle = intel_map_single(hwdev, virt_to_bus(vaddr), size, DMA_BIDIRECTIONAL);
2111 free_pages((unsigned long)vaddr, order);
2115 static void intel_free_coherent(struct device *hwdev, size_t size,
2116 void *vaddr, dma_addr_t dma_handle)
2120 size = PAGE_ALIGN_4K(size);
2121 order = get_order(size);
2123 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2124 free_pages((unsigned long)vaddr, order);
2127 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2128 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2129 int nelems, int dir)
2132 struct pci_dev *pdev = to_pci_dev(hwdev);
2133 struct dmar_domain *domain;
2134 unsigned long start_addr;
2138 struct scatterlist *sg;
2140 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2143 domain = find_domain(pdev);
2145 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2148 for_each_sg(sglist, sg, nelems, i) {
2149 addr = SG_ENT_VIRT_ADDRESS(sg);
2150 size += aligned_size((u64)addr, sg->length);
2153 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2155 /* clear the whole page */
2156 dma_pte_clear_range(domain, start_addr, start_addr + size);
2157 /* free page tables */
2158 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2160 if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2161 size >> PAGE_SHIFT_4K, 0))
2162 iommu_flush_write_buffer(domain->iommu);
2165 __free_iova(&domain->iovad, iova);
2168 static int intel_nontranslate_map_sg(struct device *hddev,
2169 struct scatterlist *sglist, int nelems, int dir)
2172 struct scatterlist *sg;
2174 for_each_sg(sglist, sg, nelems, i) {
2175 BUG_ON(!sg_page(sg));
2176 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2177 sg->dma_length = sg->length;
2182 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2183 int nelems, int dir)
2187 struct pci_dev *pdev = to_pci_dev(hwdev);
2188 struct dmar_domain *domain;
2192 struct iova *iova = NULL;
2194 struct scatterlist *sg;
2195 unsigned long start_addr;
2197 BUG_ON(dir == DMA_NONE);
2198 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2199 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2201 domain = get_valid_domain_for_dev(pdev);
2205 for_each_sg(sglist, sg, nelems, i) {
2206 addr = SG_ENT_VIRT_ADDRESS(sg);
2207 addr = (void *)virt_to_phys(addr);
2208 size += aligned_size((u64)addr, sg->length);
2211 iova = __intel_alloc_iova(hwdev, domain, size);
2213 sglist->dma_length = 0;
2218 * Check if DMAR supports zero-length reads on write only
2221 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2222 !cap_zlr(domain->iommu->cap))
2223 prot |= DMA_PTE_READ;
2224 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2225 prot |= DMA_PTE_WRITE;
2227 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2229 for_each_sg(sglist, sg, nelems, i) {
2230 addr = SG_ENT_VIRT_ADDRESS(sg);
2231 addr = (void *)virt_to_phys(addr);
2232 size = aligned_size((u64)addr, sg->length);
2233 ret = domain_page_mapping(domain, start_addr + offset,
2234 ((u64)addr) & PAGE_MASK_4K,
2237 /* clear the page */
2238 dma_pte_clear_range(domain, start_addr,
2239 start_addr + offset);
2240 /* free page tables */
2241 dma_pte_free_pagetable(domain, start_addr,
2242 start_addr + offset);
2244 __free_iova(&domain->iovad, iova);
2247 sg->dma_address = start_addr + offset +
2248 ((u64)addr & (~PAGE_MASK_4K));
2249 sg->dma_length = sg->length;
2253 /* it's a non-present to present mapping */
2254 if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2255 start_addr, offset >> PAGE_SHIFT_4K, 1))
2256 iommu_flush_write_buffer(domain->iommu);
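/*
 * All scatterlist entries are packed back to back into a single contiguous
 * iova allocation; each sg->dma_address is the region start plus the running
 * offset plus the entry's offset within its first page.
 */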
2260 static struct dma_mapping_ops intel_dma_ops = {
2261 .alloc_coherent = intel_alloc_coherent,
2262 .free_coherent = intel_free_coherent,
2263 .map_single = intel_map_single,
2264 .unmap_single = intel_unmap_single,
2265 .map_sg = intel_map_sg,
2266 .unmap_sg = intel_unmap_sg,
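/*
 * These dma_mapping_ops are installed as the global dma_ops at the end of
 * intel_iommu_init(), so all PCI DMA mapping requests are routed through the
 * IOMMU paths above.
 */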
2269 static inline int iommu_domain_cache_init(void)
2273 iommu_domain_cache = kmem_cache_create("iommu_domain",
2274 sizeof(struct dmar_domain),
2279 if (!iommu_domain_cache) {
2280 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2287 static inline int iommu_devinfo_cache_init(void)
2291 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2292 sizeof(struct device_domain_info),
2297 if (!iommu_devinfo_cache) {
2298 printk(KERN_ERR "Couldn't create devinfo cache\n");
2305 static inline int iommu_iova_cache_init(void)
2309 iommu_iova_cache = kmem_cache_create("iommu_iova",
2310 sizeof(struct iova),
2315 if (!iommu_iova_cache) {
2316 printk(KERN_ERR "Couldn't create iova cache\n");
2323 static int __init iommu_init_mempool(void)
2326 ret = iommu_iova_cache_init();
2330 ret = iommu_domain_cache_init();
2334 ret = iommu_devinfo_cache_init();
2338 kmem_cache_destroy(iommu_domain_cache);
2340 kmem_cache_destroy(iommu_iova_cache);
2345 static void __init iommu_exit_mempool(void)
2347 kmem_cache_destroy(iommu_devinfo_cache);
2348 kmem_cache_destroy(iommu_domain_cache);
2349 kmem_cache_destroy(iommu_iova_cache);
2353 static int blacklist_iommu(const struct dmi_system_id *id)
2355 printk(KERN_INFO "%s detected; disabling IOMMU\n",
2361 static struct dmi_system_id __initdata intel_iommu_dmi_table[] = {
2362 { /* Some DG33BU BIOS revisions advertised non-existent VT-d */
2363 .callback = blacklist_iommu,
2364 .ident = "Intel DG33BU",
2365 { DMI_MATCH(DMI_BOARD_VENDOR, "Intel Corporation"),
2366 DMI_MATCH(DMI_BOARD_NAME, "DG33BU"),
2373 void __init detect_intel_iommu(void)
2375 if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2377 if (early_dmar_detect()) {
2378 dmi_check_system(intel_iommu_dmi_table);
2385 static void __init init_no_remapping_devices(void)
2387 struct dmar_drhd_unit *drhd;
2389 for_each_drhd_unit(drhd) {
2390 if (!drhd->include_all) {
2392 for (i = 0; i < drhd->devices_cnt; i++)
2393 if (drhd->devices[i] != NULL)
2395 /* ignore DMAR unit if no pci devices exist */
2396 if (i == drhd->devices_cnt)
2404 for_each_drhd_unit(drhd) {
2406 if (drhd->ignored || drhd->include_all)
2409 for (i = 0; i < drhd->devices_cnt; i++)
2410 if (drhd->devices[i] &&
2411 !IS_GFX_DEVICE(drhd->devices[i]))
2414 if (i < drhd->devices_cnt)
2417 /* bypass IOMMU if it is just for gfx devices */
2419 for (i = 0; i < drhd->devices_cnt; i++) {
2420 if (!drhd->devices[i])
2422 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
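/*
 * Devices tagged with DUMMY_DEVICE_DOMAIN_INFO are skipped by the map and
 * unmap paths and fall back to plain physical addressing, which is how DMAR
 * units that only cover graphics devices are effectively bypassed.
 */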
2427 int __init intel_iommu_init(void)
2431 if (no_iommu || swiotlb || dmar_disabled)
2434 if (dmar_table_init())
2437 iommu_init_mempool();
2438 dmar_init_reserved_ranges();
2440 init_no_remapping_devices();
2444 printk(KERN_ERR "IOMMU: dmar init failed\n");
2445 put_iova_domain(&reserved_iova_list);
2446 iommu_exit_mempool();
2450 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2452 init_timer(&unmap_timer);
2454 dma_ops = &intel_dma_ops;
2458 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
2460 /* Mobile 4 Series Chipset neglects to set RWBF capability,
2462 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
2466 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);