Fix Intel IOMMU write-buffer flushing
[pandora-kernel.git] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  */
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/slab.h>
27 #include <linux/irq.h>
28 #include <linux/interrupt.h>
29 #include <linux/sysdev.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include "iova.h"
37 #include "intel-iommu.h"
38 #include <asm/proto.h> /* force_iommu is declared in this header on x86-64 */
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
41 #include "pci.h"
42
43 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
44 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
45
46 #define IOAPIC_RANGE_START      (0xfee00000)
47 #define IOAPIC_RANGE_END        (0xfeefffff)
48 #define IOVA_START_ADDR         (0x1000)
49
50 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
51
52 #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) /* 10sec */
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
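
/*
 * For example, with the default 48-bit guest address width,
 * DOMAIN_MAX_ADDR(48) == 0x0000ffffffffffff, i.e. the last byte of a
 * 256 TiB DMA address space.
 */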
55
56
57 static void flush_unmaps_timeout(unsigned long data);
58
59 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
60
61 static struct intel_iommu *g_iommus;
62
63 #define HIGH_WATER_MARK 250
64 struct deferred_flush_tables {
65         int next;
66         struct iova *iova[HIGH_WATER_MARK];
67         struct dmar_domain *domain[HIGH_WATER_MARK];
68 };
69
70 static struct deferred_flush_tables *deferred_flush;
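
/*
 * Rough picture of the deferred-unmap machinery declared above (the
 * details live later in this file): unmapped IOVAs are queued per IOMMU
 * in deferred_flush[] instead of being flushed synchronously; unmap_timer
 * runs flush_unmaps_timeout() to drain the queue, while hitting
 * HIGH_WATER_MARK or booting with "intel_iommu=strict" forces an
 * immediate flush instead.
 */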
71
72 /* number of IOMMUs in the system */
73 static int g_num_of_iommus;
74
75 static int rwbf_quirk;
76
77 static DEFINE_SPINLOCK(async_umap_flush_lock);
78 static LIST_HEAD(unmaps_to_do);
79
80 static int timer_on;
81 static long list_size;
82
83 static void domain_remove_dev_info(struct dmar_domain *domain);
84
85 static int dmar_disabled;
86 static int __initdata dmar_map_gfx = 1;
87 static int dmar_forcedac;
88 static int intel_iommu_strict;
89
90 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
91 static DEFINE_SPINLOCK(device_domain_lock);
92 static LIST_HEAD(device_domain_list);
93
94 static int __init intel_iommu_setup(char *str)
95 {
96         if (!str)
97                 return -EINVAL;
98         while (*str) {
99                 if (!strncmp(str, "off", 3)) {
100                         dmar_disabled = 1;
101                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
102                 } else if (!strncmp(str, "igfx_off", 8)) {
103                         dmar_map_gfx = 0;
104                         printk(KERN_INFO
105                                 "Intel-IOMMU: disable GFX device mapping\n");
106                 } else if (!strncmp(str, "forcedac", 8)) {
107                         printk(KERN_INFO
108                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
109                         dmar_forcedac = 1;
110                 } else if (!strncmp(str, "strict", 6)) {
111                         printk(KERN_INFO
112                                 "Intel-IOMMU: disable batched IOTLB flush\n");
113                         intel_iommu_strict = 1;
114                 }
115
116                 str += strcspn(str, ",");
117                 while (*str == ',')
118                         str++;
119         }
120         return 0;
121 }
122 __setup("intel_iommu=", intel_iommu_setup);
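
/*
 * Example (illustrative): booting with "intel_iommu=igfx_off,strict"
 * leaves the IOMMU enabled but skips mapping of graphics devices and
 * disables the batched IOTLB flush; unrecognized tokens are silently
 * skipped, and "intel_iommu=off" disables the driver entirely.
 */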
123
124 static struct kmem_cache *iommu_domain_cache;
125 static struct kmem_cache *iommu_devinfo_cache;
126 static struct kmem_cache *iommu_iova_cache;
127
128 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
129 {
130         unsigned int flags;
131         void *vaddr;
132
133         /* trying to avoid low memory issues */
134         flags = current->flags & PF_MEMALLOC;
135         current->flags |= PF_MEMALLOC;
136         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
137         current->flags &= (~PF_MEMALLOC | flags);
138         return vaddr;
139 }
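
/*
 * Note on the PF_MEMALLOC dance above (and in alloc_pgtable_page() below):
 * "flags" latches whether the caller already ran with PF_MEMALLOC.  The
 * mask (~PF_MEMALLOC | flags) therefore clears the bit only if it was
 * clear on entry, restoring the caller's original state either way.
 */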
140
141
142 static inline void *alloc_pgtable_page(void)
143 {
144         unsigned int flags;
145         void *vaddr;
146
147         /* trying to avoid low memory issues */
148         flags = current->flags & PF_MEMALLOC;
149         current->flags |= PF_MEMALLOC;
150         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
151         current->flags &= (~PF_MEMALLOC | flags);
152         return vaddr;
153 }
154
155 static inline void free_pgtable_page(void *vaddr)
156 {
157         free_page((unsigned long)vaddr);
158 }
159
160 static inline void *alloc_domain_mem(void)
161 {
162         return iommu_kmem_cache_alloc(iommu_domain_cache);
163 }
164
165 static inline void free_domain_mem(void *vaddr)
166 {
167         kmem_cache_free(iommu_domain_cache, vaddr);
168 }
169
170 static inline void * alloc_devinfo_mem(void)
171 {
172         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
173 }
174
175 static inline void free_devinfo_mem(void *vaddr)
176 {
177         kmem_cache_free(iommu_devinfo_cache, vaddr);
178 }
179
180 struct iova *alloc_iova_mem(void)
181 {
182         return iommu_kmem_cache_alloc(iommu_iova_cache);
183 }
184
185 void free_iova_mem(struct iova *iova)
186 {
187         kmem_cache_free(iommu_iova_cache, iova);
188 }
189
190 static inline void __iommu_flush_cache(
191         struct intel_iommu *iommu, void *addr, int size)
192 {
193         if (!ecap_coherent(iommu->ecap))
194                 clflush_cache_range(addr, size);
195 }
196
197 /* Gets context entry for a given bus and devfn */
198 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
199                 u8 bus, u8 devfn)
200 {
201         struct root_entry *root;
202         struct context_entry *context;
203         unsigned long phy_addr;
204         unsigned long flags;
205
206         spin_lock_irqsave(&iommu->lock, flags);
207         root = &iommu->root_entry[bus];
208         context = get_context_addr_from_root(root);
209         if (!context) {
210                 context = (struct context_entry *)alloc_pgtable_page();
211                 if (!context) {
212                         spin_unlock_irqrestore(&iommu->lock, flags);
213                         return NULL;
214                 }
215                 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
216                 phy_addr = virt_to_phys((void *)context);
217                 set_root_value(root, phy_addr);
218                 set_root_present(root);
219                 __iommu_flush_cache(iommu, root, sizeof(*root));
220         }
221         spin_unlock_irqrestore(&iommu->lock, flags);
222         return &context[devfn];
223 }
224
225 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
226 {
227         struct root_entry *root;
228         struct context_entry *context;
229         int ret;
230         unsigned long flags;
231
232         spin_lock_irqsave(&iommu->lock, flags);
233         root = &iommu->root_entry[bus];
234         context = get_context_addr_from_root(root);
235         if (!context) {
236                 ret = 0;
237                 goto out;
238         }
239         ret = context_present(context[devfn]);
240 out:
241         spin_unlock_irqrestore(&iommu->lock, flags);
242         return ret;
243 }
244
245 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
246 {
247         struct root_entry *root;
248         struct context_entry *context;
249         unsigned long flags;
250
251         spin_lock_irqsave(&iommu->lock, flags);
252         root = &iommu->root_entry[bus];
253         context = get_context_addr_from_root(root);
254         if (context) {
255                 context_clear_entry(context[devfn]);
256                 __iommu_flush_cache(iommu, &context[devfn],
257                         sizeof(*context));
258         }
259         spin_unlock_irqrestore(&iommu->lock, flags);
260 }
261
262 static void free_context_table(struct intel_iommu *iommu)
263 {
264         struct root_entry *root;
265         int i;
266         unsigned long flags;
267         struct context_entry *context;
268
269         spin_lock_irqsave(&iommu->lock, flags);
270         if (!iommu->root_entry) {
271                 goto out;
272         }
273         for (i = 0; i < ROOT_ENTRY_NR; i++) {
274                 root = &iommu->root_entry[i];
275                 context = get_context_addr_from_root(root);
276                 if (context)
277                         free_pgtable_page(context);
278         }
279         free_pgtable_page(iommu->root_entry);
280         iommu->root_entry = NULL;
281 out:
282         spin_unlock_irqrestore(&iommu->lock, flags);
283 }
284
285 /* page table handling */
286 #define LEVEL_STRIDE            (9)
287 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
288
289 static inline int agaw_to_level(int agaw)
290 {
291         return agaw + 2;
292 }
293
294 static inline int agaw_to_width(int agaw)
295 {
296         return 30 + agaw * LEVEL_STRIDE;
297
298 }
299
300 static inline int width_to_agaw(int width)
301 {
302         return (width - 30) / LEVEL_STRIDE;
303 }
304
305 static inline unsigned int level_to_offset_bits(int level)
306 {
307         return (12 + (level - 1) * LEVEL_STRIDE);
308 }
309
310 static inline int address_level_offset(u64 addr, int level)
311 {
312         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
313 }
314
315 static inline u64 level_mask(int level)
316 {
317         return ((u64)-1 << level_to_offset_bits(level));
318 }
319
320 static inline u64 level_size(int level)
321 {
322         return ((u64)1 << level_to_offset_bits(level));
323 }
324
325 static inline u64 align_to_level(u64 addr, int level)
326 {
327         return ((addr + level_size(level) - 1) & level_mask(level));
328 }
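
/*
 * Worked example for the helpers above, assuming the default 48-bit
 * width: width_to_agaw(48) = (48 - 30) / 9 = 2, agaw_to_level(2) = 4,
 * and level_to_offset_bits() yields 39/30/21/12 for levels 4..1, so a
 * 48-bit IOVA decodes into four 9-bit table indices above the 4K page
 * offset; align_to_level(addr, 2) rounds addr up to a 2 MB boundary.
 */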
329
330 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
331 {
332         int addr_width = agaw_to_width(domain->agaw);
333         struct dma_pte *parent, *pte = NULL;
334         int level = agaw_to_level(domain->agaw);
335         int offset;
336         unsigned long flags;
337
338         BUG_ON(!domain->pgd);
339
340         addr &= (((u64)1) << addr_width) - 1;
341         parent = domain->pgd;
342
343         spin_lock_irqsave(&domain->mapping_lock, flags);
344         while (level > 0) {
345                 void *tmp_page;
346
347                 offset = address_level_offset(addr, level);
348                 pte = &parent[offset];
349                 if (level == 1)
350                         break;
351
352                 if (!dma_pte_present(*pte)) {
353                         tmp_page = alloc_pgtable_page();
354
355                         if (!tmp_page) {
356                                 spin_unlock_irqrestore(&domain->mapping_lock,
357                                         flags);
358                                 return NULL;
359                         }
360                         __iommu_flush_cache(domain->iommu, tmp_page,
361                                         PAGE_SIZE_4K);
362                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
363                         /*
364                          * high level table always sets r/w, last level page
365                          * table control read/write
366                          */
367                         dma_set_pte_readable(*pte);
368                         dma_set_pte_writable(*pte);
369                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
370                 }
371                 parent = phys_to_virt(dma_pte_addr(*pte));
372                 level--;
373         }
374
375         spin_unlock_irqrestore(&domain->mapping_lock, flags);
376         return pte;
377 }
378
379 /* return address's pte at specific level */
380 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
381                 int level)
382 {
383         struct dma_pte *parent, *pte = NULL;
384         int total = agaw_to_level(domain->agaw);
385         int offset;
386
387         parent = domain->pgd;
388         while (level <= total) {
389                 offset = address_level_offset(addr, total);
390                 pte = &parent[offset];
391                 if (level == total)
392                         return pte;
393
394                 if (!dma_pte_present(*pte))
395                         break;
396                 parent = phys_to_virt(dma_pte_addr(*pte));
397                 total--;
398         }
399         return NULL;
400 }
401
402 /* clear one page's page table */
403 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
404 {
405         struct dma_pte *pte = NULL;
406
407         /* get last level pte */
408         pte = dma_addr_level_pte(domain, addr, 1);
409
410         if (pte) {
411                 dma_clear_pte(*pte);
412                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
413         }
414 }
415
416 /* clear last level pte, a tlb flush should be followed */
417 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
418 {
419         int addr_width = agaw_to_width(domain->agaw);
420
421         start &= (((u64)1) << addr_width) - 1;
422         end &= (((u64)1) << addr_width) - 1;
423         /* in case it's partial page */
424         start = PAGE_ALIGN_4K(start);
425         end &= PAGE_MASK_4K;
426
427         /* we don't need lock here, nobody else touches the iova range */
428         while (start < end) {
429                 dma_pte_clear_one(domain, start);
430                 start += PAGE_SIZE_4K;
431         }
432 }
433
434 /* free page table pages. last level pte should already be cleared */
435 static void dma_pte_free_pagetable(struct dmar_domain *domain,
436         u64 start, u64 end)
437 {
438         int addr_width = agaw_to_width(domain->agaw);
439         struct dma_pte *pte;
440         int total = agaw_to_level(domain->agaw);
441         int level;
442         u64 tmp;
443
444         start &= (((u64)1) << addr_width) - 1;
445         end &= (((u64)1) << addr_width) - 1;
446
447         /* we don't need lock here, nobody else touches the iova range */
448         level = 2;
449         while (level <= total) {
450                 tmp = align_to_level(start, level);
451                 if (tmp >= end || (tmp + level_size(level) > end))
452                         return;
453
454                 while (tmp < end) {
455                         pte = dma_addr_level_pte(domain, tmp, level);
456                         if (pte) {
457                                 free_pgtable_page(
458                                         phys_to_virt(dma_pte_addr(*pte)));
459                                 dma_clear_pte(*pte);
460                                 __iommu_flush_cache(domain->iommu,
461                                                 pte, sizeof(*pte));
462                         }
463                         tmp += level_size(level);
464                 }
465                 level++;
466         }
467         /* free pgd */
468         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
469                 free_pgtable_page(domain->pgd);
470                 domain->pgd = NULL;
471         }
472 }
473
474 /* iommu handling */
475 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
476 {
477         struct root_entry *root;
478         unsigned long flags;
479
480         root = (struct root_entry *)alloc_pgtable_page();
481         if (!root)
482                 return -ENOMEM;
483
484         __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
485
486         spin_lock_irqsave(&iommu->lock, flags);
487         iommu->root_entry = root;
488         spin_unlock_irqrestore(&iommu->lock, flags);
489
490         return 0;
491 }
492
493 #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
494 {\
495         cycles_t start_time = get_cycles();\
496         while (1) {\
497                 sts = op (iommu->reg + offset);\
498                 if (cond)\
499                         break;\
500                 if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
501                         panic("DMAR hardware is malfunctioning\n");\
502                 cpu_relax();\
503         }\
504 }
505
506 static void iommu_set_root_entry(struct intel_iommu *iommu)
507 {
508         void *addr;
509         u32 cmd, sts;
510         unsigned long flag;
511
512         addr = iommu->root_entry;
513
514         spin_lock_irqsave(&iommu->register_lock, flag);
515         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
516
517         cmd = iommu->gcmd | DMA_GCMD_SRTP;
518         writel(cmd, iommu->reg + DMAR_GCMD_REG);
519
520         /* Make sure hardware completes it */
521         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
522                 readl, (sts & DMA_GSTS_RTPS), sts);
523
524         spin_unlock_irqrestore(&iommu->register_lock, flag);
525 }
526
527 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
528 {
529         u32 val;
530         unsigned long flag;
531
532         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
533                 return;
534         val = iommu->gcmd | DMA_GCMD_WBF;
535
536         spin_lock_irqsave(&iommu->register_lock, flag);
537         writel(val, iommu->reg + DMAR_GCMD_REG);
538
539         /* Make sure hardware completes it */
540         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
541                         readl, (!(val & DMA_GSTS_WBFS)), val);
542
543         spin_unlock_irqrestore(&iommu->register_lock, flag);
544 }
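
/*
 * This is the heart of the write-buffer flushing fix: the flush is now
 * issued when either the hardware advertises the RWBF capability or
 * rwbf_quirk overrides it for chipsets that need the flush but fail to
 * report the capability.  A minimal sketch of how such an override could
 * be wired up with a PCI header fixup (the device ID below is purely a
 * placeholder, not taken from this file):
 *
 *     static void quirk_iommu_rwbf(struct pci_dev *dev)
 *     {
 *             printk(KERN_INFO "DMAR: Forcing write-buffer flush\n");
 *             rwbf_quirk = 1;
 *     }
 *     DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0000, quirk_iommu_rwbf);
 */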
545
546 /* return value determines whether we need a write buffer flush */
547 static int __iommu_flush_context(struct intel_iommu *iommu,
548         u16 did, u16 source_id, u8 function_mask, u64 type,
549         int non_present_entry_flush)
550 {
551         u64 val = 0;
552         unsigned long flag;
553
554         /*
555          * In the non-present entry flush case: if the hardware doesn't
556          * cache non-present entries we do nothing; if it does cache them,
557          * we flush the entries of domain 0 (the domain id used to cache
558          * any non-present entries).
559          */
560         if (non_present_entry_flush) {
561                 if (!cap_caching_mode(iommu->cap))
562                         return 1;
563                 else
564                         did = 0;
565         }
566
567         switch (type) {
568         case DMA_CCMD_GLOBAL_INVL:
569                 val = DMA_CCMD_GLOBAL_INVL;
570                 break;
571         case DMA_CCMD_DOMAIN_INVL:
572                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
573                 break;
574         case DMA_CCMD_DEVICE_INVL:
575                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
576                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
577                 break;
578         default:
579                 BUG();
580         }
581         val |= DMA_CCMD_ICC;
582
583         spin_lock_irqsave(&iommu->register_lock, flag);
584         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
585
586         /* Make sure hardware completes it */
587         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
588                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
589
590         spin_unlock_irqrestore(&iommu->register_lock, flag);
591
592         /* flushing the context entry implicitly flushes the write buffer */
593         return 0;
594 }
595
596 static inline int iommu_flush_context_global(struct intel_iommu *iommu,
597         int non_present_entry_flush)
598 {
599         return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
600                 non_present_entry_flush);
601 }
602
603 static inline int iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
604         int non_present_entry_flush)
605 {
606         return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
607                 non_present_entry_flush);
608 }
609
610 static inline int iommu_flush_context_device(struct intel_iommu *iommu,
611         u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
612 {
613         return __iommu_flush_context(iommu, did, source_id, function_mask,
614                 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
615 }
616
617 /* return value determines whether we need a write buffer flush */
618 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
619         u64 addr, unsigned int size_order, u64 type,
620         int non_present_entry_flush)
621 {
622         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
623         u64 val = 0, val_iva = 0;
624         unsigned long flag;
625
626         /*
627          * In the non-present entry flush case: if the hardware doesn't
628          * cache non-present entries we do nothing; if it does cache them,
629          * we flush the entries of domain 0 (the domain id used to cache
630          * any non-present entries).
631          */
632         if (non_present_entry_flush) {
633                 if (!cap_caching_mode(iommu->cap))
634                         return 1;
635                 else
636                         did = 0;
637         }
638
639         switch (type) {
640         case DMA_TLB_GLOBAL_FLUSH:
641                 /* a global flush doesn't need to set IVA_REG */
642                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
643                 break;
644         case DMA_TLB_DSI_FLUSH:
645                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
646                 break;
647         case DMA_TLB_PSI_FLUSH:
648                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
649                 /* Note: always flush non-leaf currently */
650                 val_iva = size_order | addr;
651                 break;
652         default:
653                 BUG();
654         }
655         /* Note: set drain read/write */
656 #if 0
657         /*
658          * Read drain is probably only needed to be extra safe; it looks
659          * like we can skip it without any impact.
660          */
661         if (cap_read_drain(iommu->cap))
662                 val |= DMA_TLB_READ_DRAIN;
663 #endif
664         if (cap_write_drain(iommu->cap))
665                 val |= DMA_TLB_WRITE_DRAIN;
666
667         spin_lock_irqsave(&iommu->register_lock, flag);
668         /* Note: Only uses first TLB reg currently */
669         if (val_iva)
670                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
671         dmar_writeq(iommu->reg + tlb_offset + 8, val);
672
673         /* Make sure hardware completes it */
674         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
675                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
676
677         spin_unlock_irqrestore(&iommu->register_lock, flag);
678
679         /* check IOTLB invalidation granularity */
680         if (DMA_TLB_IAIG(val) == 0)
681                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
682         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
683                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
684                         DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
685         /* flushing the IOTLB implicitly flushes the write buffer */
686         return 0;
687 }
688
689 static inline int iommu_flush_iotlb_global(struct intel_iommu *iommu,
690         int non_present_entry_flush)
691 {
692         return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
693                 non_present_entry_flush);
694 }
695
696 static inline int iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
697         int non_present_entry_flush)
698 {
699         return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
700                 non_present_entry_flush);
701 }
702
703 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
704         u64 addr, unsigned int pages, int non_present_entry_flush)
705 {
706         unsigned int mask;
707
708         BUG_ON(addr & (~PAGE_MASK_4K));
709         BUG_ON(pages == 0);
710
711         /* Fallback to domain selective flush if no PSI support */
712         if (!cap_pgsel_inv(iommu->cap))
713                 return iommu_flush_iotlb_dsi(iommu, did,
714                         non_present_entry_flush);
715
716         /*
717          * PSI requires the region size to be 2 ^ x pages, with the base
718          * address naturally aligned to that size
719          */
720         mask = ilog2(__roundup_pow_of_two(pages));
721         /* Fallback to domain selective flush if size is too big */
722         if (mask > cap_max_amask_val(iommu->cap))
723                 return iommu_flush_iotlb_dsi(iommu, did,
724                         non_present_entry_flush);
725
726         return __iommu_flush_iotlb(iommu, did, addr, mask,
727                 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
728 }
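
/*
 * Example of the mask math above: invalidating 5 pages gives
 * mask = ilog2(__roundup_pow_of_two(5)) = ilog2(8) = 3, i.e. an
 * 8-page (32 KB) naturally aligned invalidation, which is what the
 * hardware's address-mask encoding expects.
 */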
729
730 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
731 {
732         u32 pmen;
733         unsigned long flags;
734
735         spin_lock_irqsave(&iommu->register_lock, flags);
736         pmen = readl(iommu->reg + DMAR_PMEN_REG);
737         pmen &= ~DMA_PMEN_EPM;
738         writel(pmen, iommu->reg + DMAR_PMEN_REG);
739
740         /* wait for the protected region status bit to clear */
741         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
742                 readl, !(pmen & DMA_PMEN_PRS), pmen);
743
744         spin_unlock_irqrestore(&iommu->register_lock, flags);
745 }
746
747 static int iommu_enable_translation(struct intel_iommu *iommu)
748 {
749         u32 sts;
750         unsigned long flags;
751
752         spin_lock_irqsave(&iommu->register_lock, flags);
753         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
754
755         /* Make sure hardware completes it */
756         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
757                 readl, (sts & DMA_GSTS_TES), sts);
758
759         iommu->gcmd |= DMA_GCMD_TE;
760         spin_unlock_irqrestore(&iommu->register_lock, flags);
761         return 0;
762 }
763
764 static int iommu_disable_translation(struct intel_iommu *iommu)
765 {
766         u32 sts;
767         unsigned long flag;
768
769         spin_lock_irqsave(&iommu->register_lock, flag);
770         iommu->gcmd &= ~DMA_GCMD_TE;
771         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
772
773         /* Make sure hardware completes it */
774         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
775                 readl, (!(sts & DMA_GSTS_TES)), sts);
776
777         spin_unlock_irqrestore(&iommu->register_lock, flag);
778         return 0;
779 }
780
781 /* iommu interrupt handling. Most of it is MSI-like. */
782
783 static const char *fault_reason_strings[] =
784 {
785         "Software",
786         "Present bit in root entry is clear",
787         "Present bit in context entry is clear",
788         "Invalid context entry",
789         "Access beyond MGAW",
790         "PTE Write access is not set",
791         "PTE Read access is not set",
792         "Next page table ptr is invalid",
793         "Root table address invalid",
794         "Context table ptr is invalid",
795         "non-zero reserved fields in RTP",
796         "non-zero reserved fields in CTP",
797         "non-zero reserved fields in PTE",
798 };
799 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
800
801 const char *dmar_get_fault_reason(u8 fault_reason)
802 {
803         if (fault_reason > MAX_FAULT_REASON_IDX)
804                 return "Unknown";
805         else
806                 return fault_reason_strings[fault_reason];
807 }
808
809 void dmar_msi_unmask(unsigned int irq)
810 {
811         struct intel_iommu *iommu = get_irq_data(irq);
812         unsigned long flag;
813
814         /* unmask it */
815         spin_lock_irqsave(&iommu->register_lock, flag);
816         writel(0, iommu->reg + DMAR_FECTL_REG);
817         /* Read the register back to flush the posted write */
818         readl(iommu->reg + DMAR_FECTL_REG);
819         spin_unlock_irqrestore(&iommu->register_lock, flag);
820 }
821
822 void dmar_msi_mask(unsigned int irq)
823 {
824         unsigned long flag;
825         struct intel_iommu *iommu = get_irq_data(irq);
826
827         /* mask it */
828         spin_lock_irqsave(&iommu->register_lock, flag);
829         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
830         /* Read the register back to flush the posted write */
831         readl(iommu->reg + DMAR_FECTL_REG);
832         spin_unlock_irqrestore(&iommu->register_lock, flag);
833 }
834
835 void dmar_msi_write(int irq, struct msi_msg *msg)
836 {
837         struct intel_iommu *iommu = get_irq_data(irq);
838         unsigned long flag;
839
840         spin_lock_irqsave(&iommu->register_lock, flag);
841         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
842         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
843         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
844         spin_unlock_irqrestore(&iommu->register_lock, flag);
845 }
846
847 void dmar_msi_read(int irq, struct msi_msg *msg)
848 {
849         struct intel_iommu *iommu = get_irq_data(irq);
850         unsigned long flag;
851
852         spin_lock_irqsave(&iommu->register_lock, flag);
853         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
854         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
855         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
856         spin_unlock_irqrestore(&iommu->register_lock, flag);
857 }
858
859 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
860                 u8 fault_reason, u16 source_id, u64 addr)
861 {
862         const char *reason;
863
864         reason = dmar_get_fault_reason(fault_reason);
865
866         printk(KERN_ERR
867                 "DMAR:[%s] Request device [%02x:%02x.%d] "
868                 "fault addr %llx\n"
869                 "DMAR:[fault reason %02d] %s\n",
870                 (type ? "DMA Read" : "DMA Write"),
871                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
872                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
873         return 0;
874 }
875
876 #define PRIMARY_FAULT_REG_LEN (16)
877 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
878 {
879         struct intel_iommu *iommu = dev_id;
880         int reg, fault_index;
881         u32 fault_status;
882         unsigned long flag;
883
884         spin_lock_irqsave(&iommu->register_lock, flag);
885         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
886
887         /* TBD: ignore advanced fault log currently */
888         if (!(fault_status & DMA_FSTS_PPF))
889                 goto clear_overflow;
890
891         fault_index = dma_fsts_fault_record_index(fault_status);
892         reg = cap_fault_reg_offset(iommu->cap);
893         while (1) {
894                 u8 fault_reason;
895                 u16 source_id;
896                 u64 guest_addr;
897                 int type;
898                 u32 data;
899
900                 /* highest 32 bits */
901                 data = readl(iommu->reg + reg +
902                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
903                 if (!(data & DMA_FRCD_F))
904                         break;
905
906                 fault_reason = dma_frcd_fault_reason(data);
907                 type = dma_frcd_type(data);
908
909                 data = readl(iommu->reg + reg +
910                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
911                 source_id = dma_frcd_source_id(data);
912
913                 guest_addr = dmar_readq(iommu->reg + reg +
914                                 fault_index * PRIMARY_FAULT_REG_LEN);
915                 guest_addr = dma_frcd_page_addr(guest_addr);
916                 /* clear the fault */
917                 writel(DMA_FRCD_F, iommu->reg + reg +
918                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
919
920                 spin_unlock_irqrestore(&iommu->register_lock, flag);
921
922                 iommu_page_fault_do_one(iommu, type, fault_reason,
923                                 source_id, guest_addr);
924
925                 fault_index++;
926                 if (fault_index > cap_num_fault_regs(iommu->cap))
927                         fault_index = 0;
928                 spin_lock_irqsave(&iommu->register_lock, flag);
929         }
930 clear_overflow:
931         /* clear primary fault overflow */
932         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
933         if (fault_status & DMA_FSTS_PFO)
934                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
935
936         spin_unlock_irqrestore(&iommu->register_lock, flag);
937         return IRQ_HANDLED;
938 }
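
/*
 * Layout assumed by the fault walk above: each primary fault record is
 * PRIMARY_FAULT_REG_LEN (16) bytes -- the faulting page address in the
 * low quadword, the source-id in the dword at offset 8, and the fault
 * reason, request type and Fault bit packed into the dword at offset 12,
 * which is also where DMA_FRCD_F is written back to clear the record.
 */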
939
940 int dmar_set_interrupt(struct intel_iommu *iommu)
941 {
942         int irq, ret;
943
944         irq = create_irq();
945         if (!irq) {
946                 printk(KERN_ERR "IOMMU: no free vectors\n");
947                 return -EINVAL;
948         }
949
950         set_irq_data(irq, iommu);
951         iommu->irq = irq;
952
953         ret = arch_setup_dmar_msi(irq);
954         if (ret) {
955                 set_irq_data(irq, NULL);
956                 iommu->irq = 0;
957                 destroy_irq(irq);
958                 return ret;
959         }
960
961         /* Clear any pending faults before requesting the irq */
962         iommu_page_fault(irq, iommu);
963
964         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
965         if (ret)
966                 printk(KERN_ERR "IOMMU: can't request irq\n");
967         return ret;
968 }
969
970 static int iommu_init_domains(struct intel_iommu *iommu)
971 {
972         unsigned long ndomains;
973         unsigned long nlongs;
974
975         ndomains = cap_ndoms(iommu->cap);
976         pr_debug("Number of Domains supported <%ld>\n", ndomains);
977         nlongs = BITS_TO_LONGS(ndomains);
978
979         /* TBD: there might be 64K domains,
980          * consider other allocation for future chips
981          */
982         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
983         if (!iommu->domain_ids) {
984                 printk(KERN_ERR "Allocating domain id array failed\n");
985                 return -ENOMEM;
986         }
987         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
988                         GFP_KERNEL);
989         if (!iommu->domains) {
990                 printk(KERN_ERR "Allocating domain array failed\n");
991                 kfree(iommu->domain_ids);
992                 return -ENOMEM;
993         }
994
995         /*
996          * if Caching mode is set, then invalid translations are tagged
997          * with domainid 0. Hence we need to pre-allocate it.
998          */
999         if (cap_caching_mode(iommu->cap))
1000                 set_bit(0, iommu->domain_ids);
1001         return 0;
1002 }
1003 static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
1004                                         struct dmar_drhd_unit *drhd)
1005 {
1006         int ret;
1007         int map_size;
1008         u32 ver;
1009
1010         iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
1011         if (!iommu->reg) {
1012                 printk(KERN_ERR "IOMMU: can't map the region\n");
1013                 goto error;
1014         }
1015         iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
1016         iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
1017
1018         /* the registers might be more than one page */
1019         map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
1020                 cap_max_fault_reg_offset(iommu->cap));
1021         map_size = PAGE_ALIGN_4K(map_size);
1022         if (map_size > PAGE_SIZE_4K) {
1023                 iounmap(iommu->reg);
1024                 iommu->reg = ioremap(drhd->reg_base_addr, map_size);
1025                 if (!iommu->reg) {
1026                         printk(KERN_ERR "IOMMU: can't map the region\n");
1027                         goto error;
1028                 }
1029         }
1030
1031         ver = readl(iommu->reg + DMAR_VER_REG);
1032         pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
1033                 drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
1034                 iommu->cap, iommu->ecap);
1035         ret = iommu_init_domains(iommu);
1036         if (ret)
1037                 goto error_unmap;
1038         spin_lock_init(&iommu->lock);
1039         spin_lock_init(&iommu->register_lock);
1040
1041         drhd->iommu = iommu;
1042         return iommu;
1043 error_unmap:
1044         iounmap(iommu->reg);
1045 error:
1046         kfree(iommu);
1047         return NULL;
1048 }
1049
1050 static void domain_exit(struct dmar_domain *domain);
1051 static void free_iommu(struct intel_iommu *iommu)
1052 {
1053         struct dmar_domain *domain;
1054         int i;
1055
1056         if (!iommu)
1057                 return;
1058
1059         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1060         for (; i < cap_ndoms(iommu->cap); ) {
1061                 domain = iommu->domains[i];
1062                 clear_bit(i, iommu->domain_ids);
1063                 domain_exit(domain);
1064                 i = find_next_bit(iommu->domain_ids,
1065                         cap_ndoms(iommu->cap), i+1);
1066         }
1067
1068         if (iommu->gcmd & DMA_GCMD_TE)
1069                 iommu_disable_translation(iommu);
1070
1071         if (iommu->irq) {
1072                 set_irq_data(iommu->irq, NULL);
1073                 /* This will mask the irq */
1074                 free_irq(iommu->irq, iommu);
1075                 destroy_irq(iommu->irq);
1076         }
1077
1078         kfree(iommu->domains);
1079         kfree(iommu->domain_ids);
1080
1081         /* free context mapping */
1082         free_context_table(iommu);
1083
1084         if (iommu->reg)
1085                 iounmap(iommu->reg);
1086         kfree(iommu);
1087 }
1088
1089 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1090 {
1091         unsigned long num;
1092         unsigned long ndomains;
1093         struct dmar_domain *domain;
1094         unsigned long flags;
1095
1096         domain = alloc_domain_mem();
1097         if (!domain)
1098                 return NULL;
1099
1100         ndomains = cap_ndoms(iommu->cap);
1101
1102         spin_lock_irqsave(&iommu->lock, flags);
1103         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1104         if (num >= ndomains) {
1105                 spin_unlock_irqrestore(&iommu->lock, flags);
1106                 free_domain_mem(domain);
1107                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1108                 return NULL;
1109         }
1110
1111         set_bit(num, iommu->domain_ids);
1112         domain->id = num;
1113         domain->iommu = iommu;
1114         iommu->domains[num] = domain;
1115         spin_unlock_irqrestore(&iommu->lock, flags);
1116
1117         return domain;
1118 }
1119
1120 static void iommu_free_domain(struct dmar_domain *domain)
1121 {
1122         unsigned long flags;
1123
1124         spin_lock_irqsave(&domain->iommu->lock, flags);
1125         clear_bit(domain->id, domain->iommu->domain_ids);
1126         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1127 }
1128
1129 static struct iova_domain reserved_iova_list;
1130 static struct lock_class_key reserved_alloc_key;
1131 static struct lock_class_key reserved_rbtree_key;
1132
1133 static void dmar_init_reserved_ranges(void)
1134 {
1135         struct pci_dev *pdev = NULL;
1136         struct iova *iova;
1137         int i;
1138         u64 addr, size;
1139
1140         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1141
1142         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1143                 &reserved_alloc_key);
1144         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1145                 &reserved_rbtree_key);
1146
1147         /* IOAPIC ranges shouldn't be accessed by DMA */
1148         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1149                 IOVA_PFN(IOAPIC_RANGE_END));
1150         if (!iova)
1151                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1152
1153         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1154         for_each_pci_dev(pdev) {
1155                 struct resource *r;
1156
1157                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1158                         r = &pdev->resource[i];
1159                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1160                                 continue;
1161                         addr = r->start;
1162                         addr &= PAGE_MASK_4K;
1163                         size = r->end - addr;
1164                         size = PAGE_ALIGN_4K(size);
1165                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1166                                 IOVA_PFN(size + addr) - 1);
1167                         if (!iova)
1168                                 printk(KERN_ERR "Reserve iova failed\n");
1169                 }
1170         }
1171
1172 }
1173
1174 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1175 {
1176         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1177 }
1178
1179 static inline int guestwidth_to_adjustwidth(int gaw)
1180 {
1181         int agaw;
1182         int r = (gaw - 12) % 9;
1183
1184         if (r == 0)
1185                 agaw = gaw;
1186         else
1187                 agaw = gaw + 9 - r;
1188         if (agaw > 64)
1189                 agaw = 64;
1190         return agaw;
1191 }
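
/*
 * Examples: a guest width of 39 or 48 bits already sits on a page-table
 * level boundary ((gaw - 12) % 9 == 0) and is returned unchanged, while
 * e.g. gaw = 40 gives r = 1 and is rounded up to 48 bits, the next width
 * the multi-level page table can actually represent.
 */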
1192
1193 static int domain_init(struct dmar_domain *domain, int guest_width)
1194 {
1195         struct intel_iommu *iommu;
1196         int adjust_width, agaw;
1197         unsigned long sagaw;
1198
1199         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1200         spin_lock_init(&domain->mapping_lock);
1201
1202         domain_reserve_special_ranges(domain);
1203
1204         /* calculate AGAW */
1205         iommu = domain->iommu;
1206         if (guest_width > cap_mgaw(iommu->cap))
1207                 guest_width = cap_mgaw(iommu->cap);
1208         domain->gaw = guest_width;
1209         adjust_width = guestwidth_to_adjustwidth(guest_width);
1210         agaw = width_to_agaw(adjust_width);
1211         sagaw = cap_sagaw(iommu->cap);
1212         if (!test_bit(agaw, &sagaw)) {
1213                 /* hardware doesn't support it, choose a bigger one */
1214                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1215                 agaw = find_next_bit(&sagaw, 5, agaw);
1216                 if (agaw >= 5)
1217                         return -ENODEV;
1218         }
1219         domain->agaw = agaw;
1220         INIT_LIST_HEAD(&domain->devices);
1221
1222         /* always allocate the top pgd */
1223         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1224         if (!domain->pgd)
1225                 return -ENOMEM;
1226         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1227         return 0;
1228 }
1229
1230 static void domain_exit(struct dmar_domain *domain)
1231 {
1232         u64 end;
1233
1234         /* Domain 0 is reserved, so don't process it */
1235         if (!domain)
1236                 return;
1237
1238         domain_remove_dev_info(domain);
1239         /* destroy iovas */
1240         put_iova_domain(&domain->iovad);
1241         end = DOMAIN_MAX_ADDR(domain->gaw);
1242         end = end & (~PAGE_MASK_4K);
1243
1244         /* clear ptes */
1245         dma_pte_clear_range(domain, 0, end);
1246
1247         /* free page tables */
1248         dma_pte_free_pagetable(domain, 0, end);
1249
1250         iommu_free_domain(domain);
1251         free_domain_mem(domain);
1252 }
1253
1254 static int domain_context_mapping_one(struct dmar_domain *domain,
1255                 u8 bus, u8 devfn)
1256 {
1257         struct context_entry *context;
1258         struct intel_iommu *iommu = domain->iommu;
1259         unsigned long flags;
1260
1261         pr_debug("Set context mapping for %02x:%02x.%d\n",
1262                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1263         BUG_ON(!domain->pgd);
1264         context = device_to_context_entry(iommu, bus, devfn);
1265         if (!context)
1266                 return -ENOMEM;
1267         spin_lock_irqsave(&iommu->lock, flags);
1268         if (context_present(*context)) {
1269                 spin_unlock_irqrestore(&iommu->lock, flags);
1270                 return 0;
1271         }
1272
1273         context_set_domain_id(*context, domain->id);
1274         context_set_address_width(*context, domain->agaw);
1275         context_set_address_root(*context, virt_to_phys(domain->pgd));
1276         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1277         context_set_fault_enable(*context);
1278         context_set_present(*context);
1279         __iommu_flush_cache(iommu, context, sizeof(*context));
1280
1281         /* it's a non-present to present mapping */
1282         if (iommu_flush_context_device(iommu, domain->id,
1283                         (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1284                 iommu_flush_write_buffer(iommu);
1285         else
1286                 iommu_flush_iotlb_dsi(iommu, 0, 0);
1287         spin_unlock_irqrestore(&iommu->lock, flags);
1288         return 0;
1289 }
1290
1291 static int
1292 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1293 {
1294         int ret;
1295         struct pci_dev *tmp, *parent;
1296
1297         ret = domain_context_mapping_one(domain, pdev->bus->number,
1298                 pdev->devfn);
1299         if (ret)
1300                 return ret;
1301
1302         /* dependent device mapping */
1303         tmp = pci_find_upstream_pcie_bridge(pdev);
1304         if (!tmp)
1305                 return 0;
1306         /* Secondary interface's bus number and devfn 0 */
1307         parent = pdev->bus->self;
1308         while (parent != tmp) {
1309                 ret = domain_context_mapping_one(domain, parent->bus->number,
1310                         parent->devfn);
1311                 if (ret)
1312                         return ret;
1313                 parent = parent->bus->self;
1314         }
1315         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1316                 return domain_context_mapping_one(domain,
1317                         tmp->subordinate->number, 0);
1318         else /* this is a legacy PCI bridge */
1319                 return domain_context_mapping_one(domain,
1320                         tmp->bus->number, tmp->devfn);
1321 }
1322
1323 static int domain_context_mapped(struct dmar_domain *domain,
1324         struct pci_dev *pdev)
1325 {
1326         int ret;
1327         struct pci_dev *tmp, *parent;
1328
1329         ret = device_context_mapped(domain->iommu,
1330                 pdev->bus->number, pdev->devfn);
1331         if (!ret)
1332                 return ret;
1333         /* dependent device mapping */
1334         tmp = pci_find_upstream_pcie_bridge(pdev);
1335         if (!tmp)
1336                 return ret;
1337         /* Secondary interface's bus number and devfn 0 */
1338         parent = pdev->bus->self;
1339         while (parent != tmp) {
1340                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1341                         parent->devfn);
1342                 if (!ret)
1343                         return ret;
1344                 parent = parent->bus->self;
1345         }
1346         if (tmp->is_pcie)
1347                 return device_context_mapped(domain->iommu,
1348                         tmp->subordinate->number, 0);
1349         else
1350                 return device_context_mapped(domain->iommu,
1351                         tmp->bus->number, tmp->devfn);
1352 }
1353
1354 static int
1355 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1356                         u64 hpa, size_t size, int prot)
1357 {
1358         u64 start_pfn, end_pfn;
1359         struct dma_pte *pte;
1360         int index;
1361
1362         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1363                 return -EINVAL;
1364         iova &= PAGE_MASK_4K;
1365         start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1366         end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1367         index = 0;
1368         while (start_pfn < end_pfn) {
1369                 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1370                 if (!pte)
1371                         return -ENOMEM;
1372                 /* We don't need lock here, nobody else
1373                  * touches the iova range
1374                  */
1375                 BUG_ON(dma_pte_addr(*pte));
1376                 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1377                 dma_set_pte_prot(*pte, prot);
1378                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1379                 start_pfn++;
1380                 index++;
1381         }
1382         return 0;
1383 }
1384
1385 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1386 {
1387         clear_context_table(domain->iommu, bus, devfn);
1388         iommu_flush_context_global(domain->iommu, 0);
1389         iommu_flush_iotlb_global(domain->iommu, 0);
1390 }
1391
1392 static void domain_remove_dev_info(struct dmar_domain *domain)
1393 {
1394         struct device_domain_info *info;
1395         unsigned long flags;
1396
1397         spin_lock_irqsave(&device_domain_lock, flags);
1398         while (!list_empty(&domain->devices)) {
1399                 info = list_entry(domain->devices.next,
1400                         struct device_domain_info, link);
1401                 list_del(&info->link);
1402                 list_del(&info->global);
1403                 if (info->dev)
1404                         info->dev->dev.archdata.iommu = NULL;
1405                 spin_unlock_irqrestore(&device_domain_lock, flags);
1406
1407                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1408                 free_devinfo_mem(info);
1409
1410                 spin_lock_irqsave(&device_domain_lock, flags);
1411         }
1412         spin_unlock_irqrestore(&device_domain_lock, flags);
1413 }
1414
1415 /*
1416  * find_domain
1417  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1418  */
1419 struct dmar_domain *
1420 find_domain(struct pci_dev *pdev)
1421 {
1422         struct device_domain_info *info;
1423
1424         /* No lock here, assumes no domain exit in normal case */
1425         info = pdev->dev.archdata.iommu;
1426         if (info)
1427                 return info->domain;
1428         return NULL;
1429 }
1430
1431 static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
1432      struct pci_dev *dev)
1433 {
1434         int index;
1435
1436         while (dev) {
1437                 for (index = 0; index < cnt; index++)
1438                         if (dev == devices[index])
1439                                 return 1;
1440
1441                 /* Check our parent */
1442                 dev = dev->bus->self;
1443         }
1444
1445         return 0;
1446 }
1447
1448 static struct dmar_drhd_unit *
1449 dmar_find_matched_drhd_unit(struct pci_dev *dev)
1450 {
1451         struct dmar_drhd_unit *drhd = NULL;
1452
1453         list_for_each_entry(drhd, &dmar_drhd_units, list) {
1454                 if (drhd->include_all || dmar_pci_device_match(drhd->devices,
1455                                                 drhd->devices_cnt, dev))
1456                         return drhd;
1457         }
1458
1459         return NULL;
1460 }
1461
1462 /* domain is initialized */
1463 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1464 {
1465         struct dmar_domain *domain, *found = NULL;
1466         struct intel_iommu *iommu;
1467         struct dmar_drhd_unit *drhd;
1468         struct device_domain_info *info, *tmp;
1469         struct pci_dev *dev_tmp;
1470         unsigned long flags;
1471         int bus = 0, devfn = 0;
1472
1473         domain = find_domain(pdev);
1474         if (domain)
1475                 return domain;
1476
1477         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1478         if (dev_tmp) {
1479                 if (dev_tmp->is_pcie) {
1480                         bus = dev_tmp->subordinate->number;
1481                         devfn = 0;
1482                 } else {
1483                         bus = dev_tmp->bus->number;
1484                         devfn = dev_tmp->devfn;
1485                 }
1486                 spin_lock_irqsave(&device_domain_lock, flags);
1487                 list_for_each_entry(info, &device_domain_list, global) {
1488                         if (info->bus == bus && info->devfn == devfn) {
1489                                 found = info->domain;
1490                                 break;
1491                         }
1492                 }
1493                 spin_unlock_irqrestore(&device_domain_lock, flags);
1494                 /* pcie-pci bridge already has a domain, use it */
1495                 if (found) {
1496                         domain = found;
1497                         goto found_domain;
1498                 }
1499         }
1500
1501         /* Allocate new domain for the device */
1502         drhd = dmar_find_matched_drhd_unit(pdev);
1503         if (!drhd) {
1504                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1505                         pci_name(pdev));
1506                 return NULL;
1507         }
1508         iommu = drhd->iommu;
1509
1510         domain = iommu_alloc_domain(iommu);
1511         if (!domain)
1512                 goto error;
1513
1514         if (domain_init(domain, gaw)) {
1515                 domain_exit(domain);
1516                 goto error;
1517         }
1518
1519         /* register pcie-to-pci device */
1520         if (dev_tmp) {
1521                 info = alloc_devinfo_mem();
1522                 if (!info) {
1523                         domain_exit(domain);
1524                         goto error;
1525                 }
1526                 info->bus = bus;
1527                 info->devfn = devfn;
1528                 info->dev = NULL;
1529                 info->domain = domain;
1530                 /* This domain is shared by devices under p2p bridge */
1531                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1532
1533                 /* pcie-to-pci bridge already has a domain, use it */
1534                 found = NULL;
1535                 spin_lock_irqsave(&device_domain_lock, flags);
1536                 list_for_each_entry(tmp, &device_domain_list, global) {
1537                         if (tmp->bus == bus && tmp->devfn == devfn) {
1538                                 found = tmp->domain;
1539                                 break;
1540                         }
1541                 }
1542                 if (found) {
1543                         free_devinfo_mem(info);
1544                         domain_exit(domain);
1545                         domain = found;
1546                 } else {
1547                         list_add(&info->link, &domain->devices);
1548                         list_add(&info->global, &device_domain_list);
1549                 }
1550                 spin_unlock_irqrestore(&device_domain_lock, flags);
1551         }
1552
1553 found_domain:
1554         info = alloc_devinfo_mem();
1555         if (!info)
1556                 goto error;
1557         info->bus = pdev->bus->number;
1558         info->devfn = pdev->devfn;
1559         info->dev = pdev;
1560         info->domain = domain;
1561         spin_lock_irqsave(&device_domain_lock, flags);
1562         /* somebody else was faster and already attached a domain */
1563         found = find_domain(pdev);
1564         if (found != NULL) {
1565                 spin_unlock_irqrestore(&device_domain_lock, flags);
1566                 if (found != domain) {
1567                         domain_exit(domain);
1568                         domain = found;
1569                 }
1570                 free_devinfo_mem(info);
1571                 return domain;
1572         }
1573         list_add(&info->link, &domain->devices);
1574         list_add(&info->global, &device_domain_list);
1575         pdev->dev.archdata.iommu = info;
1576         spin_unlock_irqrestore(&device_domain_lock, flags);
1577         return domain;
1578 error:
1579         /* recheck here: another caller may have set up the domain already */
1580         return find_domain(pdev);
1581 }
1582
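/*
 * Set up an identity (1:1) mapping of [start, end) for @pdev: reserve the
 * IOVA range, clear any stale PTEs, map the range read/write and install
 * the device's context entry.
 */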
1583 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1584 {
1585         struct dmar_domain *domain;
1586         unsigned long size;
1587         u64 base;
1588         int ret;
1589
1590         printk(KERN_INFO
1591                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1592                 pci_name(pdev), start, end);
1593         /* page table init */
1594         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1595         if (!domain)
1596                 return -ENOMEM;
1597
1598         /* The address might not be aligned */
1599         base = start & PAGE_MASK_4K;
1600         size = end - base;
1601         size = PAGE_ALIGN_4K(size);
1602         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1603                         IOVA_PFN(base + size) - 1)) {
1604                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1605                 ret = -ENOMEM;
1606                 goto error;
1607         }
1608
1609         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1610                 size, base, pci_name(pdev));
1611         /*
1612          * The RMRR range might overlap an already-mapped physical memory
1613          * range, so clear any existing mapping first.
1614          */
1615         dma_pte_clear_range(domain, base, base + size);
1616
1617         ret = domain_page_mapping(domain, base, base, size,
1618                 DMA_PTE_READ|DMA_PTE_WRITE);
1619         if (ret)
1620                 goto error;
1621
1622         /* context entry init */
1623         ret = domain_context_mapping(domain, pdev);
1624         if (!ret)
1625                 return 0;
1626 error:
1627         domain_exit(domain);
1628         return ret;
1629
1630 }
1631
1632 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1633         struct pci_dev *pdev)
1634 {
1635         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1636                 return 0;
1637         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1638                 rmrr->end_address + 1);
1639 }
1640
1641 #ifdef CONFIG_DMAR_GFX_WA
1642 struct iommu_prepare_data {
1643         struct pci_dev *pdev;
1644         int ret;
1645 };
1646
1647 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1648                                          unsigned long end_pfn, void *datax)
1649 {
1650         struct iommu_prepare_data *data;
1651
1652         data = (struct iommu_prepare_data *)datax;
1653
1654         data->ret = iommu_prepare_identity_map(data->pdev,
1655                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1656         return data->ret;
1657
1658 }
1659
1660 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1661 {
1662         int nid;
1663         struct iommu_prepare_data data;
1664
1665         data.pdev = pdev;
1666         data.ret = 0;
1667
1668         for_each_online_node(nid) {
1669                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1670                 if (data.ret)
1671                         return data.ret;
1672         }
1673         return data.ret;
1674 }
1675
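/*
 * CONFIG_DMAR_GFX_WA: give every graphics device that is not already set
 * to bypass the IOMMU a 1:1 mapping of all active memory regions.
 */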
1676 static void __init iommu_prepare_gfx_mapping(void)
1677 {
1678         struct pci_dev *pdev = NULL;
1679         int ret;
1680
1681         for_each_pci_dev(pdev) {
1682                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1683                                 !IS_GFX_DEVICE(pdev))
1684                         continue;
1685                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1686                         pci_name(pdev));
1687                 ret = iommu_prepare_with_active_regions(pdev);
1688                 if (ret)
1689                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1690         }
1691 }
1692 #endif
1693
1694 #ifdef CONFIG_DMAR_FLOPPY_WA
1695 static inline void iommu_prepare_isa(void)
1696 {
1697         struct pci_dev *pdev;
1698         int ret;
1699
1700         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1701         if (!pdev)
1702                 return;
1703
1704         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1705         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1706
1707         if (ret)
1708                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1709                         "floppy might not work\n");
1710
1711 }
1712 #else
1713 static inline void iommu_prepare_isa(void)
1714 {
1715         return;
1716 }
1717 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1718
1719 int __init init_dmars(void)
1720 {
1721         struct dmar_drhd_unit *drhd;
1722         struct dmar_rmrr_unit *rmrr;
1723         struct pci_dev *pdev;
1724         struct intel_iommu *iommu;
1725         int i, ret, unit = 0;
1726
1727         /*
1728          * for each drhd
1729          *    allocate root
1730          *    initialize and program root entry to not present
1731          * endfor
1732          */
1733         for_each_drhd_unit(drhd) {
1734                 if (drhd->ignored)
1735                         continue;
1736                 g_num_of_iommus++;
1737                 /*
1738                  * No lock needed: this is only incremented in the
1739                  * single-threaded kernel __init code path; all other
1740                  * accesses are read-only.
1741                  */
1742         }
1743
1744         g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
1745         if (!g_iommus) {
1746                 ret = -ENOMEM;
1747                 goto error;
1748         }
1749
1750         deferred_flush = kzalloc(g_num_of_iommus *
1751                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1752         if (!deferred_flush) {
1753                 ret = -ENOMEM;
1754                 goto error;
1755         }
1756
1757         i = 0;
1758         for_each_drhd_unit(drhd) {
1759                 if (drhd->ignored)
1760                         continue;
1761                 iommu = alloc_iommu(&g_iommus[i], drhd);
1762                 i++;
1763                 if (!iommu) {
1764                         ret = -ENOMEM;
1765                         goto error;
1766                 }
1767
1768                 /*
1769                  * TBD:
1770                  * we could share the same root & context tables
1771                  * among all IOMMUs. Need to split it later.
1772                  */
1773                 ret = iommu_alloc_root_entry(iommu);
1774                 if (ret) {
1775                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1776                         goto error;
1777                 }
1778         }
1779
1780         /*
1781          * For each rmrr
1782          *   for each dev attached to rmrr
1783          *   do
1784          *     locate drhd for dev, alloc domain for dev
1785          *     allocate free domain
1786          *     allocate page table entries for rmrr
1787          *     if context not allocated for bus
1788          *           allocate and init context
1789          *           set present in root table for this bus
1790          *     init context with domain, translation etc
1791          *    endfor
1792          * endfor
1793          */
1794         for_each_rmrr_units(rmrr) {
1795                 for (i = 0; i < rmrr->devices_cnt; i++) {
1796                         pdev = rmrr->devices[i];
1797                         /* some BIOSes list nonexistent devices in the DMAR table */
1798                         if (!pdev)
1799                                 continue;
1800                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1801                         if (ret)
1802                                 printk(KERN_ERR
1803                                  "IOMMU: mapping reserved region failed\n");
1804                 }
1805         }
1806
1807         iommu_prepare_gfx_mapping();
1808
1809         iommu_prepare_isa();
1810
1811         /*
1812          * for each drhd
1813          *   enable fault log
1814          *   global invalidate context cache
1815          *   global invalidate iotlb
1816          *   enable translation
1817          */
1818         for_each_drhd_unit(drhd) {
1819                 if (drhd->ignored)
1820                         continue;
1821                 iommu = drhd->iommu;
1822                 sprintf(iommu->name, "dmar%d", unit++);
1823
1824                 iommu_flush_write_buffer(iommu);
1825
1826                 ret = dmar_set_interrupt(iommu);
1827                 if (ret)
1828                         goto error;
1829
1830                 iommu_set_root_entry(iommu);
1831
1832                 iommu_flush_context_global(iommu, 0);
1833                 iommu_flush_iotlb_global(iommu, 0);
1834
1835                 iommu_disable_protect_mem_regions(iommu);
1836
1837                 ret = iommu_enable_translation(iommu);
1838                 if (ret)
1839                         goto error;
1840         }
1841
1842         return 0;
1843 error:
1844         for_each_drhd_unit(drhd) {
1845                 if (drhd->ignored)
1846                         continue;
1847                 iommu = drhd->iommu;
1848                 free_iommu(iommu);
1849         }
1850         kfree(g_iommus);
1851         return ret;
1852 }
1853
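/*
 * Round a (host address, size) pair up to whole 4K pages, taking the
 * offset of the address within its page into account.
 */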
1854 static inline u64 aligned_size(u64 host_addr, size_t size)
1855 {
1856         u64 addr;
1857         addr = (host_addr & (~PAGE_MASK_4K)) + size;
1858         return PAGE_ALIGN_4K(addr);
1859 }
1860
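/*
 * Allocate an IOVA range of @size bytes ending at or below @end, clamped
 * to the domain's guest address width.
 */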
1861 struct iova *
1862 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1863 {
1864         struct iova *piova;
1865
1866         /* Make sure it's in range */
1867         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1868         if (!size || (IOVA_START_ADDR + size > end))
1869                 return NULL;
1870
1871         piova = alloc_iova(&domain->iovad,
1872                         size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1873         return piova;
1874 }
1875
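/*
 * Allocate an IOVA for @dev: devices limited to 32-bit DMA (or all devices
 * when forcedac is set) allocate directly from their DMA mask; 64-bit
 * capable devices first try below 4GB and only then use the full mask.
 */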
1876 static struct iova *
1877 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1878                 size_t size)
1879 {
1880         struct pci_dev *pdev = to_pci_dev(dev);
1881         struct iova *iova = NULL;
1882
1883         if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1884                 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1885         } else  {
1886                 /*
1887                  * First try to allocate an I/O virtual address below
1888                  * DMA_32BIT_MASK; if that fails, fall back to allocating
1889                  * from the device's full DMA mask range.
1890                  */
1891                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1892                 if (!iova)
1893                         iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1894         }
1895
1896         if (!iova) {
1897                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
1898                 return NULL;
1899         }
1900
1901         return iova;
1902 }
1903
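/*
 * Return the DMA domain for @pdev, allocating one if necessary, and make
 * sure the device's context entry points to it.
 */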
1904 static struct dmar_domain *
1905 get_valid_domain_for_dev(struct pci_dev *pdev)
1906 {
1907         struct dmar_domain *domain;
1908         int ret;
1909
1910         domain = get_domain_for_dev(pdev,
1911                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1912         if (!domain) {
1913                 printk(KERN_ERR
1914                         "Allocating domain for %s failed\n", pci_name(pdev));
1915                 return NULL;
1916         }
1917
1918         /* make sure context mapping is ok */
1919         if (unlikely(!domain_context_mapped(domain, pdev))) {
1920                 ret = domain_context_mapping(domain, pdev);
1921                 if (ret) {
1922                         printk(KERN_ERR
1923                                 "Domain context map for %s failed\n",
1924                                 pci_name(pdev));
1925                         return NULL;
1926                 }
1927         }
1928
1929         return domain;
1930 }
1931
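/*
 * Map a physically contiguous buffer for DMA: allocate an IOVA, set up the
 * page-table entries with protection matching the DMA direction, and flush
 * the IOTLB (or the write buffer) for the new mapping.  Returns the bus
 * address, or 0 on failure.
 */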
1932 static dma_addr_t
1933 intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
1934 {
1935         struct pci_dev *pdev = to_pci_dev(hwdev);
1936         struct dmar_domain *domain;
1937         unsigned long start_paddr;
1938         struct iova *iova;
1939         int prot = 0;
1940         int ret;
1941
1942         BUG_ON(dir == DMA_NONE);
1943         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1944                 return paddr;
1945
1946         domain = get_valid_domain_for_dev(pdev);
1947         if (!domain)
1948                 return 0;
1949
1950         size = aligned_size((u64)paddr, size);
1951
1952         iova = __intel_alloc_iova(hwdev, domain, size);
1953         if (!iova)
1954                 goto error;
1955
1956         start_paddr = iova->pfn_lo << PAGE_SHIFT_4K;
1957
1958         /*
1959          * Check if DMAR supports zero-length reads on write only
1960          * mappings..
1961          */
1962         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
1963                         !cap_zlr(domain->iommu->cap))
1964                 prot |= DMA_PTE_READ;
1965         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1966                 prot |= DMA_PTE_WRITE;
1967         /*
1968          * paddr through (paddr + size) might span a partial page, so map
1969          * the whole page.  Note: if two parts of one page are mapped
1970          * separately, we might end up with two guest addresses mapping to
1971          * the same host paddr, but this is not a big problem.
1972          */
1973         ret = domain_page_mapping(domain, start_paddr,
1974                 ((u64)paddr) & PAGE_MASK_4K, size, prot);
1975         if (ret)
1976                 goto error;
1977
1978         pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1979                 pci_name(pdev), size, (u64)paddr,
1980                 size, (u64)start_paddr, dir);
1981
1982         /* it's a non-present to present mapping */
1983         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1984                         start_paddr, size >> PAGE_SHIFT_4K, 1);
1985         if (ret)
1986                 iommu_flush_write_buffer(domain->iommu);
1987
1988         return (start_paddr + ((u64)paddr & (~PAGE_MASK_4K)));
1989
1990 error:
1991         if (iova)
1992                 __free_iova(&domain->iovad, iova);
1993         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
1994                 pci_name(pdev), size, (u64)paddr, dir);
1995         return 0;
1996 }
1997
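/*
 * Flush all deferred unmaps: for every IOMMU with pending entries do one
 * global IOTLB flush and free the queued IOVAs.  Called with
 * async_umap_flush_lock held.
 */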
1998 static void flush_unmaps(void)
1999 {
2000         int i, j;
2001
2002         timer_on = 0;
2003
2004         /* just flush them all */
2005         for (i = 0; i < g_num_of_iommus; i++) {
2006                 if (deferred_flush[i].next) {
2007                         iommu_flush_iotlb_global(&g_iommus[i], 0);
2008                         for (j = 0; j < deferred_flush[i].next; j++) {
2009                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2010                                                 deferred_flush[i].iova[j]);
2011                         }
2012                         deferred_flush[i].next = 0;
2013                 }
2014         }
2015
2016         list_size = 0;
2017 }
2018
2019 static void flush_unmaps_timeout(unsigned long data)
2020 {
2021         unsigned long flags;
2022
2023         spin_lock_irqsave(&async_umap_flush_lock, flags);
2024         flush_unmaps();
2025         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2026 }
2027
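/*
 * Queue an IOVA for deferred freeing; the batch is flushed either from the
 * 10ms unmap_timer or once HIGH_WATER_MARK entries have accumulated.
 */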
2028 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2029 {
2030         unsigned long flags;
2031         int next, iommu_id;
2032
2033         spin_lock_irqsave(&async_umap_flush_lock, flags);
2034         if (list_size == HIGH_WATER_MARK)
2035                 flush_unmaps();
2036
2037         iommu_id = dom->iommu - g_iommus;
2038         next = deferred_flush[iommu_id].next;
2039         deferred_flush[iommu_id].domain[next] = dom;
2040         deferred_flush[iommu_id].iova[next] = iova;
2041         deferred_flush[iommu_id].next++;
2042
2043         if (!timer_on) {
2044                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2045                 timer_on = 1;
2046         }
2047         list_size++;
2048         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2049 }
2050
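/*
 * Tear down the mapping for a DMA handle: clear the PTEs, free the page
 * tables, then either flush the IOTLB immediately (intel_iommu_strict mode)
 * or defer freeing the IOVA to the batched flush path.
 */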
2051 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
2052         size_t size, int dir)
2053 {
2054         struct pci_dev *pdev = to_pci_dev(dev);
2055         struct dmar_domain *domain;
2056         unsigned long start_addr;
2057         struct iova *iova;
2058
2059         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2060                 return;
2061         domain = find_domain(pdev);
2062         BUG_ON(!domain);
2063
2064         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2065         if (!iova)
2066                 return;
2067
2068         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2069         size = aligned_size((u64)dev_addr, size);
2070
2071         pr_debug("Device %s unmapping: %lx@%llx\n",
2072                 pci_name(pdev), size, (u64)start_addr);
2073
2074         /* clear the PTEs for the whole range */
2075         dma_pte_clear_range(domain, start_addr, start_addr + size);
2076         /* free page tables */
2077         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2078         if (intel_iommu_strict) {
2079                 if (iommu_flush_iotlb_psi(domain->iommu,
2080                         domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
2081                         iommu_flush_write_buffer(domain->iommu);
2082                 /* free iova */
2083                 __free_iova(&domain->iovad, iova);
2084         } else {
2085                 add_unmap(domain, iova);
2086                 /*
2087                  * Queue up the release of the unmap to save the ~1/6th of
2088                  * the cpu otherwise used up by the iotlb flush operation...
2089                  */
2090         }
2091 }
2092
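/* Allocate zeroed pages and map them bidirectionally via intel_map_single(). */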
2093 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
2094                        dma_addr_t *dma_handle, gfp_t flags)
2095 {
2096         void *vaddr;
2097         int order;
2098
2099         size = PAGE_ALIGN_4K(size);
2100         order = get_order(size);
2101         flags &= ~(GFP_DMA | GFP_DMA32);
2102
2103         vaddr = (void *)__get_free_pages(flags, order);
2104         if (!vaddr)
2105                 return NULL;
2106         memset(vaddr, 0, size);
2107
2108         *dma_handle = intel_map_single(hwdev, virt_to_bus(vaddr), size, DMA_BIDIRECTIONAL);
2109         if (*dma_handle)
2110                 return vaddr;
2111         free_pages((unsigned long)vaddr, order);
2112         return NULL;
2113 }
2114
2115 static void intel_free_coherent(struct device *hwdev, size_t size,
2116         void *vaddr, dma_addr_t dma_handle)
2117 {
2118         int order;
2119
2120         size = PAGE_ALIGN_4K(size);
2121         order = get_order(size);
2122
2123         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2124         free_pages((unsigned long)vaddr, order);
2125 }
2126
2127 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
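/*
 * Unmap a scatterlist mapped by intel_map_sg(): clear the PTEs and page
 * tables for the whole range, flush the IOTLB (and the write buffer if
 * needed) and free the IOVA.
 */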
2128 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2129         int nelems, int dir)
2130 {
2131         int i;
2132         struct pci_dev *pdev = to_pci_dev(hwdev);
2133         struct dmar_domain *domain;
2134         unsigned long start_addr;
2135         struct iova *iova;
2136         size_t size = 0;
2137         void *addr;
2138         struct scatterlist *sg;
2139
2140         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2141                 return;
2142
2143         domain = find_domain(pdev);
2144
2145         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2146         if (!iova)
2147                 return;
2148         for_each_sg(sglist, sg, nelems, i) {
2149                 addr = SG_ENT_VIRT_ADDRESS(sg);
2150                 size += aligned_size((u64)addr, sg->length);
2151         }
2152
2153         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2154
2155         /* clear the PTEs for the whole range */
2156         dma_pte_clear_range(domain, start_addr, start_addr + size);
2157         /* free page tables */
2158         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2159
2160         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2161                         size >> PAGE_SHIFT_4K, 0))
2162                 iommu_flush_write_buffer(domain->iommu);
2163
2164         /* free iova */
2165         __free_iova(&domain->iovad, iova);
2166 }
2167
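/*
 * Scatterlist "mapping" for devices that bypass the IOMMU: the DMA address
 * is simply the bus address of each buffer.
 */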
2168 static int intel_nontranslate_map_sg(struct device *hwdev,
2169         struct scatterlist *sglist, int nelems, int dir)
2170 {
2171         int i;
2172         struct scatterlist *sg;
2173
2174         for_each_sg(sglist, sg, nelems, i) {
2175                 BUG_ON(!sg_page(sg));
2176                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2177                 sg->dma_length = sg->length;
2178         }
2179         return nelems;
2180 }
2181
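/*
 * Map a scatterlist into one contiguous IOVA range, element by element.
 * On failure the partial mapping is torn down and 0 is returned.
 */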
2182 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2183                                 int nelems, int dir)
2184 {
2185         void *addr;
2186         int i;
2187         struct pci_dev *pdev = to_pci_dev(hwdev);
2188         struct dmar_domain *domain;
2189         size_t size = 0;
2190         int prot = 0;
2191         size_t offset = 0;
2192         struct iova *iova = NULL;
2193         int ret;
2194         struct scatterlist *sg;
2195         unsigned long start_addr;
2196
2197         BUG_ON(dir == DMA_NONE);
2198         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2199                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2200
2201         domain = get_valid_domain_for_dev(pdev);
2202         if (!domain)
2203                 return 0;
2204
2205         for_each_sg(sglist, sg, nelems, i) {
2206                 addr = SG_ENT_VIRT_ADDRESS(sg);
2207                 addr = (void *)virt_to_phys(addr);
2208                 size += aligned_size((u64)addr, sg->length);
2209         }
2210
2211         iova = __intel_alloc_iova(hwdev, domain, size);
2212         if (!iova) {
2213                 sglist->dma_length = 0;
2214                 return 0;
2215         }
2216
2217         /*
2218          * Check if DMAR supports zero-length reads on write only
2219          * mappings..
2220          */
2221         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2222                         !cap_zlr(domain->iommu->cap))
2223                 prot |= DMA_PTE_READ;
2224         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2225                 prot |= DMA_PTE_WRITE;
2226
2227         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2228         offset = 0;
2229         for_each_sg(sglist, sg, nelems, i) {
2230                 addr = SG_ENT_VIRT_ADDRESS(sg);
2231                 addr = (void *)virt_to_phys(addr);
2232                 size = aligned_size((u64)addr, sg->length);
2233                 ret = domain_page_mapping(domain, start_addr + offset,
2234                         ((u64)addr) & PAGE_MASK_4K,
2235                         size, prot);
2236                 if (ret) {
2237                         /* clear what has been mapped so far */
2238                         dma_pte_clear_range(domain, start_addr,
2239                                   start_addr + offset);
2240                         /* free page tables */
2241                         dma_pte_free_pagetable(domain, start_addr,
2242                                   start_addr + offset);
2243                         /* free iova */
2244                         __free_iova(&domain->iovad, iova);
2245                         return 0;
2246                 }
2247                 sg->dma_address = start_addr + offset +
2248                                 ((u64)addr & (~PAGE_MASK_4K));
2249                 sg->dma_length = sg->length;
2250                 offset += size;
2251         }
2252
2253         /* it's a non-present to present mapping */
2254         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2255                         start_addr, offset >> PAGE_SHIFT_4K, 1))
2256                 iommu_flush_write_buffer(domain->iommu);
2257         return nelems;
2258 }
2259
2260 static struct dma_mapping_ops intel_dma_ops = {
2261         .alloc_coherent = intel_alloc_coherent,
2262         .free_coherent = intel_free_coherent,
2263         .map_single = intel_map_single,
2264         .unmap_single = intel_unmap_single,
2265         .map_sg = intel_map_sg,
2266         .unmap_sg = intel_unmap_sg,
2267 };
2268
2269 static inline int iommu_domain_cache_init(void)
2270 {
2271         int ret = 0;
2272
2273         iommu_domain_cache = kmem_cache_create("iommu_domain",
2274                                          sizeof(struct dmar_domain),
2275                                          0,
2276                                          SLAB_HWCACHE_ALIGN,
2277
2278                                          NULL);
2279         if (!iommu_domain_cache) {
2280                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2281                 ret = -ENOMEM;
2282         }
2283
2284         return ret;
2285 }
2286
2287 static inline int iommu_devinfo_cache_init(void)
2288 {
2289         int ret = 0;
2290
2291         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2292                                          sizeof(struct device_domain_info),
2293                                          0,
2294                                          SLAB_HWCACHE_ALIGN,
2295
2296                                          NULL);
2297         if (!iommu_devinfo_cache) {
2298                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2299                 ret = -ENOMEM;
2300         }
2301
2302         return ret;
2303 }
2304
2305 static inline int iommu_iova_cache_init(void)
2306 {
2307         int ret = 0;
2308
2309         iommu_iova_cache = kmem_cache_create("iommu_iova",
2310                                          sizeof(struct iova),
2311                                          0,
2312                                          SLAB_HWCACHE_ALIGN,
2313
2314                                          NULL);
2315         if (!iommu_iova_cache) {
2316                 printk(KERN_ERR "Couldn't create iova cache\n");
2317                 ret = -ENOMEM;
2318         }
2319
2320         return ret;
2321 }
2322
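/* Create the kmem caches used for domains, device_domain_infos and iovas. */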
2323 static int __init iommu_init_mempool(void)
2324 {
2325         int ret;
2326         ret = iommu_iova_cache_init();
2327         if (ret)
2328                 return ret;
2329
2330         ret = iommu_domain_cache_init();
2331         if (ret)
2332                 goto domain_error;
2333
2334         ret = iommu_devinfo_cache_init();
2335         if (!ret)
2336                 return ret;
2337
2338         kmem_cache_destroy(iommu_domain_cache);
2339 domain_error:
2340         kmem_cache_destroy(iommu_iova_cache);
2341
2342         return -ENOMEM;
2343 }
2344
2345 static void __init iommu_exit_mempool(void)
2346 {
2347         kmem_cache_destroy(iommu_devinfo_cache);
2348         kmem_cache_destroy(iommu_domain_cache);
2349         kmem_cache_destroy(iommu_iova_cache);
2350
2351 }
2352
2353 static int blacklist_iommu(const struct dmi_system_id *id)
2354 {
2355         printk(KERN_INFO "%s detected; disabling IOMMU\n",
2356                id->ident);
2357         dmar_disabled = 1;
2358         return 0;
2359 }
2360
2361 static struct dmi_system_id __initdata intel_iommu_dmi_table[] = {
2362         {       /* Some DG33BU BIOS revisions advertised non-existent VT-d */
2363                 .callback = blacklist_iommu,
2364                 .ident = "Intel DG33BU",
2365                 {       DMI_MATCH(DMI_BOARD_VENDOR, "Intel Corporation"),
2366                         DMI_MATCH(DMI_BOARD_NAME, "DG33BU"),
2367                 }
2368         },
2369         { }
2370 };
2371
2372
2373 void __init detect_intel_iommu(void)
2374 {
2375         if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2376                 return;
2377         if (early_dmar_detect()) {
2378                 dmi_check_system(intel_iommu_dmi_table);
2379                 if (dmar_disabled)
2380                         return;
2381                 iommu_detected = 1;
2382         }
2383 }
2384
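/*
 * Mark DRHD units that cover no PCI devices as ignored.  If gfx mapping is
 * disabled, also ignore units that cover only graphics devices and flag
 * those devices to bypass the IOMMU entirely.
 */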
2385 static void __init init_no_remapping_devices(void)
2386 {
2387         struct dmar_drhd_unit *drhd;
2388
2389         for_each_drhd_unit(drhd) {
2390                 if (!drhd->include_all) {
2391                         int i;
2392                         for (i = 0; i < drhd->devices_cnt; i++)
2393                                 if (drhd->devices[i] != NULL)
2394                                         break;
2395                         /* ignore DMAR unit if no pci devices exist */
2396                         if (i == drhd->devices_cnt)
2397                                 drhd->ignored = 1;
2398                 }
2399         }
2400
2401         if (dmar_map_gfx)
2402                 return;
2403
2404         for_each_drhd_unit(drhd) {
2405                 int i;
2406                 if (drhd->ignored || drhd->include_all)
2407                         continue;
2408
2409                 for (i = 0; i < drhd->devices_cnt; i++)
2410                         if (drhd->devices[i] &&
2411                                 !IS_GFX_DEVICE(drhd->devices[i]))
2412                                 break;
2413
2414                 if (i < drhd->devices_cnt)
2415                         continue;
2416
2417                 /* bypass IOMMU if it is just for gfx devices */
2418                 drhd->ignored = 1;
2419                 for (i = 0; i < drhd->devices_cnt; i++) {
2420                         if (!drhd->devices[i])
2421                                 continue;
2422                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2423                 }
2424         }
2425 }
2426
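/*
 * Main VT-d init entry point: parse the DMAR table, set up mempools and
 * reserved IOVA ranges, initialize all DMAR units and install
 * intel_dma_ops as the DMA mapping backend.
 */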
2427 int __init intel_iommu_init(void)
2428 {
2429         int ret = 0;
2430
2431         if (no_iommu || swiotlb || dmar_disabled)
2432                 return -ENODEV;
2433
2434         if (dmar_table_init())
2435                 return  -ENODEV;
2436
2437         iommu_init_mempool();
2438         dmar_init_reserved_ranges();
2439
2440         init_no_remapping_devices();
2441
2442         ret = init_dmars();
2443         if (ret) {
2444                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2445                 put_iova_domain(&reserved_iova_list);
2446                 iommu_exit_mempool();
2447                 return ret;
2448         }
2449         printk(KERN_INFO
2450         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2451
2452         init_timer(&unmap_timer);
2453         force_iommu = 1;
2454         dma_ops = &intel_dma_ops;
2455         return 0;
2456 }
2457
2458 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
2459 {
2460         /* Mobile 4 Series Chipset neglects to set RWBF capability,
2461            but needs it */
2462         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
2463         rwbf_quirk = 1;
2464 }
2465
2466 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);