Merge branch 'x86/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip...
[pandora-kernel.git] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  */
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/slab.h>
27 #include <linux/irq.h>
28 #include <linux/interrupt.h>
29 #include <linux/sysdev.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include "iova.h"
37 #include "intel-iommu.h"
38 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
39 #include <asm/cacheflush.h>
40 #include <asm/gart.h>
41 #include "pci.h"
42
/*
 * PCI class tests. @pdev is any pointer expression whose target has a
 * `class` member; it is parenthesized so that arguments such as `&dev`
 * or `devs + 1` expand correctly.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)

#define IOAPIC_RANGE_START      (0xfee00000)
#define IOAPIC_RANGE_END        (0xfeefffff)
#define IOVA_START_ADDR         (0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) /* 10sec */

/*
 * Largest address that fits in @gaw bits. @gaw is parenthesized because
 * `<<` binds tighter than `|`, `&` and `?:` in an unparenthesized argument.
 */
#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << (gaw)) - 1)
55
56
/* Forward declaration of the timer callback registered just below. */
static void flush_unmaps_timeout(unsigned long data);

/* Timer object whose handler is flush_unmaps_timeout(). */
DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);

static struct intel_iommu *g_iommus;

/* Capacity of each array in struct deferred_flush_tables. */
#define HIGH_WATER_MARK 250
/*
 * Parallel arrays of iova / domain pointers, HIGH_WATER_MARK slots each.
 * NOTE(review): `next` is presumably the index of the first unused slot -
 * confirm against the code that fills the table.
 */
struct deferred_flush_tables {
        int next;
        struct iova *iova[HIGH_WATER_MARK];
        struct dmar_domain *domain[HIGH_WATER_MARK];
};

static struct deferred_flush_tables *deferred_flush;

/*
 * NOTE(review): this is a plain int, not a bitmap as the previous comment
 * said; presumably the number of IOMMUs reachable through g_iommus -
 * confirm against its users.
 */
static int g_num_of_iommus;

static DEFINE_SPINLOCK(async_umap_flush_lock);
static LIST_HEAD(unmaps_to_do);

static int timer_on;
static long list_size;

static void domain_remove_dev_info(struct dmar_domain *domain);

/* Flags set by the "intel_iommu=" boot option parser, intel_iommu_setup(). */
static int dmar_disabled;
static int __initdata dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;

/* Sentinel pointer value (all bits set) of type struct device_domain_info *. */
#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);
91
92 static int __init intel_iommu_setup(char *str)
93 {
94         if (!str)
95                 return -EINVAL;
96         while (*str) {
97                 if (!strncmp(str, "off", 3)) {
98                         dmar_disabled = 1;
99                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
100                 } else if (!strncmp(str, "igfx_off", 8)) {
101                         dmar_map_gfx = 0;
102                         printk(KERN_INFO
103                                 "Intel-IOMMU: disable GFX device mapping\n");
104                 } else if (!strncmp(str, "forcedac", 8)) {
105                         printk(KERN_INFO
106                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
107                         dmar_forcedac = 1;
108                 } else if (!strncmp(str, "strict", 6)) {
109                         printk(KERN_INFO
110                                 "Intel-IOMMU: disable batched IOTLB flush\n");
111                         intel_iommu_strict = 1;
112                 }
113
114                 str += strcspn(str, ",");
115                 while (*str == ',')
116                         str++;
117         }
118         return 0;
119 }
120 __setup("intel_iommu=", intel_iommu_setup);
121
/*
 * kmem caches used by the alloc_*_mem()/free_*_mem() helpers in this file:
 * one each for domain, device-info and iova objects.
 */
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;
125
126 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
127 {
128         unsigned int flags;
129         void *vaddr;
130
131         /* trying to avoid low memory issues */
132         flags = current->flags & PF_MEMALLOC;
133         current->flags |= PF_MEMALLOC;
134         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
135         current->flags &= (~PF_MEMALLOC | flags);
136         return vaddr;
137 }
138
139
140 static inline void *alloc_pgtable_page(void)
141 {
142         unsigned int flags;
143         void *vaddr;
144
145         /* trying to avoid low memory issues */
146         flags = current->flags & PF_MEMALLOC;
147         current->flags |= PF_MEMALLOC;
148         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
149         current->flags &= (~PF_MEMALLOC | flags);
150         return vaddr;
151 }
152
/* Hand the page at @vaddr back through free_page(). */
static inline void free_pgtable_page(void *vaddr)
{
        unsigned long page_addr = (unsigned long)vaddr;

        free_page(page_addr);
}
157
158 static inline void *alloc_domain_mem(void)
159 {
160         return iommu_kmem_cache_alloc(iommu_domain_cache);
161 }
162
163 static inline void free_domain_mem(void *vaddr)
164 {
165         kmem_cache_free(iommu_domain_cache, vaddr);
166 }
167
168 static inline void * alloc_devinfo_mem(void)
169 {
170         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
171 }
172
173 static inline void free_devinfo_mem(void *vaddr)
174 {
175         kmem_cache_free(iommu_devinfo_cache, vaddr);
176 }
177
178 struct iova *alloc_iova_mem(void)
179 {
180         return iommu_kmem_cache_alloc(iommu_iova_cache);
181 }
182
183 void free_iova_mem(struct iova *iova)
184 {
185         kmem_cache_free(iommu_iova_cache, iova);
186 }
187
188 static inline void __iommu_flush_cache(
189         struct intel_iommu *iommu, void *addr, int size)
190 {
191         if (!ecap_coherent(iommu->ecap))
192                 clflush_cache_range(addr, size);
193 }
194
195 /* Gets context entry for a given bus and devfn */
196 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
197                 u8 bus, u8 devfn)
198 {
199         struct root_entry *root;
200         struct context_entry *context;
201         unsigned long phy_addr;
202         unsigned long flags;
203
204         spin_lock_irqsave(&iommu->lock, flags);
205         root = &iommu->root_entry[bus];
206         context = get_context_addr_from_root(root);
207         if (!context) {
208                 context = (struct context_entry *)alloc_pgtable_page();
209                 if (!context) {
210                         spin_unlock_irqrestore(&iommu->lock, flags);
211                         return NULL;
212                 }
213                 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
214                 phy_addr = virt_to_phys((void *)context);
215                 set_root_value(root, phy_addr);
216                 set_root_present(root);
217                 __iommu_flush_cache(iommu, root, sizeof(*root));
218         }
219         spin_unlock_irqrestore(&iommu->lock, flags);
220         return &context[devfn];
221 }
222
223 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
224 {
225         struct root_entry *root;
226         struct context_entry *context;
227         int ret;
228         unsigned long flags;
229
230         spin_lock_irqsave(&iommu->lock, flags);
231         root = &iommu->root_entry[bus];
232         context = get_context_addr_from_root(root);
233         if (!context) {
234                 ret = 0;
235                 goto out;
236         }
237         ret = context_present(context[devfn]);
238 out:
239         spin_unlock_irqrestore(&iommu->lock, flags);
240         return ret;
241 }
242
243 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
244 {
245         struct root_entry *root;
246         struct context_entry *context;
247         unsigned long flags;
248
249         spin_lock_irqsave(&iommu->lock, flags);
250         root = &iommu->root_entry[bus];
251         context = get_context_addr_from_root(root);
252         if (context) {
253                 context_clear_entry(context[devfn]);
254                 __iommu_flush_cache(iommu, &context[devfn], \
255                         sizeof(*context));
256         }
257         spin_unlock_irqrestore(&iommu->lock, flags);
258 }
259
260 static void free_context_table(struct intel_iommu *iommu)
261 {
262         struct root_entry *root;
263         int i;
264         unsigned long flags;
265         struct context_entry *context;
266
267         spin_lock_irqsave(&iommu->lock, flags);
268         if (!iommu->root_entry) {
269                 goto out;
270         }
271         for (i = 0; i < ROOT_ENTRY_NR; i++) {
272                 root = &iommu->root_entry[i];
273                 context = get_context_addr_from_root(root);
274                 if (context)
275                         free_pgtable_page(context);
276         }
277         free_pgtable_page(iommu->root_entry);
278         iommu->root_entry = NULL;
279 out:
280         spin_unlock_irqrestore(&iommu->lock, flags);
281 }
282
/* page table handling */
#define LEVEL_STRIDE            (9)
#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)

/* Number of page-table levels for @agaw: agaw 0 -> 2 levels, 1 -> 3, ... */
static inline int agaw_to_level(int agaw)
{
        int levels = 2 + agaw;

        return levels;
}

/* Address width in bits for @agaw: 30 bits plus LEVEL_STRIDE per step. */
static inline int agaw_to_width(int agaw)
{
        int extra_bits = agaw * LEVEL_STRIDE;

        return 30 + extra_bits;
}

/* Inverse of agaw_to_width(); integer division, so partial steps truncate. */
static inline int width_to_agaw(int width)
{
        int bits_above_base = width - 30;

        return bits_above_base / LEVEL_STRIDE;
}

/* Bit position where the index for @level starts: 12 for level 1, 21 for 2, ... */
static inline unsigned int level_to_offset_bits(int level)
{
        int levels_below = level - 1;

        return 12 + levels_below * LEVEL_STRIDE;
}
307
308 static inline int address_level_offset(u64 addr, int level)
309 {
310         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
311 }
312
313 static inline u64 level_mask(int level)
314 {
315         return ((u64)-1 << level_to_offset_bits(level));
316 }
317
318 static inline u64 level_size(int level)
319 {
320         return ((u64)1 << level_to_offset_bits(level));
321 }
322
323 static inline u64 align_to_level(u64 addr, int level)
324 {
325         return ((addr + level_size(level) - 1) & level_mask(level));
326 }
327
328 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
329 {
330         int addr_width = agaw_to_width(domain->agaw);
331         struct dma_pte *parent, *pte = NULL;
332         int level = agaw_to_level(domain->agaw);
333         int offset;
334         unsigned long flags;
335
336         BUG_ON(!domain->pgd);
337
338         addr &= (((u64)1) << addr_width) - 1;
339         parent = domain->pgd;
340
341         spin_lock_irqsave(&domain->mapping_lock, flags);
342         while (level > 0) {
343                 void *tmp_page;
344
345                 offset = address_level_offset(addr, level);
346                 pte = &parent[offset];
347                 if (level == 1)
348                         break;
349
350                 if (!dma_pte_present(*pte)) {
351                         tmp_page = alloc_pgtable_page();
352
353                         if (!tmp_page) {
354                                 spin_unlock_irqrestore(&domain->mapping_lock,
355                                         flags);
356                                 return NULL;
357                         }
358                         __iommu_flush_cache(domain->iommu, tmp_page,
359                                         PAGE_SIZE_4K);
360                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
361                         /*
362                          * high level table always sets r/w, last level page
363                          * table control read/write
364                          */
365                         dma_set_pte_readable(*pte);
366                         dma_set_pte_writable(*pte);
367                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
368                 }
369                 parent = phys_to_virt(dma_pte_addr(*pte));
370                 level--;
371         }
372
373         spin_unlock_irqrestore(&domain->mapping_lock, flags);
374         return pte;
375 }
376
377 /* return address's pte at specific level */
378 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
379                 int level)
380 {
381         struct dma_pte *parent, *pte = NULL;
382         int total = agaw_to_level(domain->agaw);
383         int offset;
384
385         parent = domain->pgd;
386         while (level <= total) {
387                 offset = address_level_offset(addr, total);
388                 pte = &parent[offset];
389                 if (level == total)
390                         return pte;
391
392                 if (!dma_pte_present(*pte))
393                         break;
394                 parent = phys_to_virt(dma_pte_addr(*pte));
395                 total--;
396         }
397         return NULL;
398 }
399
400 /* clear one page's page table */
401 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
402 {
403         struct dma_pte *pte = NULL;
404
405         /* get last level pte */
406         pte = dma_addr_level_pte(domain, addr, 1);
407
408         if (pte) {
409                 dma_clear_pte(*pte);
410                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
411         }
412 }
413
414 /* clear last level pte, a tlb flush should be followed */
415 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
416 {
417         int addr_width = agaw_to_width(domain->agaw);
418
419         start &= (((u64)1) << addr_width) - 1;
420         end &= (((u64)1) << addr_width) - 1;
421         /* in case it's partial page */
422         start = PAGE_ALIGN_4K(start);
423         end &= PAGE_MASK_4K;
424
425         /* we don't need lock here, nobody else touches the iova range */
426         while (start < end) {
427                 dma_pte_clear_one(domain, start);
428                 start += PAGE_SIZE_4K;
429         }
430 }
431
/*
 * Free the page-table pages that map [@start, @end). The last-level ptes
 * should already have been cleared.
 *
 * For each level from 2 up to the domain's top level, every entry whose
 * level-sized span starts at or after the level-aligned @start and below
 * @end has its child page freed and is then cleared and cache-flushed.
 * The pgd itself is freed only when the range is the whole address space.
 *
 * NOTE(review): the early `return` inside the level loop also skips the
 * pgd check at the bottom - confirm that is intended.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
        u64 start, u64 end)
{
        int addr_width = agaw_to_width(domain->agaw);
        struct dma_pte *pte;
        int total = agaw_to_level(domain->agaw);
        int level;
        u64 tmp;

        /* truncate both bounds to the domain's address width */
        start &= (((u64)1) << addr_width) - 1;
        end &= (((u64)1) << addr_width) - 1;

        /* we don't need lock here, nobody else touches the iova range */
        level = 2;
        while (level <= total) {
                /* first level-aligned address at or above start */
                tmp = align_to_level(start, level);
                /* stop once not even one full entry of this level fits */
                if (tmp >= end || (tmp + level_size(level) > end))
                        return;

                while (tmp < end) {
                        pte = dma_addr_level_pte(domain, tmp, level);
                        if (pte) {
                                /* free the child table, then drop the entry */
                                free_pgtable_page(
                                        phys_to_virt(dma_pte_addr(*pte)));
                                dma_clear_pte(*pte);
                                __iommu_flush_cache(domain->iommu,
                                                pte, sizeof(*pte));
                        }
                        tmp += level_size(level);
                }
                level++;
        }
        /* free pgd */
        if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
                free_pgtable_page(domain->pgd);
                domain->pgd = NULL;
        }
}
471
472 /* iommu handling */
473 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
474 {
475         struct root_entry *root;
476         unsigned long flags;
477
478         root = (struct root_entry *)alloc_pgtable_page();
479         if (!root)
480                 return -ENOMEM;
481
482         __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
483
484         spin_lock_irqsave(&iommu->lock, flags);
485         iommu->root_entry = root;
486         spin_unlock_irqrestore(&iommu->lock, flags);
487
488         return 0;
489 }
490
/*
 * Poll a register until @cond becomes true.
 *
 * @iommu:  pointer with a `reg` base member
 * @offset: register offset added to iommu->reg
 * @op:     read accessor, called as op(address)
 * @cond:   expression tested after each read (normally refers to @sts)
 * @sts:    lvalue that receives each value read
 *
 * Calls cpu_relax() between reads and panics once more than
 * DMAR_OPERATION_TIMEOUT cycles have elapsed since the first read.
 *
 * Wrapped in do { } while (0) so that `IOMMU_WAIT_OP(...);` is a single
 * statement and stays valid inside if/else; @iommu and @offset are
 * parenthesized so expression arguments expand as written.
 */
#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
do {\
        cycles_t start_time = get_cycles();\
        while (1) {\
                sts = op((iommu)->reg + (offset));\
                if (cond)\
                        break;\
                if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
                        panic("DMAR hardware is malfunctioning\n");\
                cpu_relax();\
        }\
} while (0)
503
504 static void iommu_set_root_entry(struct intel_iommu *iommu)
505 {
506         void *addr;
507         u32 cmd, sts;
508         unsigned long flag;
509
510         addr = iommu->root_entry;
511
512         spin_lock_irqsave(&iommu->register_lock, flag);
513         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
514
515         cmd = iommu->gcmd | DMA_GCMD_SRTP;
516         writel(cmd, iommu->reg + DMAR_GCMD_REG);
517
518         /* Make sure hardware complete it */
519         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
520                 readl, (sts & DMA_GSTS_RTPS), sts);
521
522         spin_unlock_irqrestore(&iommu->register_lock, flag);
523 }
524
525 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
526 {
527         u32 val;
528         unsigned long flag;
529
530         if (!cap_rwbf(iommu->cap))
531                 return;
532         val = iommu->gcmd | DMA_GCMD_WBF;
533
534         spin_lock_irqsave(&iommu->register_lock, flag);
535         writel(val, iommu->reg + DMAR_GCMD_REG);
536
537         /* Make sure hardware complete it */
538         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
539                         readl, (!(val & DMA_GSTS_WBFS)), val);
540
541         spin_unlock_irqrestore(&iommu->register_lock, flag);
542 }
543
544 /* return value determine if we need a write buffer flush */
545 static int __iommu_flush_context(struct intel_iommu *iommu,
546         u16 did, u16 source_id, u8 function_mask, u64 type,
547         int non_present_entry_flush)
548 {
549         u64 val = 0;
550         unsigned long flag;
551
552         /*
553          * In the non-present entry flush case, if hardware doesn't cache
554          * non-present entry we do nothing and if hardware cache non-present
555          * entry, we flush entries of domain 0 (the domain id is used to cache
556          * any non-present entries)
557          */
558         if (non_present_entry_flush) {
559                 if (!cap_caching_mode(iommu->cap))
560                         return 1;
561                 else
562                         did = 0;
563         }
564
565         switch (type) {
566         case DMA_CCMD_GLOBAL_INVL:
567                 val = DMA_CCMD_GLOBAL_INVL;
568                 break;
569         case DMA_CCMD_DOMAIN_INVL:
570                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
571                 break;
572         case DMA_CCMD_DEVICE_INVL:
573                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
574                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
575                 break;
576         default:
577                 BUG();
578         }
579         val |= DMA_CCMD_ICC;
580
581         spin_lock_irqsave(&iommu->register_lock, flag);
582         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
583
584         /* Make sure hardware complete it */
585         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
586                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
587
588         spin_unlock_irqrestore(&iommu->register_lock, flag);
589
590         /* flush context entry will implictly flush write buffer */
591         return 0;
592 }
593
594 static int inline iommu_flush_context_global(struct intel_iommu *iommu,
595         int non_present_entry_flush)
596 {
597         return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
598                 non_present_entry_flush);
599 }
600
601 static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
602         int non_present_entry_flush)
603 {
604         return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
605                 non_present_entry_flush);
606 }
607
608 static int inline iommu_flush_context_device(struct intel_iommu *iommu,
609         u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
610 {
611         return __iommu_flush_context(iommu, did, source_id, function_mask,
612                 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
613 }
614
/*
 * Invalidate the IOTLB.
 *
 * @type selects the granularity: DMA_TLB_GLOBAL_FLUSH, DMA_TLB_DSI_FLUSH
 * (domain @did) or DMA_TLB_PSI_FLUSH (domain @did, pages at @addr with
 * address mask @size_order). Any other value hits BUG().
 *
 * The return value determines whether the caller needs a write buffer
 * flush: 1 when nothing was issued, 0 once the hardware has completed the
 * invalidation.
 */
static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
        u64 addr, unsigned int size_order, u64 type,
        int non_present_entry_flush)
{
        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
        u64 val = 0, val_iva = 0;
        unsigned long flag;

        /*
         * In the non-present entry flush case, if hardware doesn't cache
         * non-present entries we do nothing, and if hardware does cache
         * non-present entries, we flush entries of domain 0 (the domain id
         * used to cache any non-present entries)
         */
        if (non_present_entry_flush) {
                if (!cap_caching_mode(iommu->cap))
                        return 1;
                else
                        did = 0;
        }

        switch (type) {
        case DMA_TLB_GLOBAL_FLUSH:
                /* global flush doesn't need set IVA_REG */
                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
                break;
        case DMA_TLB_DSI_FLUSH:
                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
                break;
        case DMA_TLB_PSI_FLUSH:
                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
                /* Note: always flush non-leaf currently */
                val_iva = size_order | addr;
                break;
        default:
                BUG();
        }
        /* Note: set drain read/write */
#if 0
        /*
         * This is probably to be super secure.. Looks like we can
         * ignore it without any impact.
         */
        if (cap_read_drain(iommu->cap))
                val |= DMA_TLB_READ_DRAIN;
#endif
        if (cap_write_drain(iommu->cap))
                val |= DMA_TLB_WRITE_DRAIN;

        spin_lock_irqsave(&iommu->register_lock, flag);
        /* Note: Only uses first TLB reg currently */
        /* the address register (PSI only) is written before the command */
        if (val_iva)
                dmar_writeq(iommu->reg + tlb_offset, val_iva);
        dmar_writeq(iommu->reg + tlb_offset + 8, val);

        /* Make sure hardware completes it; val is reused for the status */
        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
                dmar_readq, (!(val & DMA_TLB_IVT)), val);

        spin_unlock_irqrestore(&iommu->register_lock, flag);

        /* check IOTLB invalidation granularity */
        if (DMA_TLB_IAIG(val) == 0)
                printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
                pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
                        DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
        /* flushing an iotlb entry will implicitly flush the write buffer */
        return 0;
}
686
687 static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
688         int non_present_entry_flush)
689 {
690         return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
691                 non_present_entry_flush);
692 }
693
694 static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
695         int non_present_entry_flush)
696 {
697         return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
698                 non_present_entry_flush);
699 }
700
701 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
702         u64 addr, unsigned int pages, int non_present_entry_flush)
703 {
704         unsigned int mask;
705
706         BUG_ON(addr & (~PAGE_MASK_4K));
707         BUG_ON(pages == 0);
708
709         /* Fallback to domain selective flush if no PSI support */
710         if (!cap_pgsel_inv(iommu->cap))
711                 return iommu_flush_iotlb_dsi(iommu, did,
712                         non_present_entry_flush);
713
714         /*
715          * PSI requires page size to be 2 ^ x, and the base address is naturally
716          * aligned to the size
717          */
718         mask = ilog2(__roundup_pow_of_two(pages));
719         /* Fallback to domain selective flush if size is too big */
720         if (mask > cap_max_amask_val(iommu->cap))
721                 return iommu_flush_iotlb_dsi(iommu, did,
722                         non_present_entry_flush);
723
724         return __iommu_flush_iotlb(iommu, did, addr, mask,
725                 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
726 }
727
728 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
729 {
730         u32 pmen;
731         unsigned long flags;
732
733         spin_lock_irqsave(&iommu->register_lock, flags);
734         pmen = readl(iommu->reg + DMAR_PMEN_REG);
735         pmen &= ~DMA_PMEN_EPM;
736         writel(pmen, iommu->reg + DMAR_PMEN_REG);
737
738         /* wait for the protected region status bit to clear */
739         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
740                 readl, !(pmen & DMA_PMEN_PRS), pmen);
741
742         spin_unlock_irqrestore(&iommu->register_lock, flags);
743 }
744
745 static int iommu_enable_translation(struct intel_iommu *iommu)
746 {
747         u32 sts;
748         unsigned long flags;
749
750         spin_lock_irqsave(&iommu->register_lock, flags);
751         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
752
753         /* Make sure hardware complete it */
754         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
755                 readl, (sts & DMA_GSTS_TES), sts);
756
757         iommu->gcmd |= DMA_GCMD_TE;
758         spin_unlock_irqrestore(&iommu->register_lock, flags);
759         return 0;
760 }
761
762 static int iommu_disable_translation(struct intel_iommu *iommu)
763 {
764         u32 sts;
765         unsigned long flag;
766
767         spin_lock_irqsave(&iommu->register_lock, flag);
768         iommu->gcmd &= ~DMA_GCMD_TE;
769         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
770
771         /* Make sure hardware complete it */
772         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
773                 readl, (!(sts & DMA_GSTS_TES)), sts);
774
775         spin_unlock_irqrestore(&iommu->register_lock, flag);
776         return 0;
777 }
778
779 /* iommu interrupt handling. Most stuff are MSI-like. */
780
781 static const char *fault_reason_strings[] =
782 {
783         "Software",
784         "Present bit in root entry is clear",
785         "Present bit in context entry is clear",
786         "Invalid context entry",
787         "Access beyond MGAW",
788         "PTE Write access is not set",
789         "PTE Read access is not set",
790         "Next page table ptr is invalid",
791         "Root table address invalid",
792         "Context table ptr is invalid",
793         "non-zero reserved fields in RTP",
794         "non-zero reserved fields in CTP",
795         "non-zero reserved fields in PTE",
796 };
797 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
798
799 const char *dmar_get_fault_reason(u8 fault_reason)
800 {
801         if (fault_reason > MAX_FAULT_REASON_IDX)
802                 return "Unknown";
803         else
804                 return fault_reason_strings[fault_reason];
805 }
806
807 void dmar_msi_unmask(unsigned int irq)
808 {
809         struct intel_iommu *iommu = get_irq_data(irq);
810         unsigned long flag;
811
812         /* unmask it */
813         spin_lock_irqsave(&iommu->register_lock, flag);
814         writel(0, iommu->reg + DMAR_FECTL_REG);
815         /* Read a reg to force flush the post write */
816         readl(iommu->reg + DMAR_FECTL_REG);
817         spin_unlock_irqrestore(&iommu->register_lock, flag);
818 }
819
820 void dmar_msi_mask(unsigned int irq)
821 {
822         unsigned long flag;
823         struct intel_iommu *iommu = get_irq_data(irq);
824
825         /* mask it */
826         spin_lock_irqsave(&iommu->register_lock, flag);
827         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
828         /* Read a reg to force flush the post write */
829         readl(iommu->reg + DMAR_FECTL_REG);
830         spin_unlock_irqrestore(&iommu->register_lock, flag);
831 }
832
833 void dmar_msi_write(int irq, struct msi_msg *msg)
834 {
835         struct intel_iommu *iommu = get_irq_data(irq);
836         unsigned long flag;
837
838         spin_lock_irqsave(&iommu->register_lock, flag);
839         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
840         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
841         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
842         spin_unlock_irqrestore(&iommu->register_lock, flag);
843 }
844
845 void dmar_msi_read(int irq, struct msi_msg *msg)
846 {
847         struct intel_iommu *iommu = get_irq_data(irq);
848         unsigned long flag;
849
850         spin_lock_irqsave(&iommu->register_lock, flag);
851         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
852         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
853         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
854         spin_unlock_irqrestore(&iommu->register_lock, flag);
855 }
856
857 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
858                 u8 fault_reason, u16 source_id, u64 addr)
859 {
860         const char *reason;
861
862         reason = dmar_get_fault_reason(fault_reason);
863
864         printk(KERN_ERR
865                 "DMAR:[%s] Request device [%02x:%02x.%d] "
866                 "fault addr %llx \n"
867                 "DMAR:[fault reason %02d] %s\n",
868                 (type ? "DMA Read" : "DMA Write"),
869                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
870                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
871         return 0;
872 }
873
874 #define PRIMARY_FAULT_REG_LEN (16)
875 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
876 {
877         struct intel_iommu *iommu = dev_id;
878         int reg, fault_index;
879         u32 fault_status;
880         unsigned long flag;
881
882         spin_lock_irqsave(&iommu->register_lock, flag);
883         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
884
885         /* TBD: ignore advanced fault log currently */
886         if (!(fault_status & DMA_FSTS_PPF))
887                 goto clear_overflow;
888
889         fault_index = dma_fsts_fault_record_index(fault_status);
890         reg = cap_fault_reg_offset(iommu->cap);
891         while (1) {
892                 u8 fault_reason;
893                 u16 source_id;
894                 u64 guest_addr;
895                 int type;
896                 u32 data;
897
898                 /* highest 32 bits */
899                 data = readl(iommu->reg + reg +
900                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
901                 if (!(data & DMA_FRCD_F))
902                         break;
903
904                 fault_reason = dma_frcd_fault_reason(data);
905                 type = dma_frcd_type(data);
906
907                 data = readl(iommu->reg + reg +
908                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
909                 source_id = dma_frcd_source_id(data);
910
911                 guest_addr = dmar_readq(iommu->reg + reg +
912                                 fault_index * PRIMARY_FAULT_REG_LEN);
913                 guest_addr = dma_frcd_page_addr(guest_addr);
914                 /* clear the fault */
915                 writel(DMA_FRCD_F, iommu->reg + reg +
916                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
917
918                 spin_unlock_irqrestore(&iommu->register_lock, flag);
919
920                 iommu_page_fault_do_one(iommu, type, fault_reason,
921                                 source_id, guest_addr);
922
923                 fault_index++;
924                 if (fault_index > cap_num_fault_regs(iommu->cap))
925                         fault_index = 0;
926                 spin_lock_irqsave(&iommu->register_lock, flag);
927         }
928 clear_overflow:
929         /* clear primary fault overflow */
930         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
931         if (fault_status & DMA_FSTS_PFO)
932                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
933
934         spin_unlock_irqrestore(&iommu->register_lock, flag);
935         return IRQ_HANDLED;
936 }
937
938 int dmar_set_interrupt(struct intel_iommu *iommu)
939 {
940         int irq, ret;
941
942         irq = create_irq();
943         if (!irq) {
944                 printk(KERN_ERR "IOMMU: no free vectors\n");
945                 return -EINVAL;
946         }
947
948         set_irq_data(irq, iommu);
949         iommu->irq = irq;
950
951         ret = arch_setup_dmar_msi(irq);
952         if (ret) {
953                 set_irq_data(irq, NULL);
954                 iommu->irq = 0;
955                 destroy_irq(irq);
956                 return 0;
957         }
958
959         /* Force fault register is cleared */
960         iommu_page_fault(irq, iommu);
961
962         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
963         if (ret)
964                 printk(KERN_ERR "IOMMU: can't request irq\n");
965         return ret;
966 }
967
968 static int iommu_init_domains(struct intel_iommu *iommu)
969 {
970         unsigned long ndomains;
971         unsigned long nlongs;
972
973         ndomains = cap_ndoms(iommu->cap);
974         pr_debug("Number of Domains supportd <%ld>\n", ndomains);
975         nlongs = BITS_TO_LONGS(ndomains);
976
977         /* TBD: there might be 64K domains,
978          * consider other allocation for future chip
979          */
980         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
981         if (!iommu->domain_ids) {
982                 printk(KERN_ERR "Allocating domain id array failed\n");
983                 return -ENOMEM;
984         }
985         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
986                         GFP_KERNEL);
987         if (!iommu->domains) {
988                 printk(KERN_ERR "Allocating domain array failed\n");
989                 kfree(iommu->domain_ids);
990                 return -ENOMEM;
991         }
992
993         /*
994          * if Caching mode is set, then invalid translations are tagged
995          * with domainid 0. Hence we need to pre-allocate it.
996          */
997         if (cap_caching_mode(iommu->cap))
998                 set_bit(0, iommu->domain_ids);
999         return 0;
1000 }
1001 static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
1002                                         struct dmar_drhd_unit *drhd)
1003 {
1004         int ret;
1005         int map_size;
1006         u32 ver;
1007
1008         iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
1009         if (!iommu->reg) {
1010                 printk(KERN_ERR "IOMMU: can't map the region\n");
1011                 goto error;
1012         }
1013         iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
1014         iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
1015
1016         /* the registers might be more than one page */
1017         map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
1018                 cap_max_fault_reg_offset(iommu->cap));
1019         map_size = PAGE_ALIGN_4K(map_size);
1020         if (map_size > PAGE_SIZE_4K) {
1021                 iounmap(iommu->reg);
1022                 iommu->reg = ioremap(drhd->reg_base_addr, map_size);
1023                 if (!iommu->reg) {
1024                         printk(KERN_ERR "IOMMU: can't map the region\n");
1025                         goto error;
1026                 }
1027         }
1028
1029         ver = readl(iommu->reg + DMAR_VER_REG);
1030         pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
1031                 drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
1032                 iommu->cap, iommu->ecap);
1033         ret = iommu_init_domains(iommu);
1034         if (ret)
1035                 goto error_unmap;
1036         spin_lock_init(&iommu->lock);
1037         spin_lock_init(&iommu->register_lock);
1038
1039         drhd->iommu = iommu;
1040         return iommu;
1041 error_unmap:
1042         iounmap(iommu->reg);
1043 error:
1044         kfree(iommu);
1045         return NULL;
1046 }
1047
1048 static void domain_exit(struct dmar_domain *domain);
1049 static void free_iommu(struct intel_iommu *iommu)
1050 {
1051         struct dmar_domain *domain;
1052         int i;
1053
1054         if (!iommu)
1055                 return;
1056
1057         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1058         for (; i < cap_ndoms(iommu->cap); ) {
1059                 domain = iommu->domains[i];
1060                 clear_bit(i, iommu->domain_ids);
1061                 domain_exit(domain);
1062                 i = find_next_bit(iommu->domain_ids,
1063                         cap_ndoms(iommu->cap), i+1);
1064         }
1065
1066         if (iommu->gcmd & DMA_GCMD_TE)
1067                 iommu_disable_translation(iommu);
1068
1069         if (iommu->irq) {
1070                 set_irq_data(iommu->irq, NULL);
1071                 /* This will mask the irq */
1072                 free_irq(iommu->irq, iommu);
1073                 destroy_irq(iommu->irq);
1074         }
1075
1076         kfree(iommu->domains);
1077         kfree(iommu->domain_ids);
1078
1079         /* free context mapping */
1080         free_context_table(iommu);
1081
1082         if (iommu->reg)
1083                 iounmap(iommu->reg);
1084         kfree(iommu);
1085 }
1086
1087 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1088 {
1089         unsigned long num;
1090         unsigned long ndomains;
1091         struct dmar_domain *domain;
1092         unsigned long flags;
1093
1094         domain = alloc_domain_mem();
1095         if (!domain)
1096                 return NULL;
1097
1098         ndomains = cap_ndoms(iommu->cap);
1099
1100         spin_lock_irqsave(&iommu->lock, flags);
1101         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1102         if (num >= ndomains) {
1103                 spin_unlock_irqrestore(&iommu->lock, flags);
1104                 free_domain_mem(domain);
1105                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1106                 return NULL;
1107         }
1108
1109         set_bit(num, iommu->domain_ids);
1110         domain->id = num;
1111         domain->iommu = iommu;
1112         iommu->domains[num] = domain;
1113         spin_unlock_irqrestore(&iommu->lock, flags);
1114
1115         return domain;
1116 }
1117
1118 static void iommu_free_domain(struct dmar_domain *domain)
1119 {
1120         unsigned long flags;
1121
1122         spin_lock_irqsave(&domain->iommu->lock, flags);
1123         clear_bit(domain->id, domain->iommu->domain_ids);
1124         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1125 }
1126
1127 static struct iova_domain reserved_iova_list;
1128 static struct lock_class_key reserved_alloc_key;
1129 static struct lock_class_key reserved_rbtree_key;
1130
1131 static void dmar_init_reserved_ranges(void)
1132 {
1133         struct pci_dev *pdev = NULL;
1134         struct iova *iova;
1135         int i;
1136         u64 addr, size;
1137
1138         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1139
1140         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1141                 &reserved_alloc_key);
1142         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1143                 &reserved_rbtree_key);
1144
1145         /* IOAPIC ranges shouldn't be accessed by DMA */
1146         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1147                 IOVA_PFN(IOAPIC_RANGE_END));
1148         if (!iova)
1149                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1150
1151         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1152         for_each_pci_dev(pdev) {
1153                 struct resource *r;
1154
1155                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1156                         r = &pdev->resource[i];
1157                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1158                                 continue;
1159                         addr = r->start;
1160                         addr &= PAGE_MASK_4K;
1161                         size = r->end - addr;
1162                         size = PAGE_ALIGN_4K(size);
1163                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1164                                 IOVA_PFN(size + addr) - 1);
1165                         if (!iova)
1166                                 printk(KERN_ERR "Reserve iova failed\n");
1167                 }
1168         }
1169
1170 }
1171
1172 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1173 {
1174         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1175 }
1176
/*
 * Round a guest address width up to the next value of the form 12 + 9*k,
 * and cap the result at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1190
1191 static int domain_init(struct dmar_domain *domain, int guest_width)
1192 {
1193         struct intel_iommu *iommu;
1194         int adjust_width, agaw;
1195         unsigned long sagaw;
1196
1197         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1198         spin_lock_init(&domain->mapping_lock);
1199
1200         domain_reserve_special_ranges(domain);
1201
1202         /* calculate AGAW */
1203         iommu = domain->iommu;
1204         if (guest_width > cap_mgaw(iommu->cap))
1205                 guest_width = cap_mgaw(iommu->cap);
1206         domain->gaw = guest_width;
1207         adjust_width = guestwidth_to_adjustwidth(guest_width);
1208         agaw = width_to_agaw(adjust_width);
1209         sagaw = cap_sagaw(iommu->cap);
1210         if (!test_bit(agaw, &sagaw)) {
1211                 /* hardware doesn't support it, choose a bigger one */
1212                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1213                 agaw = find_next_bit(&sagaw, 5, agaw);
1214                 if (agaw >= 5)
1215                         return -ENODEV;
1216         }
1217         domain->agaw = agaw;
1218         INIT_LIST_HEAD(&domain->devices);
1219
1220         /* always allocate the top pgd */
1221         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1222         if (!domain->pgd)
1223                 return -ENOMEM;
1224         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1225         return 0;
1226 }
1227
1228 static void domain_exit(struct dmar_domain *domain)
1229 {
1230         u64 end;
1231
1232         /* Domain 0 is reserved, so dont process it */
1233         if (!domain)
1234                 return;
1235
1236         domain_remove_dev_info(domain);
1237         /* destroy iovas */
1238         put_iova_domain(&domain->iovad);
1239         end = DOMAIN_MAX_ADDR(domain->gaw);
1240         end = end & (~PAGE_MASK_4K);
1241
1242         /* clear ptes */
1243         dma_pte_clear_range(domain, 0, end);
1244
1245         /* free page tables */
1246         dma_pte_free_pagetable(domain, 0, end);
1247
1248         iommu_free_domain(domain);
1249         free_domain_mem(domain);
1250 }
1251
1252 static int domain_context_mapping_one(struct dmar_domain *domain,
1253                 u8 bus, u8 devfn)
1254 {
1255         struct context_entry *context;
1256         struct intel_iommu *iommu = domain->iommu;
1257         unsigned long flags;
1258
1259         pr_debug("Set context mapping for %02x:%02x.%d\n",
1260                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1261         BUG_ON(!domain->pgd);
1262         context = device_to_context_entry(iommu, bus, devfn);
1263         if (!context)
1264                 return -ENOMEM;
1265         spin_lock_irqsave(&iommu->lock, flags);
1266         if (context_present(*context)) {
1267                 spin_unlock_irqrestore(&iommu->lock, flags);
1268                 return 0;
1269         }
1270
1271         context_set_domain_id(*context, domain->id);
1272         context_set_address_width(*context, domain->agaw);
1273         context_set_address_root(*context, virt_to_phys(domain->pgd));
1274         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1275         context_set_fault_enable(*context);
1276         context_set_present(*context);
1277         __iommu_flush_cache(iommu, context, sizeof(*context));
1278
1279         /* it's a non-present to present mapping */
1280         if (iommu_flush_context_device(iommu, domain->id,
1281                         (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1282                 iommu_flush_write_buffer(iommu);
1283         else
1284                 iommu_flush_iotlb_dsi(iommu, 0, 0);
1285         spin_unlock_irqrestore(&iommu->lock, flags);
1286         return 0;
1287 }
1288
1289 static int
1290 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1291 {
1292         int ret;
1293         struct pci_dev *tmp, *parent;
1294
1295         ret = domain_context_mapping_one(domain, pdev->bus->number,
1296                 pdev->devfn);
1297         if (ret)
1298                 return ret;
1299
1300         /* dependent device mapping */
1301         tmp = pci_find_upstream_pcie_bridge(pdev);
1302         if (!tmp)
1303                 return 0;
1304         /* Secondary interface's bus number and devfn 0 */
1305         parent = pdev->bus->self;
1306         while (parent != tmp) {
1307                 ret = domain_context_mapping_one(domain, parent->bus->number,
1308                         parent->devfn);
1309                 if (ret)
1310                         return ret;
1311                 parent = parent->bus->self;
1312         }
1313         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1314                 return domain_context_mapping_one(domain,
1315                         tmp->subordinate->number, 0);
1316         else /* this is a legacy PCI bridge */
1317                 return domain_context_mapping_one(domain,
1318                         tmp->bus->number, tmp->devfn);
1319 }
1320
1321 static int domain_context_mapped(struct dmar_domain *domain,
1322         struct pci_dev *pdev)
1323 {
1324         int ret;
1325         struct pci_dev *tmp, *parent;
1326
1327         ret = device_context_mapped(domain->iommu,
1328                 pdev->bus->number, pdev->devfn);
1329         if (!ret)
1330                 return ret;
1331         /* dependent device mapping */
1332         tmp = pci_find_upstream_pcie_bridge(pdev);
1333         if (!tmp)
1334                 return ret;
1335         /* Secondary interface's bus number and devfn 0 */
1336         parent = pdev->bus->self;
1337         while (parent != tmp) {
1338                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1339                         parent->devfn);
1340                 if (!ret)
1341                         return ret;
1342                 parent = parent->bus->self;
1343         }
1344         if (tmp->is_pcie)
1345                 return device_context_mapped(domain->iommu,
1346                         tmp->subordinate->number, 0);
1347         else
1348                 return device_context_mapped(domain->iommu,
1349                         tmp->bus->number, tmp->devfn);
1350 }
1351
1352 static int
1353 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1354                         u64 hpa, size_t size, int prot)
1355 {
1356         u64 start_pfn, end_pfn;
1357         struct dma_pte *pte;
1358         int index;
1359
1360         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1361                 return -EINVAL;
1362         iova &= PAGE_MASK_4K;
1363         start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1364         end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1365         index = 0;
1366         while (start_pfn < end_pfn) {
1367                 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1368                 if (!pte)
1369                         return -ENOMEM;
1370                 /* We don't need lock here, nobody else
1371                  * touches the iova range
1372                  */
1373                 BUG_ON(dma_pte_addr(*pte));
1374                 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1375                 dma_set_pte_prot(*pte, prot);
1376                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1377                 start_pfn++;
1378                 index++;
1379         }
1380         return 0;
1381 }
1382
1383 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1384 {
1385         clear_context_table(domain->iommu, bus, devfn);
1386         iommu_flush_context_global(domain->iommu, 0);
1387         iommu_flush_iotlb_global(domain->iommu, 0);
1388 }
1389
1390 static void domain_remove_dev_info(struct dmar_domain *domain)
1391 {
1392         struct device_domain_info *info;
1393         unsigned long flags;
1394
1395         spin_lock_irqsave(&device_domain_lock, flags);
1396         while (!list_empty(&domain->devices)) {
1397                 info = list_entry(domain->devices.next,
1398                         struct device_domain_info, link);
1399                 list_del(&info->link);
1400                 list_del(&info->global);
1401                 if (info->dev)
1402                         info->dev->dev.archdata.iommu = NULL;
1403                 spin_unlock_irqrestore(&device_domain_lock, flags);
1404
1405                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1406                 free_devinfo_mem(info);
1407
1408                 spin_lock_irqsave(&device_domain_lock, flags);
1409         }
1410         spin_unlock_irqrestore(&device_domain_lock, flags);
1411 }
1412
1413 /*
1414  * find_domain
1415  * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1416  */
1417 struct dmar_domain *
1418 find_domain(struct pci_dev *pdev)
1419 {
1420         struct device_domain_info *info;
1421
1422         /* No lock here, assumes no domain exit in normal case */
1423         info = pdev->dev.archdata.iommu;
1424         if (info)
1425                 return info->domain;
1426         return NULL;
1427 }
1428
1429 static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
1430      struct pci_dev *dev)
1431 {
1432         int index;
1433
1434         while (dev) {
1435                 for (index = 0; index < cnt; index++)
1436                         if (dev == devices[index])
1437                                 return 1;
1438
1439                 /* Check our parent */
1440                 dev = dev->bus->self;
1441         }
1442
1443         return 0;
1444 }
1445
1446 static struct dmar_drhd_unit *
1447 dmar_find_matched_drhd_unit(struct pci_dev *dev)
1448 {
1449         struct dmar_drhd_unit *drhd = NULL;
1450
1451         list_for_each_entry(drhd, &dmar_drhd_units, list) {
1452                 if (drhd->include_all || dmar_pci_device_match(drhd->devices,
1453                                                 drhd->devices_cnt, dev))
1454                         return drhd;
1455         }
1456
1457         return NULL;
1458 }
1459
1460 /* domain is initialized */
1461 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1462 {
1463         struct dmar_domain *domain, *found = NULL;
1464         struct intel_iommu *iommu;
1465         struct dmar_drhd_unit *drhd;
1466         struct device_domain_info *info, *tmp;
1467         struct pci_dev *dev_tmp;
1468         unsigned long flags;
1469         int bus = 0, devfn = 0;
1470
1471         domain = find_domain(pdev);
1472         if (domain)
1473                 return domain;
1474
1475         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1476         if (dev_tmp) {
1477                 if (dev_tmp->is_pcie) {
1478                         bus = dev_tmp->subordinate->number;
1479                         devfn = 0;
1480                 } else {
1481                         bus = dev_tmp->bus->number;
1482                         devfn = dev_tmp->devfn;
1483                 }
1484                 spin_lock_irqsave(&device_domain_lock, flags);
1485                 list_for_each_entry(info, &device_domain_list, global) {
1486                         if (info->bus == bus && info->devfn == devfn) {
1487                                 found = info->domain;
1488                                 break;
1489                         }
1490                 }
1491                 spin_unlock_irqrestore(&device_domain_lock, flags);
1492                 /* pcie-pci bridge already has a domain, uses it */
1493                 if (found) {
1494                         domain = found;
1495                         goto found_domain;
1496                 }
1497         }
1498
1499         /* Allocate new domain for the device */
1500         drhd = dmar_find_matched_drhd_unit(pdev);
1501         if (!drhd) {
1502                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1503                         pci_name(pdev));
1504                 return NULL;
1505         }
1506         iommu = drhd->iommu;
1507
1508         domain = iommu_alloc_domain(iommu);
1509         if (!domain)
1510                 goto error;
1511
1512         if (domain_init(domain, gaw)) {
1513                 domain_exit(domain);
1514                 goto error;
1515         }
1516
1517         /* register pcie-to-pci device */
1518         if (dev_tmp) {
1519                 info = alloc_devinfo_mem();
1520                 if (!info) {
1521                         domain_exit(domain);
1522                         goto error;
1523                 }
1524                 info->bus = bus;
1525                 info->devfn = devfn;
1526                 info->dev = NULL;
1527                 info->domain = domain;
1528                 /* This domain is shared by devices under p2p bridge */
1529                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1530
1531                 /* pcie-to-pci bridge already has a domain, uses it */
1532                 found = NULL;
1533                 spin_lock_irqsave(&device_domain_lock, flags);
1534                 list_for_each_entry(tmp, &device_domain_list, global) {
1535                         if (tmp->bus == bus && tmp->devfn == devfn) {
1536                                 found = tmp->domain;
1537                                 break;
1538                         }
1539                 }
1540                 if (found) {
1541                         free_devinfo_mem(info);
1542                         domain_exit(domain);
1543                         domain = found;
1544                 } else {
1545                         list_add(&info->link, &domain->devices);
1546                         list_add(&info->global, &device_domain_list);
1547                 }
1548                 spin_unlock_irqrestore(&device_domain_lock, flags);
1549         }
1550
1551 found_domain:
1552         info = alloc_devinfo_mem();
1553         if (!info)
1554                 goto error;
1555         info->bus = pdev->bus->number;
1556         info->devfn = pdev->devfn;
1557         info->dev = pdev;
1558         info->domain = domain;
1559         spin_lock_irqsave(&device_domain_lock, flags);
1560         /* somebody is fast */
1561         found = find_domain(pdev);
1562         if (found != NULL) {
1563                 spin_unlock_irqrestore(&device_domain_lock, flags);
1564                 if (found != domain) {
1565                         domain_exit(domain);
1566                         domain = found;
1567                 }
1568                 free_devinfo_mem(info);
1569                 return domain;
1570         }
1571         list_add(&info->link, &domain->devices);
1572         list_add(&info->global, &device_domain_list);
1573         pdev->dev.archdata.iommu = info;
1574         spin_unlock_irqrestore(&device_domain_lock, flags);
1575         return domain;
1576 error:
1577         /* recheck it here, maybe others set it */
1578         return find_domain(pdev);
1579 }
1580
1581 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1582 {
1583         struct dmar_domain *domain;
1584         unsigned long size;
1585         u64 base;
1586         int ret;
1587
1588         printk(KERN_INFO
1589                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1590                 pci_name(pdev), start, end);
1591         /* page table init */
1592         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1593         if (!domain)
1594                 return -ENOMEM;
1595
1596         /* The address might not be aligned */
1597         base = start & PAGE_MASK_4K;
1598         size = end - base;
1599         size = PAGE_ALIGN_4K(size);
1600         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1601                         IOVA_PFN(base + size) - 1)) {
1602                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1603                 ret = -ENOMEM;
1604                 goto error;
1605         }
1606
1607         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1608                 size, base, pci_name(pdev));
1609         /*
1610          * RMRR range might have overlap with physical memory range,
1611          * clear it first
1612          */
1613         dma_pte_clear_range(domain, base, base + size);
1614
1615         ret = domain_page_mapping(domain, base, base, size,
1616                 DMA_PTE_READ|DMA_PTE_WRITE);
1617         if (ret)
1618                 goto error;
1619
1620         /* context entry init */
1621         ret = domain_context_mapping(domain, pdev);
1622         if (!ret)
1623                 return 0;
1624 error:
1625         domain_exit(domain);
1626         return ret;
1627
1628 }
1629
1630 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1631         struct pci_dev *pdev)
1632 {
1633         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1634                 return 0;
1635         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1636                 rmrr->end_address + 1);
1637 }
1638
1639 #ifdef CONFIG_DMAR_GFX_WA
1640 struct iommu_prepare_data {
1641         struct pci_dev *pdev;
1642         int ret;
1643 };
1644
1645 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1646                                          unsigned long end_pfn, void *datax)
1647 {
1648         struct iommu_prepare_data *data;
1649
1650         data = (struct iommu_prepare_data *)datax;
1651
1652         data->ret = iommu_prepare_identity_map(data->pdev,
1653                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1654         return data->ret;
1655
1656 }
1657
1658 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1659 {
1660         int nid;
1661         struct iommu_prepare_data data;
1662
1663         data.pdev = pdev;
1664         data.ret = 0;
1665
1666         for_each_online_node(nid) {
1667                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1668                 if (data.ret)
1669                         return data.ret;
1670         }
1671         return data.ret;
1672 }
1673
1674 static void __init iommu_prepare_gfx_mapping(void)
1675 {
1676         struct pci_dev *pdev = NULL;
1677         int ret;
1678
1679         for_each_pci_dev(pdev) {
1680                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1681                                 !IS_GFX_DEVICE(pdev))
1682                         continue;
1683                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1684                         pci_name(pdev));
1685                 ret = iommu_prepare_with_active_regions(pdev);
1686                 if (ret)
1687                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1688         }
1689 }
1690 #endif
1691
#ifdef CONFIG_DMAR_FLOPPY_WA
/*
 * iommu_prepare_isa - floppy workaround: identity-map the low 16M
 *
 * Looks up the first PCI device of class ISA bridge and sets up a 0-16M
 * identity map for it.  A failure is logged but not propagated.  Nothing
 * is done when no ISA bridge is present.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);

	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
			"floppy might not work\n");

	/* pci_get_class() returned pdev with a reference held; drop it */
	pci_dev_put(pdev);
}
#else
/* Workaround disabled: nothing to prepare. */
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_DMAR_FLOPPY_WA */
1716
1717 int __init init_dmars(void)
1718 {
1719         struct dmar_drhd_unit *drhd;
1720         struct dmar_rmrr_unit *rmrr;
1721         struct pci_dev *pdev;
1722         struct intel_iommu *iommu;
1723         int i, ret, unit = 0;
1724
1725         /*
1726          * for each drhd
1727          *    allocate root
1728          *    initialize and program root entry to not present
1729          * endfor
1730          */
1731         for_each_drhd_unit(drhd) {
1732                 if (drhd->ignored)
1733                         continue;
1734                 g_num_of_iommus++;
1735                 /*
1736                  * lock not needed as this is only incremented in the single
1737                  * threaded kernel __init code path all other access are read
1738                  * only
1739                  */
1740         }
1741
1742         g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
1743         if (!g_iommus) {
1744                 ret = -ENOMEM;
1745                 goto error;
1746         }
1747
1748         deferred_flush = kzalloc(g_num_of_iommus *
1749                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1750         if (!deferred_flush) {
1751                 kfree(g_iommus);
1752                 ret = -ENOMEM;
1753                 goto error;
1754         }
1755
1756         i = 0;
1757         for_each_drhd_unit(drhd) {
1758                 if (drhd->ignored)
1759                         continue;
1760                 iommu = alloc_iommu(&g_iommus[i], drhd);
1761                 i++;
1762                 if (!iommu) {
1763                         ret = -ENOMEM;
1764                         goto error;
1765                 }
1766
1767                 /*
1768                  * TBD:
1769                  * we could share the same root & context tables
1770                  * amoung all IOMMU's. Need to Split it later.
1771                  */
1772                 ret = iommu_alloc_root_entry(iommu);
1773                 if (ret) {
1774                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1775                         goto error;
1776                 }
1777         }
1778
1779         /*
1780          * For each rmrr
1781          *   for each dev attached to rmrr
1782          *   do
1783          *     locate drhd for dev, alloc domain for dev
1784          *     allocate free domain
1785          *     allocate page table entries for rmrr
1786          *     if context not allocated for bus
1787          *           allocate and init context
1788          *           set present in root table for this bus
1789          *     init context with domain, translation etc
1790          *    endfor
1791          * endfor
1792          */
1793         for_each_rmrr_units(rmrr) {
1794                 for (i = 0; i < rmrr->devices_cnt; i++) {
1795                         pdev = rmrr->devices[i];
1796                         /* some BIOS lists non-exist devices in DMAR table */
1797                         if (!pdev)
1798                                 continue;
1799                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1800                         if (ret)
1801                                 printk(KERN_ERR
1802                                  "IOMMU: mapping reserved region failed\n");
1803                 }
1804         }
1805
1806         iommu_prepare_gfx_mapping();
1807
1808         iommu_prepare_isa();
1809
1810         /*
1811          * for each drhd
1812          *   enable fault log
1813          *   global invalidate context cache
1814          *   global invalidate iotlb
1815          *   enable translation
1816          */
1817         for_each_drhd_unit(drhd) {
1818                 if (drhd->ignored)
1819                         continue;
1820                 iommu = drhd->iommu;
1821                 sprintf (iommu->name, "dmar%d", unit++);
1822
1823                 iommu_flush_write_buffer(iommu);
1824
1825                 ret = dmar_set_interrupt(iommu);
1826                 if (ret)
1827                         goto error;
1828
1829                 iommu_set_root_entry(iommu);
1830
1831                 iommu_flush_context_global(iommu, 0);
1832                 iommu_flush_iotlb_global(iommu, 0);
1833
1834                 iommu_disable_protect_mem_regions(iommu);
1835
1836                 ret = iommu_enable_translation(iommu);
1837                 if (ret)
1838                         goto error;
1839         }
1840
1841         return 0;
1842 error:
1843         for_each_drhd_unit(drhd) {
1844                 if (drhd->ignored)
1845                         continue;
1846                 iommu = drhd->iommu;
1847                 free_iommu(iommu);
1848         }
1849         kfree(g_iommus);
1850         return ret;
1851 }
1852
1853 static inline u64 aligned_size(u64 host_addr, size_t size)
1854 {
1855         u64 addr;
1856         addr = (host_addr & (~PAGE_MASK_4K)) + size;
1857         return PAGE_ALIGN_4K(addr);
1858 }
1859
1860 struct iova *
1861 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1862 {
1863         struct iova *piova;
1864
1865         /* Make sure it's in range */
1866         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1867         if (!size || (IOVA_START_ADDR + size > end))
1868                 return NULL;
1869
1870         piova = alloc_iova(&domain->iovad,
1871                         size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1872         return piova;
1873 }
1874
1875 static struct iova *
1876 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1877                 size_t size)
1878 {
1879         struct pci_dev *pdev = to_pci_dev(dev);
1880         struct iova *iova = NULL;
1881
1882         if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1883                 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1884         } else  {
1885                 /*
1886                  * First try to allocate an io virtual address in
1887                  * DMA_32BIT_MASK and if that fails then try allocating
1888                  * from higher range
1889                  */
1890                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1891                 if (!iova)
1892                         iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1893         }
1894
1895         if (!iova) {
1896                 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
1897                 return NULL;
1898         }
1899
1900         return iova;
1901 }
1902
1903 static struct dmar_domain *
1904 get_valid_domain_for_dev(struct pci_dev *pdev)
1905 {
1906         struct dmar_domain *domain;
1907         int ret;
1908
1909         domain = get_domain_for_dev(pdev,
1910                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1911         if (!domain) {
1912                 printk(KERN_ERR
1913                         "Allocating domain for %s failed", pci_name(pdev));
1914                 return NULL;
1915         }
1916
1917         /* make sure context mapping is ok */
1918         if (unlikely(!domain_context_mapped(domain, pdev))) {
1919                 ret = domain_context_mapping(domain, pdev);
1920                 if (ret) {
1921                         printk(KERN_ERR
1922                                 "Domain context map for %s failed",
1923                                 pci_name(pdev));
1924                         return NULL;
1925                 }
1926         }
1927
1928         return domain;
1929 }
1930
1931 static dma_addr_t
1932 intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
1933 {
1934         struct pci_dev *pdev = to_pci_dev(hwdev);
1935         struct dmar_domain *domain;
1936         unsigned long start_paddr;
1937         struct iova *iova;
1938         int prot = 0;
1939         int ret;
1940
1941         BUG_ON(dir == DMA_NONE);
1942         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1943                 return paddr;
1944
1945         domain = get_valid_domain_for_dev(pdev);
1946         if (!domain)
1947                 return 0;
1948
1949         size = aligned_size((u64)paddr, size);
1950
1951         iova = __intel_alloc_iova(hwdev, domain, size);
1952         if (!iova)
1953                 goto error;
1954
1955         start_paddr = iova->pfn_lo << PAGE_SHIFT_4K;
1956
1957         /*
1958          * Check if DMAR supports zero-length reads on write only
1959          * mappings..
1960          */
1961         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
1962                         !cap_zlr(domain->iommu->cap))
1963                 prot |= DMA_PTE_READ;
1964         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1965                 prot |= DMA_PTE_WRITE;
1966         /*
1967          * paddr - (paddr + size) might be partial page, we should map the whole
1968          * page.  Note: if two part of one page are separately mapped, we
1969          * might have two guest_addr mapping to the same host paddr, but this
1970          * is not a big problem
1971          */
1972         ret = domain_page_mapping(domain, start_paddr,
1973                 ((u64)paddr) & PAGE_MASK_4K, size, prot);
1974         if (ret)
1975                 goto error;
1976
1977         pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1978                 pci_name(pdev), size, (u64)paddr,
1979                 size, (u64)start_paddr, dir);
1980
1981         /* it's a non-present to present mapping */
1982         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1983                         start_paddr, size >> PAGE_SHIFT_4K, 1);
1984         if (ret)
1985                 iommu_flush_write_buffer(domain->iommu);
1986
1987         return (start_paddr + ((u64)paddr & (~PAGE_MASK_4K)));
1988
1989 error:
1990         if (iova)
1991                 __free_iova(&domain->iovad, iova);
1992         printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
1993                 pci_name(pdev), size, (u64)paddr, dir);
1994         return 0;
1995 }
1996
1997 static void flush_unmaps(void)
1998 {
1999         int i, j;
2000
2001         timer_on = 0;
2002
2003         /* just flush them all */
2004         for (i = 0; i < g_num_of_iommus; i++) {
2005                 if (deferred_flush[i].next) {
2006                         iommu_flush_iotlb_global(&g_iommus[i], 0);
2007                         for (j = 0; j < deferred_flush[i].next; j++) {
2008                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2009                                                 deferred_flush[i].iova[j]);
2010                         }
2011                         deferred_flush[i].next = 0;
2012                 }
2013         }
2014
2015         list_size = 0;
2016 }
2017
2018 static void flush_unmaps_timeout(unsigned long data)
2019 {
2020         unsigned long flags;
2021
2022         spin_lock_irqsave(&async_umap_flush_lock, flags);
2023         flush_unmaps();
2024         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2025 }
2026
2027 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2028 {
2029         unsigned long flags;
2030         int next, iommu_id;
2031
2032         spin_lock_irqsave(&async_umap_flush_lock, flags);
2033         if (list_size == HIGH_WATER_MARK)
2034                 flush_unmaps();
2035
2036         iommu_id = dom->iommu - g_iommus;
2037         next = deferred_flush[iommu_id].next;
2038         deferred_flush[iommu_id].domain[next] = dom;
2039         deferred_flush[iommu_id].iova[next] = iova;
2040         deferred_flush[iommu_id].next++;
2041
2042         if (!timer_on) {
2043                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2044                 timer_on = 1;
2045         }
2046         list_size++;
2047         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2048 }
2049
2050 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
2051         size_t size, int dir)
2052 {
2053         struct pci_dev *pdev = to_pci_dev(dev);
2054         struct dmar_domain *domain;
2055         unsigned long start_addr;
2056         struct iova *iova;
2057
2058         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2059                 return;
2060         domain = find_domain(pdev);
2061         BUG_ON(!domain);
2062
2063         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2064         if (!iova)
2065                 return;
2066
2067         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2068         size = aligned_size((u64)dev_addr, size);
2069
2070         pr_debug("Device %s unmapping: %lx@%llx\n",
2071                 pci_name(pdev), size, (u64)start_addr);
2072
2073         /*  clear the whole page */
2074         dma_pte_clear_range(domain, start_addr, start_addr + size);
2075         /* free page tables */
2076         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2077         if (intel_iommu_strict) {
2078                 if (iommu_flush_iotlb_psi(domain->iommu,
2079                         domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
2080                         iommu_flush_write_buffer(domain->iommu);
2081                 /* free iova */
2082                 __free_iova(&domain->iovad, iova);
2083         } else {
2084                 add_unmap(domain, iova);
2085                 /*
2086                  * queue up the release of the unmap to save the 1/6th of the
2087                  * cpu used up by the iotlb flush operation...
2088                  */
2089         }
2090 }
2091
2092 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
2093                        dma_addr_t *dma_handle, gfp_t flags)
2094 {
2095         void *vaddr;
2096         int order;
2097
2098         size = PAGE_ALIGN_4K(size);
2099         order = get_order(size);
2100         flags &= ~(GFP_DMA | GFP_DMA32);
2101
2102         vaddr = (void *)__get_free_pages(flags, order);
2103         if (!vaddr)
2104                 return NULL;
2105         memset(vaddr, 0, size);
2106
2107         *dma_handle = intel_map_single(hwdev, virt_to_bus(vaddr), size, DMA_BIDIRECTIONAL);
2108         if (*dma_handle)
2109                 return vaddr;
2110         free_pages((unsigned long)vaddr, order);
2111         return NULL;
2112 }
2113
2114 static void intel_free_coherent(struct device *hwdev, size_t size,
2115         void *vaddr, dma_addr_t dma_handle)
2116 {
2117         int order;
2118
2119         size = PAGE_ALIGN_4K(size);
2120         order = get_order(size);
2121
2122         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2123         free_pages((unsigned long)vaddr, order);
2124 }
2125
2126 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2127 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2128         int nelems, int dir)
2129 {
2130         int i;
2131         struct pci_dev *pdev = to_pci_dev(hwdev);
2132         struct dmar_domain *domain;
2133         unsigned long start_addr;
2134         struct iova *iova;
2135         size_t size = 0;
2136         void *addr;
2137         struct scatterlist *sg;
2138
2139         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2140                 return;
2141
2142         domain = find_domain(pdev);
2143
2144         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2145         if (!iova)
2146                 return;
2147         for_each_sg(sglist, sg, nelems, i) {
2148                 addr = SG_ENT_VIRT_ADDRESS(sg);
2149                 size += aligned_size((u64)addr, sg->length);
2150         }
2151
2152         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2153
2154         /*  clear the whole page */
2155         dma_pte_clear_range(domain, start_addr, start_addr + size);
2156         /* free page tables */
2157         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2158
2159         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2160                         size >> PAGE_SHIFT_4K, 0))
2161                 iommu_flush_write_buffer(domain->iommu);
2162
2163         /* free iova */
2164         __free_iova(&domain->iovad, iova);
2165 }
2166
2167 static int intel_nontranslate_map_sg(struct device *hddev,
2168         struct scatterlist *sglist, int nelems, int dir)
2169 {
2170         int i;
2171         struct scatterlist *sg;
2172
2173         for_each_sg(sglist, sg, nelems, i) {
2174                 BUG_ON(!sg_page(sg));
2175                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2176                 sg->dma_length = sg->length;
2177         }
2178         return nelems;
2179 }
2180
2181 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2182                                 int nelems, int dir)
2183 {
2184         void *addr;
2185         int i;
2186         struct pci_dev *pdev = to_pci_dev(hwdev);
2187         struct dmar_domain *domain;
2188         size_t size = 0;
2189         int prot = 0;
2190         size_t offset = 0;
2191         struct iova *iova = NULL;
2192         int ret;
2193         struct scatterlist *sg;
2194         unsigned long start_addr;
2195
2196         BUG_ON(dir == DMA_NONE);
2197         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2198                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2199
2200         domain = get_valid_domain_for_dev(pdev);
2201         if (!domain)
2202                 return 0;
2203
2204         for_each_sg(sglist, sg, nelems, i) {
2205                 addr = SG_ENT_VIRT_ADDRESS(sg);
2206                 addr = (void *)virt_to_phys(addr);
2207                 size += aligned_size((u64)addr, sg->length);
2208         }
2209
2210         iova = __intel_alloc_iova(hwdev, domain, size);
2211         if (!iova) {
2212                 sglist->dma_length = 0;
2213                 return 0;
2214         }
2215
2216         /*
2217          * Check if DMAR supports zero-length reads on write only
2218          * mappings..
2219          */
2220         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2221                         !cap_zlr(domain->iommu->cap))
2222                 prot |= DMA_PTE_READ;
2223         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2224                 prot |= DMA_PTE_WRITE;
2225
2226         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2227         offset = 0;
2228         for_each_sg(sglist, sg, nelems, i) {
2229                 addr = SG_ENT_VIRT_ADDRESS(sg);
2230                 addr = (void *)virt_to_phys(addr);
2231                 size = aligned_size((u64)addr, sg->length);
2232                 ret = domain_page_mapping(domain, start_addr + offset,
2233                         ((u64)addr) & PAGE_MASK_4K,
2234                         size, prot);
2235                 if (ret) {
2236                         /*  clear the page */
2237                         dma_pte_clear_range(domain, start_addr,
2238                                   start_addr + offset);
2239                         /* free page tables */
2240                         dma_pte_free_pagetable(domain, start_addr,
2241                                   start_addr + offset);
2242                         /* free iova */
2243                         __free_iova(&domain->iovad, iova);
2244                         return 0;
2245                 }
2246                 sg->dma_address = start_addr + offset +
2247                                 ((u64)addr & (~PAGE_MASK_4K));
2248                 sg->dma_length = sg->length;
2249                 offset += size;
2250         }
2251
2252         /* it's a non-present to present mapping */
2253         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2254                         start_addr, offset >> PAGE_SHIFT_4K, 1))
2255                 iommu_flush_write_buffer(domain->iommu);
2256         return nelems;
2257 }
2258
2259 static struct dma_mapping_ops intel_dma_ops = {
2260         .alloc_coherent = intel_alloc_coherent,
2261         .free_coherent = intel_free_coherent,
2262         .map_single = intel_map_single,
2263         .unmap_single = intel_unmap_single,
2264         .map_sg = intel_map_sg,
2265         .unmap_sg = intel_unmap_sg,
2266 };
2267
2268 static inline int iommu_domain_cache_init(void)
2269 {
2270         int ret = 0;
2271
2272         iommu_domain_cache = kmem_cache_create("iommu_domain",
2273                                          sizeof(struct dmar_domain),
2274                                          0,
2275                                          SLAB_HWCACHE_ALIGN,
2276
2277                                          NULL);
2278         if (!iommu_domain_cache) {
2279                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2280                 ret = -ENOMEM;
2281         }
2282
2283         return ret;
2284 }
2285
2286 static inline int iommu_devinfo_cache_init(void)
2287 {
2288         int ret = 0;
2289
2290         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2291                                          sizeof(struct device_domain_info),
2292                                          0,
2293                                          SLAB_HWCACHE_ALIGN,
2294
2295                                          NULL);
2296         if (!iommu_devinfo_cache) {
2297                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2298                 ret = -ENOMEM;
2299         }
2300
2301         return ret;
2302 }
2303
2304 static inline int iommu_iova_cache_init(void)
2305 {
2306         int ret = 0;
2307
2308         iommu_iova_cache = kmem_cache_create("iommu_iova",
2309                                          sizeof(struct iova),
2310                                          0,
2311                                          SLAB_HWCACHE_ALIGN,
2312
2313                                          NULL);
2314         if (!iommu_iova_cache) {
2315                 printk(KERN_ERR "Couldn't create iova cache\n");
2316                 ret = -ENOMEM;
2317         }
2318
2319         return ret;
2320 }
2321
2322 static int __init iommu_init_mempool(void)
2323 {
2324         int ret;
2325         ret = iommu_iova_cache_init();
2326         if (ret)
2327                 return ret;
2328
2329         ret = iommu_domain_cache_init();
2330         if (ret)
2331                 goto domain_error;
2332
2333         ret = iommu_devinfo_cache_init();
2334         if (!ret)
2335                 return ret;
2336
2337         kmem_cache_destroy(iommu_domain_cache);
2338 domain_error:
2339         kmem_cache_destroy(iommu_iova_cache);
2340
2341         return -ENOMEM;
2342 }
2343
2344 static void __init iommu_exit_mempool(void)
2345 {
2346         kmem_cache_destroy(iommu_devinfo_cache);
2347         kmem_cache_destroy(iommu_domain_cache);
2348         kmem_cache_destroy(iommu_iova_cache);
2349
2350 }
2351
2352 void __init detect_intel_iommu(void)
2353 {
2354         if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2355                 return;
2356         if (early_dmar_detect()) {
2357                 iommu_detected = 1;
2358         }
2359 }
2360
2361 static void __init init_no_remapping_devices(void)
2362 {
2363         struct dmar_drhd_unit *drhd;
2364
2365         for_each_drhd_unit(drhd) {
2366                 if (!drhd->include_all) {
2367                         int i;
2368                         for (i = 0; i < drhd->devices_cnt; i++)
2369                                 if (drhd->devices[i] != NULL)
2370                                         break;
2371                         /* ignore DMAR unit if no pci devices exist */
2372                         if (i == drhd->devices_cnt)
2373                                 drhd->ignored = 1;
2374                 }
2375         }
2376
2377         if (dmar_map_gfx)
2378                 return;
2379
2380         for_each_drhd_unit(drhd) {
2381                 int i;
2382                 if (drhd->ignored || drhd->include_all)
2383                         continue;
2384
2385                 for (i = 0; i < drhd->devices_cnt; i++)
2386                         if (drhd->devices[i] &&
2387                                 !IS_GFX_DEVICE(drhd->devices[i]))
2388                                 break;
2389
2390                 if (i < drhd->devices_cnt)
2391                         continue;
2392
2393                 /* bypass IOMMU if it is just for gfx devices */
2394                 drhd->ignored = 1;
2395                 for (i = 0; i < drhd->devices_cnt; i++) {
2396                         if (!drhd->devices[i])
2397                                 continue;
2398                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2399                 }
2400         }
2401 }
2402
2403 int __init intel_iommu_init(void)
2404 {
2405         int ret = 0;
2406
2407         if (no_iommu || swiotlb || dmar_disabled)
2408                 return -ENODEV;
2409
2410         if (dmar_table_init())
2411                 return  -ENODEV;
2412
2413         iommu_init_mempool();
2414         dmar_init_reserved_ranges();
2415
2416         init_no_remapping_devices();
2417
2418         ret = init_dmars();
2419         if (ret) {
2420                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2421                 put_iova_domain(&reserved_iova_list);
2422                 iommu_exit_mempool();
2423                 return ret;
2424         }
2425         printk(KERN_INFO
2426         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2427
2428         init_timer(&unmap_timer);
2429         force_iommu = 1;
2430         dma_ops = &intel_dma_ops;
2431         return 0;
2432 }
2433