drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18  * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19  * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
20  */
21
22 #include <linux/init.h>
23 #include <linux/bitmap.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/sysdev.h>
28 #include <linux/spinlock.h>
29 #include <linux/pci.h>
30 #include <linux/dmar.h>
31 #include <linux/dma-mapping.h>
32 #include <linux/mempool.h>
33 #include "iova.h"
34 #include "intel-iommu.h"
35 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
36 #include <asm/cacheflush.h>
37 #include <asm/gart.h>
38 #include "pci.h"
39
40 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
41 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
42
43 #define IOAPIC_RANGE_START      (0xfee00000)
44 #define IOAPIC_RANGE_END        (0xfeefffff)
45 #define IOVA_START_ADDR         (0x1000)
46
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
48
49 #define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */
50
51 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
52
53 static void domain_remove_dev_info(struct dmar_domain *domain);
54
55 static int dmar_disabled;
56 static int __initdata dmar_map_gfx = 1;
57 static int dmar_forcedac;
58
59 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
60 static DEFINE_SPINLOCK(device_domain_lock);
61 static LIST_HEAD(device_domain_list);
62
63 static int __init intel_iommu_setup(char *str)
64 {
65         if (!str)
66                 return -EINVAL;
67         while (*str) {
68                 if (!strncmp(str, "off", 3)) {
69                         dmar_disabled = 1;
70                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
71                 } else if (!strncmp(str, "igfx_off", 8)) {
72                         dmar_map_gfx = 0;
73                         printk(KERN_INFO
74                                 "Intel-IOMMU: disable GFX device mapping\n");
75                 } else if (!strncmp(str, "forcedac", 8)) {
76                         printk (KERN_INFO
77                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
78                         dmar_forcedac = 1;
79                 }
80
81                 str += strcspn(str, ",");
82                 while (*str == ',')
83                         str++;
84         }
85         return 0;
86 }
87 __setup("intel_iommu=", intel_iommu_setup);
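/*
 * Example (illustrative): booting with "intel_iommu=igfx_off,forcedac" keeps
 * the IOMMU enabled but skips graphics-device mapping and forces DAC (64-bit)
 * DMA addressing, while "intel_iommu=off" disables the driver entirely.
 * Tokens are comma separated, as parsed above.
 */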
88
89 static struct kmem_cache *iommu_domain_cache;
90 static struct kmem_cache *iommu_devinfo_cache;
91 static struct kmem_cache *iommu_iova_cache;
92
93 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
94 {
95         unsigned int flags;
96         void *vaddr;
97
98         /* trying to avoid low memory issues */
99         flags = current->flags & PF_MEMALLOC;
100         current->flags |= PF_MEMALLOC;
101         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
102         current->flags &= (~PF_MEMALLOC | flags);
103         return vaddr;
104 }
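/*
 * Note on the PF_MEMALLOC dance above (and in alloc_pgtable_page() below):
 * "flags" captures whether the caller already had PF_MEMALLOC set.  The
 * final "current->flags &= (~PF_MEMALLOC | flags)" therefore clears the bit
 * only when it was not set on entry (flags == 0), and leaves the task flags
 * untouched when the caller was already in a memalloc context.
 */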
105
106
107 static inline void *alloc_pgtable_page(void)
108 {
109         unsigned int flags;
110         void *vaddr;
111
112         /* trying to avoid low memory issues */
113         flags = current->flags & PF_MEMALLOC;
114         current->flags |= PF_MEMALLOC;
115         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
116         current->flags &= (~PF_MEMALLOC | flags);
117         return vaddr;
118 }
119
120 static inline void free_pgtable_page(void *vaddr)
121 {
122         free_page((unsigned long)vaddr);
123 }
124
125 static inline void *alloc_domain_mem(void)
126 {
127         return iommu_kmem_cache_alloc(iommu_domain_cache);
128 }
129
130 static inline void free_domain_mem(void *vaddr)
131 {
132         kmem_cache_free(iommu_domain_cache, vaddr);
133 }
134
135 static inline void * alloc_devinfo_mem(void)
136 {
137         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
138 }
139
140 static inline void free_devinfo_mem(void *vaddr)
141 {
142         kmem_cache_free(iommu_devinfo_cache, vaddr);
143 }
144
145 struct iova *alloc_iova_mem(void)
146 {
147         return iommu_kmem_cache_alloc(iommu_iova_cache);
148 }
149
150 void free_iova_mem(struct iova *iova)
151 {
152         kmem_cache_free(iommu_iova_cache, iova);
153 }
154
155 static inline void __iommu_flush_cache(
156         struct intel_iommu *iommu, void *addr, int size)
157 {
158         if (!ecap_coherent(iommu->ecap))
159                 clflush_cache_range(addr, size);
160 }
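/*
 * Rationale: if the IOMMU is not cache coherent (ecap_coherent() == 0) it
 * fetches root/context entries and page-table entries straight from memory,
 * so every software update to those structures must be written back with
 * clflush before the hardware can be expected to see it.  On coherent
 * implementations this helper is a no-op.
 */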
161
162 /* Gets context entry for a given bus and devfn */
163 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
164                 u8 bus, u8 devfn)
165 {
166         struct root_entry *root;
167         struct context_entry *context;
168         unsigned long phy_addr;
169         unsigned long flags;
170
171         spin_lock_irqsave(&iommu->lock, flags);
172         root = &iommu->root_entry[bus];
173         context = get_context_addr_from_root(root);
174         if (!context) {
175                 context = (struct context_entry *)alloc_pgtable_page();
176                 if (!context) {
177                         spin_unlock_irqrestore(&iommu->lock, flags);
178                         return NULL;
179                 }
180                 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
181                 phy_addr = virt_to_phys((void *)context);
182                 set_root_value(root, phy_addr);
183                 set_root_present(root);
184                 __iommu_flush_cache(iommu, root, sizeof(*root));
185         }
186         spin_unlock_irqrestore(&iommu->lock, flags);
187         return &context[devfn];
188 }
189
190 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
191 {
192         struct root_entry *root;
193         struct context_entry *context;
194         int ret;
195         unsigned long flags;
196
197         spin_lock_irqsave(&iommu->lock, flags);
198         root = &iommu->root_entry[bus];
199         context = get_context_addr_from_root(root);
200         if (!context) {
201                 ret = 0;
202                 goto out;
203         }
204         ret = context_present(context[devfn]);
205 out:
206         spin_unlock_irqrestore(&iommu->lock, flags);
207         return ret;
208 }
209
210 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
211 {
212         struct root_entry *root;
213         struct context_entry *context;
214         unsigned long flags;
215
216         spin_lock_irqsave(&iommu->lock, flags);
217         root = &iommu->root_entry[bus];
218         context = get_context_addr_from_root(root);
219         if (context) {
220                 context_clear_entry(context[devfn]);
221                 __iommu_flush_cache(iommu, &context[devfn],
222                         sizeof(*context));
223         }
224         spin_unlock_irqrestore(&iommu->lock, flags);
225 }
226
227 static void free_context_table(struct intel_iommu *iommu)
228 {
229         struct root_entry *root;
230         int i;
231         unsigned long flags;
232         struct context_entry *context;
233
234         spin_lock_irqsave(&iommu->lock, flags);
235         if (!iommu->root_entry) {
236                 goto out;
237         }
238         for (i = 0; i < ROOT_ENTRY_NR; i++) {
239                 root = &iommu->root_entry[i];
240                 context = get_context_addr_from_root(root);
241                 if (context)
242                         free_pgtable_page(context);
243         }
244         free_pgtable_page(iommu->root_entry);
245         iommu->root_entry = NULL;
246 out:
247         spin_unlock_irqrestore(&iommu->lock, flags);
248 }
249
250 /* page table handling */
251 #define LEVEL_STRIDE            (9)
252 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
253
254 static inline int agaw_to_level(int agaw)
255 {
256         return agaw + 2;
257 }
258
259 static inline int agaw_to_width(int agaw)
260 {
261         return 30 + agaw * LEVEL_STRIDE;
262
263 }
264
265 static inline int width_to_agaw(int width)
266 {
267         return (width - 30) / LEVEL_STRIDE;
268 }
269
270 static inline unsigned int level_to_offset_bits(int level)
271 {
272         return (12 + (level - 1) * LEVEL_STRIDE);
273 }
274
275 static inline int address_level_offset(u64 addr, int level)
276 {
277         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
278 }
279
280 static inline u64 level_mask(int level)
281 {
282         return ((u64)-1 << level_to_offset_bits(level));
283 }
284
285 static inline u64 level_size(int level)
286 {
287         return ((u64)1 << level_to_offset_bits(level));
288 }
289
290 static inline u64 align_to_level(u64 addr, int level)
291 {
292         return ((addr + level_size(level) - 1) & level_mask(level));
293 }
294
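/*
 * Worked example of the AGAW/level arithmetic above: with the default 48-bit
 * address width, width_to_agaw(48) = (48 - 30) / 9 = 2 and agaw_to_level(2)
 * = 4, i.e. a four-level page table.  Each level decodes 9 address bits
 * (LEVEL_STRIDE): level 1 covers bits 12-20, level 2 bits 21-29, level 3
 * bits 30-38 and level 4 bits 39-47.
 */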
295 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
296 {
297         int addr_width = agaw_to_width(domain->agaw);
298         struct dma_pte *parent, *pte = NULL;
299         int level = agaw_to_level(domain->agaw);
300         int offset;
301         unsigned long flags;
302
303         BUG_ON(!domain->pgd);
304
305         addr &= (((u64)1) << addr_width) - 1;
306         parent = domain->pgd;
307
308         spin_lock_irqsave(&domain->mapping_lock, flags);
309         while (level > 0) {
310                 void *tmp_page;
311
312                 offset = address_level_offset(addr, level);
313                 pte = &parent[offset];
314                 if (level == 1)
315                         break;
316
317                 if (!dma_pte_present(*pte)) {
318                         tmp_page = alloc_pgtable_page();
319
320                         if (!tmp_page) {
321                                 spin_unlock_irqrestore(&domain->mapping_lock,
322                                         flags);
323                                 return NULL;
324                         }
325                         __iommu_flush_cache(domain->iommu, tmp_page,
326                                         PAGE_SIZE_4K);
327                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
328                         /*
329                          * upper-level entries are always set read/write; only
330                          * the last-level pte controls the actual permissions
331                          */
332                         dma_set_pte_readable(*pte);
333                         dma_set_pte_writable(*pte);
334                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
335                 }
336                 parent = phys_to_virt(dma_pte_addr(*pte));
337                 level--;
338         }
339
340         spin_unlock_irqrestore(&domain->mapping_lock, flags);
341         return pte;
342 }
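/*
 * addr_to_dma_pte() above walks (and, where necessary, builds) the page
 * table top-down: at each level it picks the 9-bit index for "addr",
 * allocates a zeroed intermediate table if the entry is not present, and
 * stops at level 1, returning a pointer to the leaf pte that the caller
 * fills in.  Intermediate entries are marked both readable and writable;
 * the effective permissions are decided by the leaf pte alone.
 */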
343
344 /* return the pte for addr at the given page-table level */
345 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
346                 int level)
347 {
348         struct dma_pte *parent, *pte = NULL;
349         int total = agaw_to_level(domain->agaw);
350         int offset;
351
352         parent = domain->pgd;
353         while (level <= total) {
354                 offset = address_level_offset(addr, total);
355                 pte = &parent[offset];
356                 if (level == total)
357                         return pte;
358
359                 if (!dma_pte_present(*pte))
360                         break;
361                 parent = phys_to_virt(dma_pte_addr(*pte));
362                 total--;
363         }
364         return NULL;
365 }
366
367 /* clear the last-level pte that maps one page */
368 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
369 {
370         struct dma_pte *pte = NULL;
371
372         /* get last level pte */
373         pte = dma_addr_level_pte(domain, addr, 1);
374
375         if (pte) {
376                 dma_clear_pte(*pte);
377                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
378         }
379 }
380
381 /* clear last-level ptes; an IOTLB flush must follow */
382 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
383 {
384         int addr_width = agaw_to_width(domain->agaw);
385
386         start &= (((u64)1) << addr_width) - 1;
387         end &= (((u64)1) << addr_width) - 1;
388         /* align in case start/end cover only a partial page */
389         start = PAGE_ALIGN_4K(start);
390         end &= PAGE_MASK_4K;
391
392         /* we don't need lock here, nobody else touches the iova range */
393         while (start < end) {
394                 dma_pte_clear_one(domain, start);
395                 start += PAGE_SIZE_4K;
396         }
397 }
398
399 /* free page table pages. last level pte should already be cleared */
400 static void dma_pte_free_pagetable(struct dmar_domain *domain,
401         u64 start, u64 end)
402 {
403         int addr_width = agaw_to_width(domain->agaw);
404         struct dma_pte *pte;
405         int total = agaw_to_level(domain->agaw);
406         int level;
407         u64 tmp;
408
409         start &= (((u64)1) << addr_width) - 1;
410         end &= (((u64)1) << addr_width) - 1;
411
412         /* we don't need lock here, nobody else touches the iova range */
413         level = 2;
414         while (level <= total) {
415                 tmp = align_to_level(start, level);
416                 if (tmp >= end || (tmp + level_size(level) > end))
417                         return;
418
419                 while (tmp < end) {
420                         pte = dma_addr_level_pte(domain, tmp, level);
421                         if (pte) {
422                                 free_pgtable_page(
423                                         phys_to_virt(dma_pte_addr(*pte)));
424                                 dma_clear_pte(*pte);
425                                 __iommu_flush_cache(domain->iommu,
426                                                 pte, sizeof(*pte));
427                         }
428                         tmp += level_size(level);
429                 }
430                 level++;
431         }
432         /* free pgd */
433         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
434                 free_pgtable_page(domain->pgd);
435                 domain->pgd = NULL;
436         }
437 }
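/*
 * dma_pte_free_pagetable() above frees intermediate tables bottom-up:
 * starting at level 2 it walks every region of level_size() bytes that is
 * fully contained in [start, end), frees the lower-level table that the
 * corresponding entry points to and clears that entry, then repeats one
 * level higher.  The top-level pgd itself is only freed when the whole
 * address space is being torn down.
 */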
438
439 /* iommu handling */
440 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
441 {
442         struct root_entry *root;
443         unsigned long flags;
444
445         root = (struct root_entry *)alloc_pgtable_page();
446         if (!root)
447                 return -ENOMEM;
448
449         __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
450
451         spin_lock_irqsave(&iommu->lock, flags);
452         iommu->root_entry = root;
453         spin_unlock_irqrestore(&iommu->lock, flags);
454
455         return 0;
456 }
457
458 #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
459 {\
460         unsigned long start_time = jiffies;\
461         while (1) {\
462                 sts = op (iommu->reg + offset);\
463                 if (cond)\
464                         break;\
465                 if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
466                         panic("DMAR hardware is malfunctioning\n");\
467                 cpu_relax();\
468         }\
469 }
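/*
 * IOMMU_WAIT_OP() spins on a status register until "cond" becomes true:
 * invalidation and global-command writes are only requests, and the hardware
 * reports completion through a status bit.  A typical use is
 *
 *      IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_RTPS), sts);
 *
 * which waits for the "root table pointer set" acknowledgement.  If the bit
 * never flips within DMAR_OPERATION_TIMEOUT (one minute) the hardware is
 * assumed broken and the kernel panics.
 */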
470
471 static void iommu_set_root_entry(struct intel_iommu *iommu)
472 {
473         void *addr;
474         u32 cmd, sts;
475         unsigned long flag;
476
477         addr = iommu->root_entry;
478
479         spin_lock_irqsave(&iommu->register_lock, flag);
480         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
481
482         cmd = iommu->gcmd | DMA_GCMD_SRTP;
483         writel(cmd, iommu->reg + DMAR_GCMD_REG);
484
485         /* Make sure hardware completes it */
486         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
487                 readl, (sts & DMA_GSTS_RTPS), sts);
488
489         spin_unlock_irqrestore(&iommu->register_lock, flag);
490 }
491
492 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
493 {
494         u32 val;
495         unsigned long flag;
496
497         if (!cap_rwbf(iommu->cap))
498                 return;
499         val = iommu->gcmd | DMA_GCMD_WBF;
500
501         spin_lock_irqsave(&iommu->register_lock, flag);
502         writel(val, iommu->reg + DMAR_GCMD_REG);
503
504         /* Make sure hardware completes it */
505         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
506                         readl, (!(val & DMA_GSTS_WBFS)), val);
507
508         spin_unlock_irqrestore(&iommu->register_lock, flag);
509 }
510
511 /* return value determines whether we need a write buffer flush */
512 static int __iommu_flush_context(struct intel_iommu *iommu,
513         u16 did, u16 source_id, u8 function_mask, u64 type,
514         int non_present_entry_flush)
515 {
516         u64 val = 0;
517         unsigned long flag;
518
519         /*
520          * In the non-present entry flush case: if the hardware doesn't
521          * cache non-present entries there is nothing to do; if it does,
522          * we flush the entries of domain 0 (the domain id used to tag
523          * any cached non-present entries).
524          */
525         if (non_present_entry_flush) {
526                 if (!cap_caching_mode(iommu->cap))
527                         return 1;
528                 else
529                         did = 0;
530         }
531
532         switch (type) {
533         case DMA_CCMD_GLOBAL_INVL:
534                 val = DMA_CCMD_GLOBAL_INVL;
535                 break;
536         case DMA_CCMD_DOMAIN_INVL:
537                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
538                 break;
539         case DMA_CCMD_DEVICE_INVL:
540                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
541                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
542                 break;
543         default:
544                 BUG();
545         }
546         val |= DMA_CCMD_ICC;
547
548         spin_lock_irqsave(&iommu->register_lock, flag);
549         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
550
551         /* Make sure hardware completes it */
552         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
553                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
554
555         spin_unlock_irqrestore(&iommu->register_lock, flag);
556
557         /* flushing the context entry implicitly flushes the write buffer */
558         return 0;
559 }
560
561 static inline int iommu_flush_context_global(struct intel_iommu *iommu,
562         int non_present_entry_flush)
563 {
564         return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
565                 non_present_entry_flush);
566 }
567
568 static inline int iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
569         int non_present_entry_flush)
570 {
571         return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
572                 non_present_entry_flush);
573 }
574
575 static inline int iommu_flush_context_device(struct intel_iommu *iommu,
576         u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
577 {
578         return __iommu_flush_context(iommu, did, source_id, function_mask,
579                 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
580 }
581
582 /* return value determines whether we need a write buffer flush */
583 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
584         u64 addr, unsigned int size_order, u64 type,
585         int non_present_entry_flush)
586 {
587         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
588         u64 val = 0, val_iva = 0;
589         unsigned long flag;
590
591         /*
592          * In the non-present entry flush case: if the hardware doesn't
593          * cache non-present entries there is nothing to do; if it does,
594          * we flush the entries of domain 0 (the domain id used to tag
595          * any cached non-present entries).
596          */
597         if (non_present_entry_flush) {
598                 if (!cap_caching_mode(iommu->cap))
599                         return 1;
600                 else
601                         did = 0;
602         }
603
604         switch (type) {
605         case DMA_TLB_GLOBAL_FLUSH:
606                 /* a global flush doesn't need to set IVA_REG */
607                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
608                 break;
609         case DMA_TLB_DSI_FLUSH:
610                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
611                 break;
612         case DMA_TLB_PSI_FLUSH:
613                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
614                 /* Note: always flush non-leaf currently */
615                 val_iva = size_order | addr;
616                 break;
617         default:
618                 BUG();
619         }
620         /* Note: set drain read/write */
621 #if 0
622         /*
623          * This is probably only there to be extra safe; it looks like
624          * we can ignore it without any impact.
625          */
626         if (cap_read_drain(iommu->cap))
627                 val |= DMA_TLB_READ_DRAIN;
628 #endif
629         if (cap_write_drain(iommu->cap))
630                 val |= DMA_TLB_WRITE_DRAIN;
631
632         spin_lock_irqsave(&iommu->register_lock, flag);
633         /* Note: Only uses first TLB reg currently */
634         if (val_iva)
635                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
636         dmar_writeq(iommu->reg + tlb_offset + 8, val);
637
638         /* Make sure hardware completes it */
639         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
640                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
641
642         spin_unlock_irqrestore(&iommu->register_lock, flag);
643
644         /* check IOTLB invalidation granularity */
645         if (DMA_TLB_IAIG(val) == 0)
646                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
647         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
648                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
649                         DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
650         /* flushing the IOTLB implicitly flushes the write buffer */
651         return 0;
652 }
653
654 static inline int iommu_flush_iotlb_global(struct intel_iommu *iommu,
655         int non_present_entry_flush)
656 {
657         return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
658                 non_present_entry_flush);
659 }
660
661 static inline int iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
662         int non_present_entry_flush)
663 {
664         return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
665                 non_present_entry_flush);
666 }
667
668 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
669         u64 addr, unsigned int pages, int non_present_entry_flush)
670 {
671         unsigned int mask;
672
673         BUG_ON(addr & (~PAGE_MASK_4K));
674         BUG_ON(pages == 0);
675
676         /* Fallback to domain selective flush if no PSI support */
677         if (!cap_pgsel_inv(iommu->cap))
678                 return iommu_flush_iotlb_dsi(iommu, did,
679                         non_present_entry_flush);
680
681         /*
682          * PSI requires the number of pages to be a power of two, and the
683          * base address to be naturally aligned to that size
684          */
685         mask = ilog2(__roundup_pow_of_two(pages));
686         /* Fallback to domain selective flush if size is too big */
687         if (mask > cap_max_amask_val(iommu->cap))
688                 return iommu_flush_iotlb_dsi(iommu, did,
689                         non_present_entry_flush);
690
691         return __iommu_flush_iotlb(iommu, did, addr, mask,
692                 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
693 }
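/*
 * Example of the PSI mask computation above: a request to flush 5 pages is
 * rounded up to 8 = 2^3 pages, so mask (the address-mask value that ends up
 * in the IVA register) is 3 and the hardware invalidates the naturally
 * aligned 8-page region containing "addr".  Requests larger than
 * cap_max_amask_val() fall back to a domain-selective flush instead.
 */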
694
695 static int iommu_enable_translation(struct intel_iommu *iommu)
696 {
697         u32 sts;
698         unsigned long flags;
699
700         spin_lock_irqsave(&iommu->register_lock, flags);
701         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
702
703         /* Make sure hardware completes it */
704         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
705                 readl, (sts & DMA_GSTS_TES), sts);
706
707         iommu->gcmd |= DMA_GCMD_TE;
708         spin_unlock_irqrestore(&iommu->register_lock, flags);
709         return 0;
710 }
711
712 static int iommu_disable_translation(struct intel_iommu *iommu)
713 {
714         u32 sts;
715         unsigned long flag;
716
717         spin_lock_irqsave(&iommu->register_lock, flag);
718         iommu->gcmd &= ~DMA_GCMD_TE;
719         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
720
721         /* Make sure hardware completes it */
722         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
723                 readl, (!(sts & DMA_GSTS_TES)), sts);
724
725         spin_unlock_irqrestore(&iommu->register_lock, flag);
726         return 0;
727 }
728
729 /* iommu interrupt handling. Most of it is MSI-like. */
730
731 static char *fault_reason_strings[] =
732 {
733         "Software",
734         "Present bit in root entry is clear",
735         "Present bit in context entry is clear",
736         "Invalid context entry",
737         "Access beyond MGAW",
738         "PTE Write access is not set",
739         "PTE Read access is not set",
740         "Next page table ptr is invalid",
741         "Root table address invalid",
742         "Context table ptr is invalid",
743         "non-zero reserved fields in RTP",
744         "non-zero reserved fields in CTP",
745         "non-zero reserved fields in PTE",
746         "Unknown"
747 };
748 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
749
750 char *dmar_get_fault_reason(u8 fault_reason)
751 {
752         if (fault_reason >= MAX_FAULT_REASON_IDX)
753                 return fault_reason_strings[MAX_FAULT_REASON_IDX - 1];
754                 return fault_reason_strings[MAX_FAULT_REASON_IDX];
755                 return fault_reason_strings[fault_reason];
756 }
757
758 void dmar_msi_unmask(unsigned int irq)
759 {
760         struct intel_iommu *iommu = get_irq_data(irq);
761         unsigned long flag;
762
763         /* unmask it */
764         spin_lock_irqsave(&iommu->register_lock, flag);
765         writel(0, iommu->reg + DMAR_FECTL_REG);
766         /* Read a reg to force-flush the posted write */
767         readl(iommu->reg + DMAR_FECTL_REG);
768         spin_unlock_irqrestore(&iommu->register_lock, flag);
769 }
770
771 void dmar_msi_mask(unsigned int irq)
772 {
773         unsigned long flag;
774         struct intel_iommu *iommu = get_irq_data(irq);
775
776         /* mask it */
777         spin_lock_irqsave(&iommu->register_lock, flag);
778         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
779         /* Read a reg to force-flush the posted write */
780         readl(iommu->reg + DMAR_FECTL_REG);
781         spin_unlock_irqrestore(&iommu->register_lock, flag);
782 }
783
784 void dmar_msi_write(int irq, struct msi_msg *msg)
785 {
786         struct intel_iommu *iommu = get_irq_data(irq);
787         unsigned long flag;
788
789         spin_lock_irqsave(&iommu->register_lock, flag);
790         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
791         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
792         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
793         spin_unlock_irqrestore(&iommu->register_lock, flag);
794 }
795
796 void dmar_msi_read(int irq, struct msi_msg *msg)
797 {
798         struct intel_iommu *iommu = get_irq_data(irq);
799         unsigned long flag;
800
801         spin_lock_irqsave(&iommu->register_lock, flag);
802         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
803         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
804         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
805         spin_unlock_irqrestore(&iommu->register_lock, flag);
806 }
807
808 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
809                 u8 fault_reason, u16 source_id, u64 addr)
810 {
811         char *reason;
812
813         reason = dmar_get_fault_reason(fault_reason);
814
815         printk(KERN_ERR
816                 "DMAR:[%s] Request device [%02x:%02x.%d] "
817                 "fault addr %llx\n"
818                 "DMAR:[fault reason %02d] %s\n",
819                 (type ? "DMA Read" : "DMA Write"),
820                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
821                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
822         return 0;
823 }
824
825 #define PRIMARY_FAULT_REG_LEN (16)
826 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
827 {
828         struct intel_iommu *iommu = dev_id;
829         int reg, fault_index;
830         u32 fault_status;
831         unsigned long flag;
832
833         spin_lock_irqsave(&iommu->register_lock, flag);
834         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
835
836         /* TBD: ignore advanced fault log currently */
837         if (!(fault_status & DMA_FSTS_PPF))
838                 goto clear_overflow;
839
840         fault_index = dma_fsts_fault_record_index(fault_status);
841         reg = cap_fault_reg_offset(iommu->cap);
842         while (1) {
843                 u8 fault_reason;
844                 u16 source_id;
845                 u64 guest_addr;
846                 int type;
847                 u32 data;
848
849                 /* highest 32 bits */
850                 data = readl(iommu->reg + reg +
851                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
852                 if (!(data & DMA_FRCD_F))
853                         break;
854
855                 fault_reason = dma_frcd_fault_reason(data);
856                 type = dma_frcd_type(data);
857
858                 data = readl(iommu->reg + reg +
859                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
860                 source_id = dma_frcd_source_id(data);
861
862                 guest_addr = dmar_readq(iommu->reg + reg +
863                                 fault_index * PRIMARY_FAULT_REG_LEN);
864                 guest_addr = dma_frcd_page_addr(guest_addr);
865                 /* clear the fault */
866                 writel(DMA_FRCD_F, iommu->reg + reg +
867                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
868
869                 spin_unlock_irqrestore(&iommu->register_lock, flag);
870
871                 iommu_page_fault_do_one(iommu, type, fault_reason,
872                                 source_id, guest_addr);
873
874                 fault_index++;
875                 if (fault_index > cap_num_fault_regs(iommu->cap))
876                         fault_index = 0;
877                 spin_lock_irqsave(&iommu->register_lock, flag);
878         }
879 clear_overflow:
880         /* clear primary fault overflow */
881         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
882         if (fault_status & DMA_FSTS_PFO)
883                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
884
885         spin_unlock_irqrestore(&iommu->register_lock, flag);
886         return IRQ_HANDLED;
887 }
888
889 int dmar_set_interrupt(struct intel_iommu *iommu)
890 {
891         int irq, ret;
892
893         irq = create_irq();
894         if (!irq) {
895                 printk(KERN_ERR "IOMMU: no free vectors\n");
896                 return -EINVAL;
897         }
898
899         set_irq_data(irq, iommu);
900         iommu->irq = irq;
901
902         ret = arch_setup_dmar_msi(irq);
903         if (ret) {
904                 set_irq_data(irq, NULL);
905                 iommu->irq = 0;
906                 destroy_irq(irq);
907                 return 0;
908         }
909
910         /* Make sure any pending fault records are cleared */
911         iommu_page_fault(irq, iommu);
912
913         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
914         if (ret)
915                 printk(KERN_ERR "IOMMU: can't request irq\n");
916         return ret;
917 }
918
919 static int iommu_init_domains(struct intel_iommu *iommu)
920 {
921         unsigned long ndomains;
922         unsigned long nlongs;
923
924         ndomains = cap_ndoms(iommu->cap);
925         pr_debug("Number of Domains supported <%ld>\n", ndomains);
926         nlongs = BITS_TO_LONGS(ndomains);
927
928         /* TBD: there might be 64K domains,
929          * consider a different allocation scheme for future chips
930          */
931         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
932         if (!iommu->domain_ids) {
933                 printk(KERN_ERR "Allocating domain id array failed\n");
934                 return -ENOMEM;
935         }
936         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
937                         GFP_KERNEL);
938         if (!iommu->domains) {
939                 printk(KERN_ERR "Allocating domain array failed\n");
940                 kfree(iommu->domain_ids);
941                 return -ENOMEM;
942         }
943
944         /*
945          * if Caching mode is set, then invalid translations are tagged
946          * with domain id 0. Hence we need to pre-allocate it.
947          */
948         if (cap_caching_mode(iommu->cap))
949                 set_bit(0, iommu->domain_ids);
950         return 0;
951 }
952
953 static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
954 {
955         struct intel_iommu *iommu;
956         int ret;
957         int map_size;
958         u32 ver;
959
960         iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
961         if (!iommu)
962                 return NULL;
963         iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
964         if (!iommu->reg) {
965                 printk(KERN_ERR "IOMMU: can't map the region\n");
966                 goto error;
967         }
968         iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
969         iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
970
971         /* the registers might be more than one page */
972         map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
973                 cap_max_fault_reg_offset(iommu->cap));
974         map_size = PAGE_ALIGN_4K(map_size);
975         if (map_size > PAGE_SIZE_4K) {
976                 iounmap(iommu->reg);
977                 iommu->reg = ioremap(drhd->reg_base_addr, map_size);
978                 if (!iommu->reg) {
979                         printk(KERN_ERR "IOMMU: can't map the region\n");
980                         goto error;
981                 }
982         }
983
984         ver = readl(iommu->reg + DMAR_VER_REG);
985         pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
986                 drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
987                 iommu->cap, iommu->ecap);
988         ret = iommu_init_domains(iommu);
989         if (ret)
990                 goto error_unmap;
991         spin_lock_init(&iommu->lock);
992         spin_lock_init(&iommu->register_lock);
993
994         drhd->iommu = iommu;
995         return iommu;
996 error_unmap:
997         iounmap(iommu->reg);
998 error:
999         kfree(iommu);
1000         return NULL;
1001 }
1002
1003 static void domain_exit(struct dmar_domain *domain);
1004 static void free_iommu(struct intel_iommu *iommu)
1005 {
1006         struct dmar_domain *domain;
1007         int i;
1008
1009         if (!iommu)
1010                 return;
1011
1012         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1013         for (; i < cap_ndoms(iommu->cap); ) {
1014                 domain = iommu->domains[i];
1015                 clear_bit(i, iommu->domain_ids);
1016                 domain_exit(domain);
1017                 i = find_next_bit(iommu->domain_ids,
1018                         cap_ndoms(iommu->cap), i+1);
1019         }
1020
1021         if (iommu->gcmd & DMA_GCMD_TE)
1022                 iommu_disable_translation(iommu);
1023
1024         if (iommu->irq) {
1025                 set_irq_data(iommu->irq, NULL);
1026                 /* This will mask the irq */
1027                 free_irq(iommu->irq, iommu);
1028                 destroy_irq(iommu->irq);
1029         }
1030
1031         kfree(iommu->domains);
1032         kfree(iommu->domain_ids);
1033
1034         /* free context mapping */
1035         free_context_table(iommu);
1036
1037         if (iommu->reg)
1038                 iounmap(iommu->reg);
1039         kfree(iommu);
1040 }
1041
1042 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1043 {
1044         unsigned long num;
1045         unsigned long ndomains;
1046         struct dmar_domain *domain;
1047         unsigned long flags;
1048
1049         domain = alloc_domain_mem();
1050         if (!domain)
1051                 return NULL;
1052
1053         ndomains = cap_ndoms(iommu->cap);
1054
1055         spin_lock_irqsave(&iommu->lock, flags);
1056         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1057         if (num >= ndomains) {
1058                 spin_unlock_irqrestore(&iommu->lock, flags);
1059                 free_domain_mem(domain);
1060                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1061                 return NULL;
1062         }
1063
1064         set_bit(num, iommu->domain_ids);
1065         domain->id = num;
1066         domain->iommu = iommu;
1067         iommu->domains[num] = domain;
1068         spin_unlock_irqrestore(&iommu->lock, flags);
1069
1070         return domain;
1071 }
1072
1073 static void iommu_free_domain(struct dmar_domain *domain)
1074 {
1075         unsigned long flags;
1076
1077         spin_lock_irqsave(&domain->iommu->lock, flags);
1078         clear_bit(domain->id, domain->iommu->domain_ids);
1079         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1080 }
1081
1082 static struct iova_domain reserved_iova_list;
1083
1084 static void dmar_init_reserved_ranges(void)
1085 {
1086         struct pci_dev *pdev = NULL;
1087         struct iova *iova;
1088         int i;
1089         u64 addr, size;
1090
1091         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1092
1093         /* IOAPIC ranges shouldn't be accessed by DMA */
1094         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1095                 IOVA_PFN(IOAPIC_RANGE_END));
1096         if (!iova)
1097                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1098
1099         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1100         for_each_pci_dev(pdev) {
1101                 struct resource *r;
1102
1103                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1104                         r = &pdev->resource[i];
1105                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1106                                 continue;
1107                         addr = r->start;
1108                         addr &= PAGE_MASK_4K;
1109                         size = r->end - addr;
1110                         size = PAGE_ALIGN_4K(size);
1111                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1112                                 IOVA_PFN(size + addr) - 1);
1113                         if (!iova)
1114                                 printk(KERN_ERR "Reserve iova failed\n");
1115                 }
1116         }
1117
1118 }
1119
1120 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1121 {
1122         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1123 }
1124
1125 static inline int guestwidth_to_adjustwidth(int gaw)
1126 {
1127         int agaw;
1128         int r = (gaw - 12) % 9;
1129
1130         if (r == 0)
1131                 agaw = gaw;
1132         else
1133                 agaw = gaw + 9 - r;
1134         if (agaw > 64)
1135                 agaw = 64;
1136         return agaw;
1137 }
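/*
 * guestwidth_to_adjustwidth() rounds the guest address width up to the next
 * width that a whole number of 9-bit page-table levels above the 30-bit base
 * can represent.  For example a 32-bit guest width becomes
 * 32 + 9 - ((32 - 12) % 9) = 39 bits, which width_to_agaw() then maps to
 * agaw 1 (a three-level table); 39, 48 and 57 bits are already aligned and
 * are returned unchanged.
 */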
1138
1139 static int domain_init(struct dmar_domain *domain, int guest_width)
1140 {
1141         struct intel_iommu *iommu;
1142         int adjust_width, agaw;
1143         unsigned long sagaw;
1144
1145         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1146         spin_lock_init(&domain->mapping_lock);
1147
1148         domain_reserve_special_ranges(domain);
1149
1150         /* calculate AGAW */
1151         iommu = domain->iommu;
1152         if (guest_width > cap_mgaw(iommu->cap))
1153                 guest_width = cap_mgaw(iommu->cap);
1154         domain->gaw = guest_width;
1155         adjust_width = guestwidth_to_adjustwidth(guest_width);
1156         agaw = width_to_agaw(adjust_width);
1157         sagaw = cap_sagaw(iommu->cap);
1158         if (!test_bit(agaw, &sagaw)) {
1159                 /* hardware doesn't support it, choose a bigger one */
1160                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1161                 agaw = find_next_bit(&sagaw, 5, agaw);
1162                 if (agaw >= 5)
1163                         return -ENODEV;
1164         }
1165         domain->agaw = agaw;
1166         INIT_LIST_HEAD(&domain->devices);
1167
1168         /* always allocate the top pgd */
1169         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1170         if (!domain->pgd)
1171                 return -ENOMEM;
1172         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1173         return 0;
1174 }
1175
1176 static void domain_exit(struct dmar_domain *domain)
1177 {
1178         u64 end;
1179
1180         /* Domain 0 is reserved, so don't process it */
1181         if (!domain)
1182                 return;
1183
1184         domain_remove_dev_info(domain);
1185         /* destroy iovas */
1186         put_iova_domain(&domain->iovad);
1187         end = DOMAIN_MAX_ADDR(domain->gaw);
1188         end = end & (~PAGE_MASK_4K);
1189
1190         /* clear ptes */
1191         dma_pte_clear_range(domain, 0, end);
1192
1193         /* free page tables */
1194         dma_pte_free_pagetable(domain, 0, end);
1195
1196         iommu_free_domain(domain);
1197         free_domain_mem(domain);
1198 }
1199
1200 static int domain_context_mapping_one(struct dmar_domain *domain,
1201                 u8 bus, u8 devfn)
1202 {
1203         struct context_entry *context;
1204         struct intel_iommu *iommu = domain->iommu;
1205         unsigned long flags;
1206
1207         pr_debug("Set context mapping for %02x:%02x.%d\n",
1208                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1209         BUG_ON(!domain->pgd);
1210         context = device_to_context_entry(iommu, bus, devfn);
1211         if (!context)
1212                 return -ENOMEM;
1213         spin_lock_irqsave(&iommu->lock, flags);
1214         if (context_present(*context)) {
1215                 spin_unlock_irqrestore(&iommu->lock, flags);
1216                 return 0;
1217         }
1218
1219         context_set_domain_id(*context, domain->id);
1220         context_set_address_width(*context, domain->agaw);
1221         context_set_address_root(*context, virt_to_phys(domain->pgd));
1222         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1223         context_set_fault_enable(*context);
1224         context_set_present(*context);
1225         __iommu_flush_cache(iommu, context, sizeof(*context));
1226
1227         /* it's a non-present to present mapping */
1228         if (iommu_flush_context_device(iommu, domain->id,
1229                         (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1230                 iommu_flush_write_buffer(iommu);
1231         else
1232                 iommu_flush_iotlb_dsi(iommu, 0, 0);
1233         spin_unlock_irqrestore(&iommu->lock, flags);
1234         return 0;
1235 }
1236
1237 static int
1238 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1239 {
1240         int ret;
1241         struct pci_dev *tmp, *parent;
1242
1243         ret = domain_context_mapping_one(domain, pdev->bus->number,
1244                 pdev->devfn);
1245         if (ret)
1246                 return ret;
1247
1248         /* dependent device mapping */
1249         tmp = pci_find_upstream_pcie_bridge(pdev);
1250         if (!tmp)
1251                 return 0;
1252         /* Secondary interface's bus number and devfn 0 */
1253         parent = pdev->bus->self;
1254         while (parent != tmp) {
1255                 ret = domain_context_mapping_one(domain, parent->bus->number,
1256                         parent->devfn);
1257                 if (ret)
1258                         return ret;
1259                 parent = parent->bus->self;
1260         }
1261         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1262                 return domain_context_mapping_one(domain,
1263                         tmp->subordinate->number, 0);
1264         else /* this is a legacy PCI bridge */
1265                 return domain_context_mapping_one(domain,
1266                         tmp->bus->number, tmp->devfn);
1267 }
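/*
 * Note on the "dependent device" mappings above: requests from a device that
 * sits behind a PCIe-to-PCI/PCI-X bridge may be presented to the IOMMU with
 * the bridge's source-id (secondary bus number, devfn 0) or, for a legacy
 * PCI bridge, with the bridge's own bus/devfn.  Mapping the bridge chain
 * into the same domain as the device keeps those transactions translatable
 * as well.
 */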
1268
1269 static int domain_context_mapped(struct dmar_domain *domain,
1270         struct pci_dev *pdev)
1271 {
1272         int ret;
1273         struct pci_dev *tmp, *parent;
1274
1275         ret = device_context_mapped(domain->iommu,
1276                 pdev->bus->number, pdev->devfn);
1277         if (!ret)
1278                 return ret;
1279         /* dependent device mapping */
1280         tmp = pci_find_upstream_pcie_bridge(pdev);
1281         if (!tmp)
1282                 return ret;
1283         /* Secondary interface's bus number and devfn 0 */
1284         parent = pdev->bus->self;
1285         while (parent != tmp) {
1286                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1287                         parent->devfn);
1288                 if (!ret)
1289                         return ret;
1290                 parent = parent->bus->self;
1291         }
1292         if (tmp->is_pcie)
1293                 return device_context_mapped(domain->iommu,
1294                         tmp->subordinate->number, 0);
1295         else
1296                 return device_context_mapped(domain->iommu,
1297                         tmp->bus->number, tmp->devfn);
1298 }
1299
1300 static int
1301 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1302                         u64 hpa, size_t size, int prot)
1303 {
1304         u64 start_pfn, end_pfn;
1305         struct dma_pte *pte;
1306         int index;
1307
1308         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1309                 return -EINVAL;
1310         iova &= PAGE_MASK_4K;
1311         start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1312         end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1313         index = 0;
1314         while (start_pfn < end_pfn) {
1315                 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1316                 if (!pte)
1317                         return -ENOMEM;
1318                 /* We don't need lock here, nobody else
1319                  * touches the iova range
1320                  */
1321                 BUG_ON(dma_pte_addr(*pte));
1322                 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1323                 dma_set_pte_prot(*pte, prot);
1324                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1325                 start_pfn++;
1326                 index++;
1327         }
1328         return 0;
1329 }
1330
1331 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1332 {
1333         clear_context_table(domain->iommu, bus, devfn);
1334         iommu_flush_context_global(domain->iommu, 0);
1335         iommu_flush_iotlb_global(domain->iommu, 0);
1336 }
1337
1338 static void domain_remove_dev_info(struct dmar_domain *domain)
1339 {
1340         struct device_domain_info *info;
1341         unsigned long flags;
1342
1343         spin_lock_irqsave(&device_domain_lock, flags);
1344         while (!list_empty(&domain->devices)) {
1345                 info = list_entry(domain->devices.next,
1346                         struct device_domain_info, link);
1347                 list_del(&info->link);
1348                 list_del(&info->global);
1349                 if (info->dev)
1350                         info->dev->dev.archdata.iommu = NULL;
1351                 spin_unlock_irqrestore(&device_domain_lock, flags);
1352
1353                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1354                 free_devinfo_mem(info);
1355
1356                 spin_lock_irqsave(&device_domain_lock, flags);
1357         }
1358         spin_unlock_irqrestore(&device_domain_lock, flags);
1359 }
1360
1361 /*
1362  * find_domain
1363  * Note: the info is stored in struct pci_dev->dev.archdata.iommu
1364  */
1365 struct dmar_domain *
1366 find_domain(struct pci_dev *pdev)
1367 {
1368         struct device_domain_info *info;
1369
1370         /* No lock here, assumes no domain exit in normal case */
1371         info = pdev->dev.archdata.iommu;
1372         if (info)
1373                 return info->domain;
1374         return NULL;
1375 }
1376
1377 static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
1378      struct pci_dev *dev)
1379 {
1380         int index;
1381
1382         while (dev) {
1383                 for (index = 0; index < cnt; index ++)
1384                         if (dev == devices[index])
1385                                 return 1;
1386
1387                 /* Check our parent */
1388                 dev = dev->bus->self;
1389         }
1390
1391         return 0;
1392 }
1393
1394 static struct dmar_drhd_unit *
1395 dmar_find_matched_drhd_unit(struct pci_dev *dev)
1396 {
1397         struct dmar_drhd_unit *drhd = NULL;
1398
1399         list_for_each_entry(drhd, &dmar_drhd_units, list) {
1400                 if (drhd->include_all || dmar_pci_device_match(drhd->devices,
1401                                                 drhd->devices_cnt, dev))
1402                         return drhd;
1403         }
1404
1405         return NULL;
1406 }
1407
1408 /* domain is initialized */
1409 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1410 {
1411         struct dmar_domain *domain, *found = NULL;
1412         struct intel_iommu *iommu;
1413         struct dmar_drhd_unit *drhd;
1414         struct device_domain_info *info, *tmp;
1415         struct pci_dev *dev_tmp;
1416         unsigned long flags;
1417         int bus = 0, devfn = 0;
1418
1419         domain = find_domain(pdev);
1420         if (domain)
1421                 return domain;
1422
1423         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1424         if (dev_tmp) {
1425                 if (dev_tmp->is_pcie) {
1426                         bus = dev_tmp->subordinate->number;
1427                         devfn = 0;
1428                 } else {
1429                         bus = dev_tmp->bus->number;
1430                         devfn = dev_tmp->devfn;
1431                 }
1432                 spin_lock_irqsave(&device_domain_lock, flags);
1433                 list_for_each_entry(info, &device_domain_list, global) {
1434                         if (info->bus == bus && info->devfn == devfn) {
1435                                 found = info->domain;
1436                                 break;
1437                         }
1438                 }
1439                 spin_unlock_irqrestore(&device_domain_lock, flags);
1440                 /* the pcie-to-pci bridge already has a domain, use it */
1441                 if (found) {
1442                         domain = found;
1443                         goto found_domain;
1444                 }
1445         }
1446
1447         /* Allocate new domain for the device */
1448         drhd = dmar_find_matched_drhd_unit(pdev);
1449         if (!drhd) {
1450                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1451                         pci_name(pdev));
1452                 return NULL;
1453         }
1454         iommu = drhd->iommu;
1455
1456         domain = iommu_alloc_domain(iommu);
1457         if (!domain)
1458                 goto error;
1459
1460         if (domain_init(domain, gaw)) {
1461                 domain_exit(domain);
1462                 goto error;
1463         }
1464
1465         /* register pcie-to-pci device */
1466         if (dev_tmp) {
1467                 info = alloc_devinfo_mem();
1468                 if (!info) {
1469                         domain_exit(domain);
1470                         goto error;
1471                 }
1472                 info->bus = bus;
1473                 info->devfn = devfn;
1474                 info->dev = NULL;
1475                 info->domain = domain;
1476                 /* This domain is shared by devices under p2p bridge */
1477                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1478
1479                 /* the pcie-to-pci bridge already has a domain, use it */
1480                 found = NULL;
1481                 spin_lock_irqsave(&device_domain_lock, flags);
1482                 list_for_each_entry(tmp, &device_domain_list, global) {
1483                         if (tmp->bus == bus && tmp->devfn == devfn) {
1484                                 found = tmp->domain;
1485                                 break;
1486                         }
1487                 }
1488                 if (found) {
1489                         free_devinfo_mem(info);
1490                         domain_exit(domain);
1491                         domain = found;
1492                 } else {
1493                         list_add(&info->link, &domain->devices);
1494                         list_add(&info->global, &device_domain_list);
1495                 }
1496                 spin_unlock_irqrestore(&device_domain_lock, flags);
1497         }
1498
1499 found_domain:
1500         info = alloc_devinfo_mem();
1501         if (!info)
1502                 goto error;
1503         info->bus = pdev->bus->number;
1504         info->devfn = pdev->devfn;
1505         info->dev = pdev;
1506         info->domain = domain;
1507         spin_lock_irqsave(&device_domain_lock, flags);
1508         /* somebody else was faster and already set it up */
1509         found = find_domain(pdev);
1510         if (found != NULL) {
1511                 spin_unlock_irqrestore(&device_domain_lock, flags);
1512                 if (found != domain) {
1513                         domain_exit(domain);
1514                         domain = found;
1515                 }
1516                 free_devinfo_mem(info);
1517                 return domain;
1518         }
1519         list_add(&info->link, &domain->devices);
1520         list_add(&info->global, &device_domain_list);
1521         pdev->dev.archdata.iommu = info;
1522         spin_unlock_irqrestore(&device_domain_lock, flags);
1523         return domain;
1524 error:
1525         /* recheck it here, maybe others set it */
1526         return find_domain(pdev);
1527 }
1528
1529 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1530 {
1531         struct dmar_domain *domain;
1532         unsigned long size;
1533         u64 base;
1534         int ret;
1535
1536         printk(KERN_INFO
1537                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1538                 pci_name(pdev), start, end);
1539         /* page table init */
1540         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1541         if (!domain)
1542                 return -ENOMEM;
1543
1544         /* The address might not be aligned */
1545         base = start & PAGE_MASK_4K;
1546         size = end - base;
1547         size = PAGE_ALIGN_4K(size);
1548         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1549                         IOVA_PFN(base + size) - 1)) {
1550                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1551                 ret = -ENOMEM;
1552                 goto error;
1553         }
1554
1555         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1556                 size, base, pci_name(pdev));
1557         /*
1558          * The RMRR range might overlap a physical memory range that is
1559          * already mapped, so clear any existing PTEs first.
1560          */
1561         dma_pte_clear_range(domain, base, base + size);
1562
1563         ret = domain_page_mapping(domain, base, base, size,
1564                 DMA_PTE_READ|DMA_PTE_WRITE);
1565         if (ret)
1566                 goto error;
1567
1568         /* context entry init */
1569         ret = domain_context_mapping(domain, pdev);
1570         if (!ret)
1571                 return 0;
1572 error:
1573         domain_exit(domain);
1574         return ret;
1575
1576 }
1577
1578 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1579         struct pci_dev *pdev)
1580 {
1581         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1582                 return 0;
1583         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1584                 rmrr->end_address + 1);
1585 }
1586
1587 #ifdef CONFIG_DMAR_GFX_WA
1588 extern int arch_get_ram_range(int slot, u64 *addr, u64 *size);
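     /*
      * Graphics work-around: walk every RAM range reported by the
      * architecture and identity-map it for each graphics device that is
      * not already excluded from translation.
      */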
1589 static void __init iommu_prepare_gfx_mapping(void)
1590 {
1591         struct pci_dev *pdev = NULL;
1592         u64 base, size;
1593         int slot;
1594         int ret;
1595
1596         for_each_pci_dev(pdev) {
1597                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1598                                 !IS_GFX_DEVICE(pdev))
1599                         continue;
1600                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1601                         pci_name(pdev));
1602                 slot = arch_get_ram_range(0, &base, &size);
1603                 while (slot >= 0) {
1604                         ret = iommu_prepare_identity_map(pdev,
1605                                         base, base + size);
1606                         if (ret)
1607                                 goto error;
1608                         slot = arch_get_ram_range(slot, &base, &size);
1609                 }
1610                 continue;
1611 error:
1612                 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1613         }
1614 }
1615 #endif
1616
1617 #ifdef CONFIG_DMAR_FLOPPY_WA
1618 static inline void iommu_prepare_isa(void)
1619 {
1620         struct pci_dev *pdev;
1621         int ret;
1622
1623         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1624         if (!pdev)
1625                 return;
1626
1627         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1628         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1629
1630         if (ret)
1631                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1632                         "floppy might not work\n");
1633
1634 }
1635 #else
1636 static inline void iommu_prepare_isa(void)
1637 {
1638         return;
1639 }
1640 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1641
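     /*
      * One-time DMAR bring-up: allocate a root entry table for every
      * active unit, install the RMRR, graphics and ISA unity mappings,
      * then enable fault reporting and translation on each IOMMU.
      */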
1642 int __init init_dmars(void)
1643 {
1644         struct dmar_drhd_unit *drhd;
1645         struct dmar_rmrr_unit *rmrr;
1646         struct pci_dev *pdev;
1647         struct intel_iommu *iommu;
1648         int ret, unit = 0;
1649
1650         /*
1651          * for each drhd
1652          *    allocate root
1653          *    initialize and program root entry to not present
1654          * endfor
1655          */
1656         for_each_drhd_unit(drhd) {
1657                 if (drhd->ignored)
1658                         continue;
1659                 iommu = alloc_iommu(drhd);
1660                 if (!iommu) {
1661                         ret = -ENOMEM;
1662                         goto error;
1663                 }
1664
1665                 /*
1666                  * TBD:
1667                  * we could share the same root & context tables
1668                  * among all IOMMUs; need to split it out later.
1669                  */
1670                 ret = iommu_alloc_root_entry(iommu);
1671                 if (ret) {
1672                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1673                         goto error;
1674                 }
1675         }
1676
1677         /*
1678          * For each rmrr
1679          *   for each dev attached to rmrr
1680          *   do
1681          *     locate drhd for dev, alloc domain for dev
1682          *     allocate free domain
1683          *     allocate page table entries for rmrr
1684          *     if context not allocated for bus
1685          *           allocate and init context
1686          *           set present in root table for this bus
1687          *     init context with domain, translation etc
1688          *    endfor
1689          * endfor
1690          */
1691         for_each_rmrr_units(rmrr) {
1692                 int i;
1693                 for (i = 0; i < rmrr->devices_cnt; i++) {
1694                         pdev = rmrr->devices[i];
1695                         /* some BIOSes list non-existent devices in the DMAR table */
1696                         if (!pdev)
1697                                 continue;
1698                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1699                         if (ret)
1700                                 printk(KERN_ERR
1701                                  "IOMMU: mapping reserved region failed\n");
1702                 }
1703         }
1704
1705         iommu_prepare_gfx_mapping();
1706
1707         iommu_prepare_isa();
1708
1709         /*
1710          * for each drhd
1711          *   enable fault log
1712          *   global invalidate context cache
1713          *   global invalidate iotlb
1714          *   enable translation
1715          */
1716         for_each_drhd_unit(drhd) {
1717                 if (drhd->ignored)
1718                         continue;
1719                 iommu = drhd->iommu;
1720                 sprintf(iommu->name, "dmar%d", unit++);
1721
1722                 iommu_flush_write_buffer(iommu);
1723
1724                 ret = dmar_set_interrupt(iommu);
1725                 if (ret)
1726                         goto error;
1727
1728                 iommu_set_root_entry(iommu);
1729
1730                 iommu_flush_context_global(iommu, 0);
1731                 iommu_flush_iotlb_global(iommu, 0);
1732
1733                 ret = iommu_enable_translation(iommu);
1734                 if (ret)
1735                         goto error;
1736         }
1737
1738         return 0;
1739 error:
1740         for_each_drhd_unit(drhd) {
1741                 if (drhd->ignored)
1742                         continue;
1743                 iommu = drhd->iommu;
1744                 free_iommu(iommu);
1745         }
1746         return ret;
1747 }
1748
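     /*
      * Bytes of IOVA space needed to map 'size' bytes starting at
      * 'host_addr': the in-page offset plus the length, rounded up to a
      * whole number of 4K pages.
      */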
1749 static inline u64 aligned_size(u64 host_addr, size_t size)
1750 {
1751         u64 addr;
1752         addr = (host_addr & (~PAGE_MASK_4K)) + size;
1753         return PAGE_ALIGN_4K(addr);
1754 }
1755
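     /*
      * Allocate 'size' bytes of I/O virtual address space below 'end',
      * clamped to the domain's guest address width; returns NULL if the
      * request cannot fit.
      */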
1756 struct iova *
1757 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1758 {
1759         struct iova *piova;
1760
1761         /* Make sure it's in range */
1762         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1763         if (!size || (IOVA_START_ADDR + size > end))
1764                 return NULL;
1765
1766         piova = alloc_iova(&domain->iovad,
1767                         size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1768         return piova;
1769 }
1770
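     /*
      * Allocate IOVA space for a device, honouring its dma_mask and the
      * forcedac boot option.
      */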
1771 static struct iova *
1772 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1773                 size_t size)
1774 {
1775         struct pci_dev *pdev = to_pci_dev(dev);
1776         struct iova *iova = NULL;
1777
1778         if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1779                 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1780         } else  {
1781                 /*
1782                  * First try to allocate an I/O virtual address within
1783                  * DMA_32BIT_MASK; if that fails, fall back to the
1784                  * device's full dma_mask range.
1785                  */
1786                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1787                 if (!iova)
1788                         iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1789         }
1790
1791         if (!iova) {
1792                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
1793                 return NULL;
1794         }
1795
1796         return iova;
1797 }
1798
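     /*
      * Find or create the translation domain for a device and make sure
      * its context entry is set up; called on every streaming DMA map.
      */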
1799 static struct dmar_domain *
1800 get_valid_domain_for_dev(struct pci_dev *pdev)
1801 {
1802         struct dmar_domain *domain;
1803         int ret;
1804
1805         domain = get_domain_for_dev(pdev,
1806                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1807         if (!domain) {
1808                 printk(KERN_ERR
1809                         "Allocating domain for %s failed\n", pci_name(pdev));
1810                 return NULL;
1811         }
1812
1813         /* make sure context mapping is ok */
1814         if (unlikely(!domain_context_mapped(domain, pdev))) {
1815                 ret = domain_context_mapping(domain, pdev);
1816                 if (ret) {
1817                         printk(KERN_ERR
1818                                 "Domain context map for %s failed\n",
1819                                 pci_name(pdev));
1820                         return NULL;
1821                 }
1822         }
1823
1824         return domain;
1825 }
1826
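     /*
      * dma_mapping_ops.map_single: allocate IOVA space, install page
      * table entries with the right read/write permissions, flush the
      * IOTLB (or the write buffer), and return the bus address.
      */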
1827 static dma_addr_t intel_map_single(struct device *hwdev, void *addr,
1828         size_t size, int dir)
1829 {
1830         struct pci_dev *pdev = to_pci_dev(hwdev);
1831         int ret;
1832         struct dmar_domain *domain;
1833         unsigned long start_addr;
1834         struct iova *iova;
1835         int prot = 0;
1836
1837         BUG_ON(dir == DMA_NONE);
1838         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1839                 return virt_to_bus(addr);
1840
1841         domain = get_valid_domain_for_dev(pdev);
1842         if (!domain)
1843                 return 0;
1844
1845         addr = (void *)virt_to_phys(addr);
1846         size = aligned_size((u64)addr, size);
1847
1848         iova = __intel_alloc_iova(hwdev, domain, size);
1849         if (!iova)
1850                 goto error;
1851
1852         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1853
1854         /*
1855          * Check if the DMAR hardware supports zero-length reads on
1856          * write-only mappings.
1857          */
1858         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
1859                         !cap_zlr(domain->iommu->cap))
1860                 prot |= DMA_PTE_READ;
1861         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1862                 prot |= DMA_PTE_WRITE;
1863         /*
1864          * The range addr .. addr + size might cover only part of a page,
1865          * so map the whole page.  Note: if two parts of one page are
1866          * mapped separately, two guest addresses may map to the same
1867          * host address, but that is not a big problem.
1868          */
1869         ret = domain_page_mapping(domain, start_addr,
1870                 ((u64)addr) & PAGE_MASK_4K, size, prot);
1871         if (ret)
1872                 goto error;
1873
1874         pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1875                 pci_name(pdev), size, (u64)addr,
1876                 size, (u64)start_addr, dir);
1877
1878         /* it's a non-present to present mapping */
1879         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1880                         start_addr, size >> PAGE_SHIFT_4K, 1);
1881         if (ret)
1882                 iommu_flush_write_buffer(domain->iommu);
1883
1884         return (start_addr + ((u64)addr & (~PAGE_MASK_4K)));
1885
1886 error:
1887         if (iova)
1888                 __free_iova(&domain->iovad, iova);
1889         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
1890                 pci_name(pdev), size, (u64)addr, dir);
1891         return 0;
1892 }
1893
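     /*
      * dma_mapping_ops.unmap_single: look up the IOVA behind dev_addr,
      * clear and free its page table entries, flush, and return the
      * range to the IOVA allocator.
      */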
1894 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1895         size_t size, int dir)
1896 {
1897         struct pci_dev *pdev = to_pci_dev(dev);
1898         struct dmar_domain *domain;
1899         unsigned long start_addr;
1900         struct iova *iova;
1901
1902         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1903                 return;
1904         domain = find_domain(pdev);
1905         BUG_ON(!domain);
1906
1907         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1908         if (!iova)
1909                 return;
1910
1911         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1912         size = aligned_size((u64)dev_addr, size);
1913
1914         pr_debug("Device %s unmapping: %lx@%llx\n",
1915                 pci_name(pdev), size, (u64)start_addr);
1916
1917         /*  clear the whole page */
1918         dma_pte_clear_range(domain, start_addr, start_addr + size);
1919         /* free page tables */
1920         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1921
1922         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
1923                         size >> PAGE_SHIFT_4K, 0))
1924                 iommu_flush_write_buffer(domain->iommu);
1925
1926         /* free iova */
1927         __free_iova(&domain->iovad, iova);
1928 }
1929
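     /*
      * Coherent allocations are ordinary page allocations mapped through
      * intel_map_single(); GFP_DMA/GFP_DMA32 are dropped because the
      * IOMMU can map any physical page into the device's reachable range.
      */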
1930 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
1931                        dma_addr_t *dma_handle, gfp_t flags)
1932 {
1933         void *vaddr;
1934         int order;
1935
1936         size = PAGE_ALIGN_4K(size);
1937         order = get_order(size);
1938         flags &= ~(GFP_DMA | GFP_DMA32);
1939
1940         vaddr = (void *)__get_free_pages(flags, order);
1941         if (!vaddr)
1942                 return NULL;
1943         memset(vaddr, 0, size);
1944
1945         *dma_handle = intel_map_single(hwdev, vaddr, size, DMA_BIDIRECTIONAL);
1946         if (*dma_handle)
1947                 return vaddr;
1948         free_pages((unsigned long)vaddr, order);
1949         return NULL;
1950 }
1951
1952 static void intel_free_coherent(struct device *hwdev, size_t size,
1953         void *vaddr, dma_addr_t dma_handle)
1954 {
1955         int order;
1956
1957         size = PAGE_ALIGN_4K(size);
1958         order = get_order(size);
1959
1960         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
1961         free_pages((unsigned long)vaddr, order);
1962 }
1963
1964 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
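     /*
      * Undo an intel_map_sg() mapping: recompute the total mapped size
      * from the scatterlist, then clear, flush and free the single
      * contiguous IOVA range that backed it.
      */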
1965 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
1966         int nelems, int dir)
1967 {
1968         int i;
1969         struct pci_dev *pdev = to_pci_dev(hwdev);
1970         struct dmar_domain *domain;
1971         unsigned long start_addr;
1972         struct iova *iova;
1973         size_t size = 0;
1974         void *addr;
1975         struct scatterlist *sg;
1976
1977         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1978                 return;
1979
1980         domain = find_domain(pdev);
1981
1982         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
1983         if (!iova)
1984                 return;
1985         for_each_sg(sglist, sg, nelems, i) {
1986                 addr = SG_ENT_VIRT_ADDRESS(sg);
1987                 size += aligned_size((u64)addr, sg->length);
1988         }
1989
1990         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1991
1992         /*  clear the whole page */
1993         dma_pte_clear_range(domain, start_addr, start_addr + size);
1994         /* free page tables */
1995         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1996
1997         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
1998                         size >> PAGE_SHIFT_4K, 0))
1999                 iommu_flush_write_buffer(domain->iommu);
2000
2001         /* free iova */
2002         __free_iova(&domain->iovad, iova);
2003 }
2004
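     /*
      * Scatterlist mapping for devices excluded from translation: just
      * hand back the pages' bus (physical) addresses.
      */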
2005 static int intel_nontranslate_map_sg(struct device *hddev,
2006         struct scatterlist *sglist, int nelems, int dir)
2007 {
2008         int i;
2009         struct scatterlist *sg;
2010
2011         for_each_sg(sglist, sg, nelems, i) {
2012                 BUG_ON(!sg_page(sg));
2013                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2014                 sg->dma_length = sg->length;
2015         }
2016         return nelems;
2017 }
2018
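     /*
      * dma_mapping_ops.map_sg: reserve one IOVA range big enough for the
      * whole scatterlist, map each element into it back to back, and
      * tear everything down again if any element fails to map.
      */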
2019 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2020                                 int nelems, int dir)
2021 {
2022         void *addr;
2023         int i;
2024         struct pci_dev *pdev = to_pci_dev(hwdev);
2025         struct dmar_domain *domain;
2026         size_t size = 0;
2027         int prot = 0;
2028         size_t offset = 0;
2029         struct iova *iova = NULL;
2030         int ret;
2031         struct scatterlist *sg;
2032         unsigned long start_addr;
2033
2034         BUG_ON(dir == DMA_NONE);
2035         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2036                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2037
2038         domain = get_valid_domain_for_dev(pdev);
2039         if (!domain)
2040                 return 0;
2041
2042         for_each_sg(sglist, sg, nelems, i) {
2043                 addr = SG_ENT_VIRT_ADDRESS(sg);
2044                 addr = (void *)virt_to_phys(addr);
2045                 size += aligned_size((u64)addr, sg->length);
2046         }
2047
2048         iova = __intel_alloc_iova(hwdev, domain, size);
2049         if (!iova) {
2050                 sglist->dma_length = 0;
2051                 return 0;
2052         }
2053
2054         /*
2055          * Check if the DMAR hardware supports zero-length reads on
2056          * write-only mappings.
2057          */
2058         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2059                         !cap_zlr(domain->iommu->cap))
2060                 prot |= DMA_PTE_READ;
2061         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2062                 prot |= DMA_PTE_WRITE;
2063
2064         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2065         offset = 0;
2066         for_each_sg(sglist, sg, nelems, i) {
2067                 addr = SG_ENT_VIRT_ADDRESS(sg);
2068                 addr = (void *)virt_to_phys(addr);
2069                 size = aligned_size((u64)addr, sg->length);
2070                 ret = domain_page_mapping(domain, start_addr + offset,
2071                         ((u64)addr) & PAGE_MASK_4K,
2072                         size, prot);
2073                 if (ret) {
2074                         /*  clear the page */
2075                         dma_pte_clear_range(domain, start_addr,
2076                                   start_addr + offset);
2077                         /* free page tables */
2078                         dma_pte_free_pagetable(domain, start_addr,
2079                                   start_addr + offset);
2080                         /* free iova */
2081                         __free_iova(&domain->iovad, iova);
2082                         return 0;
2083                 }
2084                 sg->dma_address = start_addr + offset +
2085                                 ((u64)addr & (~PAGE_MASK_4K));
2086                 sg->dma_length = sg->length;
2087                 offset += size;
2088         }
2089
2090         /* it's a non-present to present mapping */
2091         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2092                         start_addr, offset >> PAGE_SHIFT_4K, 1))
2093                 iommu_flush_write_buffer(domain->iommu);
2094         return nelems;
2095 }
2096
2097 static struct dma_mapping_ops intel_dma_ops = {
2098         .alloc_coherent = intel_alloc_coherent,
2099         .free_coherent = intel_free_coherent,
2100         .map_single = intel_map_single,
2101         .unmap_single = intel_unmap_single,
2102         .map_sg = intel_map_sg,
2103         .unmap_sg = intel_unmap_sg,
2104 };
2105
2106 static inline int iommu_domain_cache_init(void)
2107 {
2108         int ret = 0;
2109
2110         iommu_domain_cache = kmem_cache_create("iommu_domain",
2111                                          sizeof(struct dmar_domain),
2112                                          0,
2113                                          SLAB_HWCACHE_ALIGN,
2114                                          NULL);
2115
2116         if (!iommu_domain_cache) {
2117                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2118                 ret = -ENOMEM;
2119         }
2120
2121         return ret;
2122 }
2123
2124 static inline int iommu_devinfo_cache_init(void)
2125 {
2126         int ret = 0;
2127
2128         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2129                                          sizeof(struct device_domain_info),
2130                                          0,
2131                                          SLAB_HWCACHE_ALIGN,
2132                                          NULL);
2133
2134         if (!iommu_devinfo_cache) {
2135                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2136                 ret = -ENOMEM;
2137         }
2138
2139         return ret;
2140 }
2141
2142 static inline int iommu_iova_cache_init(void)
2143 {
2144         int ret = 0;
2145
2146         iommu_iova_cache = kmem_cache_create("iommu_iova",
2147                                          sizeof(struct iova),
2148                                          0,
2149                                          SLAB_HWCACHE_ALIGN,
2150                                          NULL);
2151
2152         if (!iommu_iova_cache) {
2153                 printk(KERN_ERR "Couldn't create iova cache\n");
2154                 ret = -ENOMEM;
2155         }
2156
2157         return ret;
2158 }
2159
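     /*
      * Create the iova, domain and devinfo slab caches; on failure the
      * caches that were already created are destroyed again.
      */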
2160 static int __init iommu_init_mempool(void)
2161 {
2162         int ret;
2163         ret = iommu_iova_cache_init();
2164         if (ret)
2165                 return ret;
2166
2167         ret = iommu_domain_cache_init();
2168         if (ret)
2169                 goto domain_error;
2170
2171         ret = iommu_devinfo_cache_init();
2172         if (!ret)
2173                 return ret;
2174
2175         kmem_cache_destroy(iommu_domain_cache);
2176 domain_error:
2177         kmem_cache_destroy(iommu_iova_cache);
2178
2179         return -ENOMEM;
2180 }
2181
2182 static void __init iommu_exit_mempool(void)
2183 {
2184         kmem_cache_destroy(iommu_devinfo_cache);
2185         kmem_cache_destroy(iommu_domain_cache);
2186         kmem_cache_destroy(iommu_iova_cache);
2187
2188 }
2189
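     /*
      * Early detection hook: record that DMAR hardware is present unless
      * another IOMMU/swiotlb setup is already in charge or DMAR has been
      * disabled on the command line.
      */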
2190 void __init detect_intel_iommu(void)
2191 {
2192         if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2193                 return;
2194         if (early_dmar_detect()) {
2195                 iommu_detected = 1;
2196         }
2197 }
2198
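     /*
      * Mark DRHD units with nothing to translate as ignored: units whose
      * device scope matched no PCI device, and, when intel_iommu=igfx_off
      * is given, units that cover only graphics devices.
      */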
2199 static void __init init_no_remapping_devices(void)
2200 {
2201         struct dmar_drhd_unit *drhd;
2202
2203         for_each_drhd_unit(drhd) {
2204                 if (!drhd->include_all) {
2205                         int i;
2206                         for (i = 0; i < drhd->devices_cnt; i++)
2207                                 if (drhd->devices[i] != NULL)
2208                                         break;
2209                         /* ignore DMAR unit if no pci devices exist */
2210                         if (i == drhd->devices_cnt)
2211                                 drhd->ignored = 1;
2212                 }
2213         }
2214
2215         if (dmar_map_gfx)
2216                 return;
2217
2218         for_each_drhd_unit(drhd) {
2219                 int i;
2220                 if (drhd->ignored || drhd->include_all)
2221                         continue;
2222
2223                 for (i = 0; i < drhd->devices_cnt; i++)
2224                         if (drhd->devices[i] &&
2225                                 !IS_GFX_DEVICE(drhd->devices[i]))
2226                                 break;
2227
2228                 if (i < drhd->devices_cnt)
2229                         continue;
2230
2231                 /* bypass IOMMU if it is just for gfx devices */
2232                 drhd->ignored = 1;
2233                 for (i = 0; i < drhd->devices_cnt; i++) {
2234                         if (!drhd->devices[i])
2235                                 continue;
2236                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2237                 }
2238         }
2239 }
2240
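     /*
      * Main init entry point: bail out if DMAR is disabled or another
      * DMA scheme (swiotlb, no_iommu) is active, otherwise parse the
      * DMAR table, bring up the hardware and switch dma_ops over to
      * intel_dma_ops.
      */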
2241 int __init intel_iommu_init(void)
2242 {
2243         int ret = 0;
2244
2245         if (no_iommu || swiotlb || dmar_disabled)
2246                 return -ENODEV;
2247
2248         if (dmar_table_init())
2249                 return  -ENODEV;
2250
2251         iommu_init_mempool();
2252         dmar_init_reserved_ranges();
2253
2254         init_no_remapping_devices();
2255
2256         ret = init_dmars();
2257         if (ret) {
2258                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2259                 put_iova_domain(&reserved_iova_list);
2260                 iommu_exit_mempool();
2261                 return ret;
2262         }
2263         printk(KERN_INFO
2264         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2265
2266         force_iommu = 1;
2267         dma_ops = &intel_dma_ops;
2268         return 0;
2269 }
2270