Intel IOMMU: Iommu floppy workaround
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18  * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19  * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
20  */
21
22 #include <linux/init.h>
23 #include <linux/bitmap.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/sysdev.h>
28 #include <linux/spinlock.h>
29 #include <linux/pci.h>
30 #include <linux/dmar.h>
31 #include <linux/dma-mapping.h>
32 #include <linux/mempool.h>
33 #include "iova.h"
34 #include "intel-iommu.h"
35 #include <asm/proto.h> /* force_iommu is declared here on x86-64 */
36 #include <asm/cacheflush.h>
37 #include <asm/iommu.h>
38 #include "pci.h"
39
40 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
41 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
42
43 #define IOAPIC_RANGE_START      (0xfee00000)
44 #define IOAPIC_RANGE_END        (0xfeefffff)
45 #define IOVA_START_ADDR         (0x1000)
46
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
48
49 #define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */
50
51 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
52
53 static void domain_remove_dev_info(struct dmar_domain *domain);
54
55 static int dmar_disabled;
56 static int __initdata dmar_map_gfx = 1;
57 static int dmar_forcedac;
58
59 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
60 static DEFINE_SPINLOCK(device_domain_lock);
61 static LIST_HEAD(device_domain_list);
62
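/*
 * Parse the "intel_iommu=" kernel command line option.  Comma-separated
 * flags are accepted, e.g. "intel_iommu=off" or
 * "intel_iommu=igfx_off,forcedac", matching the strncmp checks below.
 */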
63 static int __init intel_iommu_setup(char *str)
64 {
65         if (!str)
66                 return -EINVAL;
67         while (*str) {
68                 if (!strncmp(str, "off", 3)) {
69                         dmar_disabled = 1;
70                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
71                 } else if (!strncmp(str, "igfx_off", 8)) {
72                         dmar_map_gfx = 0;
73                         printk(KERN_INFO
74                                 "Intel-IOMMU: disable GFX device mapping\n");
75                 } else if (!strncmp(str, "forcedac", 8)) {
76                         printk (KERN_INFO
77                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
78                         dmar_forcedac = 1;
79                 }
80
81                 str += strcspn(str, ",");
82                 while (*str == ',')
83                         str++;
84         }
85         return 0;
86 }
87 __setup("intel_iommu=", intel_iommu_setup);
88
89 static struct kmem_cache *iommu_domain_cache;
90 static struct kmem_cache *iommu_devinfo_cache;
91 static struct kmem_cache *iommu_iova_cache;
92
93 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
94 {
95         unsigned int flags;
96         void *vaddr;
97
98         /* trying to avoid low memory issues */
99         flags = current->flags & PF_MEMALLOC;
100         current->flags |= PF_MEMALLOC;
101         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
102         current->flags &= (~PF_MEMALLOC | flags);
103         return vaddr;
104 }
105
106
107 static inline void *alloc_pgtable_page(void)
108 {
109         unsigned int flags;
110         void *vaddr;
111
112         /* trying to avoid low memory issues */
113         flags = current->flags & PF_MEMALLOC;
114         current->flags |= PF_MEMALLOC;
115         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
116         current->flags &= (~PF_MEMALLOC | flags);
117         return vaddr;
118 }
119
120 static inline void free_pgtable_page(void *vaddr)
121 {
122         free_page((unsigned long)vaddr);
123 }
124
125 static inline void *alloc_domain_mem(void)
126 {
127         return iommu_kmem_cache_alloc(iommu_domain_cache);
128 }
129
130 static inline void free_domain_mem(void *vaddr)
131 {
132         kmem_cache_free(iommu_domain_cache, vaddr);
133 }
134
135 static inline void * alloc_devinfo_mem(void)
136 {
137         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
138 }
139
140 static inline void free_devinfo_mem(void *vaddr)
141 {
142         kmem_cache_free(iommu_devinfo_cache, vaddr);
143 }
144
145 struct iova *alloc_iova_mem(void)
146 {
147         return iommu_kmem_cache_alloc(iommu_iova_cache);
148 }
149
150 void free_iova_mem(struct iova *iova)
151 {
152         kmem_cache_free(iommu_iova_cache, iova);
153 }
154
155 static inline void __iommu_flush_cache(
156         struct intel_iommu *iommu, void *addr, int size)
157 {
158         if (!ecap_coherent(iommu->ecap))
159                 clflush_cache_range(addr, size);
160 }
161
162 /* Gets context entry for a given bus and devfn */
163 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
164                 u8 bus, u8 devfn)
165 {
166         struct root_entry *root;
167         struct context_entry *context;
168         unsigned long phy_addr;
169         unsigned long flags;
170
171         spin_lock_irqsave(&iommu->lock, flags);
172         root = &iommu->root_entry[bus];
173         context = get_context_addr_from_root(root);
174         if (!context) {
175                 context = (struct context_entry *)alloc_pgtable_page();
176                 if (!context) {
177                         spin_unlock_irqrestore(&iommu->lock, flags);
178                         return NULL;
179                 }
180                 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
181                 phy_addr = virt_to_phys((void *)context);
182                 set_root_value(root, phy_addr);
183                 set_root_present(root);
184                 __iommu_flush_cache(iommu, root, sizeof(*root));
185         }
186         spin_unlock_irqrestore(&iommu->lock, flags);
187         return &context[devfn];
188 }
189
190 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
191 {
192         struct root_entry *root;
193         struct context_entry *context;
194         int ret;
195         unsigned long flags;
196
197         spin_lock_irqsave(&iommu->lock, flags);
198         root = &iommu->root_entry[bus];
199         context = get_context_addr_from_root(root);
200         if (!context) {
201                 ret = 0;
202                 goto out;
203         }
204         ret = context_present(context[devfn]);
205 out:
206         spin_unlock_irqrestore(&iommu->lock, flags);
207         return ret;
208 }
209
210 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
211 {
212         struct root_entry *root;
213         struct context_entry *context;
214         unsigned long flags;
215
216         spin_lock_irqsave(&iommu->lock, flags);
217         root = &iommu->root_entry[bus];
218         context = get_context_addr_from_root(root);
219         if (context) {
220                 context_clear_entry(context[devfn]);
221                 __iommu_flush_cache(iommu, &context[devfn],
222                         sizeof(*context));
223         }
224         spin_unlock_irqrestore(&iommu->lock, flags);
225 }
226
227 static void free_context_table(struct intel_iommu *iommu)
228 {
229         struct root_entry *root;
230         int i;
231         unsigned long flags;
232         struct context_entry *context;
233
234         spin_lock_irqsave(&iommu->lock, flags);
235         if (!iommu->root_entry) {
236                 goto out;
237         }
238         for (i = 0; i < ROOT_ENTRY_NR; i++) {
239                 root = &iommu->root_entry[i];
240                 context = get_context_addr_from_root(root);
241                 if (context)
242                         free_pgtable_page(context);
243         }
244         free_pgtable_page(iommu->root_entry);
245         iommu->root_entry = NULL;
246 out:
247         spin_unlock_irqrestore(&iommu->lock, flags);
248 }
249
250 /* page table handling */
251 #define LEVEL_STRIDE            (9)
252 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
253
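/*
 * Page-table geometry helpers.  Each level resolves LEVEL_STRIDE (9) bits
 * of the address and the 4K page offset takes 12 bits, so an adjusted
 * guest address width (agaw) of a corresponds to (a + 2) levels and an
 * address width of 30 + 9 * a bits; e.g. agaw 2 gives a 4-level, 48-bit
 * wide page table.
 */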
254 static inline int agaw_to_level(int agaw)
255 {
256         return agaw + 2;
257 }
258
259 static inline int agaw_to_width(int agaw)
260 {
261         return 30 + agaw * LEVEL_STRIDE;
262
263 }
264
265 static inline int width_to_agaw(int width)
266 {
267         return (width - 30) / LEVEL_STRIDE;
268 }
269
270 static inline unsigned int level_to_offset_bits(int level)
271 {
272         return (12 + (level - 1) * LEVEL_STRIDE);
273 }
274
275 static inline int address_level_offset(u64 addr, int level)
276 {
277         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
278 }
279
280 static inline u64 level_mask(int level)
281 {
282         return ((u64)-1 << level_to_offset_bits(level));
283 }
284
285 static inline u64 level_size(int level)
286 {
287         return ((u64)1 << level_to_offset_bits(level));
288 }
289
290 static inline u64 align_to_level(u64 addr, int level)
291 {
292         return ((addr + level_size(level) - 1) & level_mask(level));
293 }
294
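/*
 * Walk the domain's page table down to the last level for addr, allocating
 * and cache-flushing intermediate page-table pages as needed, and return a
 * pointer to the leaf PTE (or NULL if a table page cannot be allocated).
 */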
295 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
296 {
297         int addr_width = agaw_to_width(domain->agaw);
298         struct dma_pte *parent, *pte = NULL;
299         int level = agaw_to_level(domain->agaw);
300         int offset;
301         unsigned long flags;
302
303         BUG_ON(!domain->pgd);
304
305         addr &= (((u64)1) << addr_width) - 1;
306         parent = domain->pgd;
307
308         spin_lock_irqsave(&domain->mapping_lock, flags);
309         while (level > 0) {
310                 void *tmp_page;
311
312                 offset = address_level_offset(addr, level);
313                 pte = &parent[offset];
314                 if (level == 1)
315                         break;
316
317                 if (!dma_pte_present(*pte)) {
318                         tmp_page = alloc_pgtable_page();
319
320                         if (!tmp_page) {
321                                 spin_unlock_irqrestore(&domain->mapping_lock,
322                                         flags);
323                                 return NULL;
324                         }
325                         __iommu_flush_cache(domain->iommu, tmp_page,
326                                         PAGE_SIZE_4K);
327                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
328                         /*
329                          * high level table always sets r/w, last level page
330                          * table control read/write
331                          */
332                         dma_set_pte_readable(*pte);
333                         dma_set_pte_writable(*pte);
334                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
335                 }
336                 parent = phys_to_virt(dma_pte_addr(*pte));
337                 level--;
338         }
339
340         spin_unlock_irqrestore(&domain->mapping_lock, flags);
341         return pte;
342 }
343
344 /* return address's pte at specific level */
345 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
346                 int level)
347 {
348         struct dma_pte *parent, *pte = NULL;
349         int total = agaw_to_level(domain->agaw);
350         int offset;
351
352         parent = domain->pgd;
353         while (level <= total) {
354                 offset = address_level_offset(addr, total);
355                 pte = &parent[offset];
356                 if (level == total)
357                         return pte;
358
359                 if (!dma_pte_present(*pte))
360                         break;
361                 parent = phys_to_virt(dma_pte_addr(*pte));
362                 total--;
363         }
364         return NULL;
365 }
366
367 /* clear one page's page table */
368 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
369 {
370         struct dma_pte *pte = NULL;
371
372         /* get last level pte */
373         pte = dma_addr_level_pte(domain, addr, 1);
374
375         if (pte) {
376                 dma_clear_pte(*pte);
377                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
378         }
379 }
380
381 /* clear last level pte, a tlb flush should be followed */
382 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
383 {
384         int addr_width = agaw_to_width(domain->agaw);
385
386         start &= (((u64)1) << addr_width) - 1;
387         end &= (((u64)1) << addr_width) - 1;
388         /* in case it's partial page */
389         start = PAGE_ALIGN_4K(start);
390         end &= PAGE_MASK_4K;
391
392         /* we don't need lock here, nobody else touches the iova range */
393         while (start < end) {
394                 dma_pte_clear_one(domain, start);
395                 start += PAGE_SIZE_4K;
396         }
397 }
398
399 /* free page table pages. last level pte should already be cleared */
400 static void dma_pte_free_pagetable(struct dmar_domain *domain,
401         u64 start, u64 end)
402 {
403         int addr_width = agaw_to_width(domain->agaw);
404         struct dma_pte *pte;
405         int total = agaw_to_level(domain->agaw);
406         int level;
407         u64 tmp;
408
409         start &= (((u64)1) << addr_width) - 1;
410         end &= (((u64)1) << addr_width) - 1;
411
412         /* we don't need lock here, nobody else touches the iova range */
413         level = 2;
414         while (level <= total) {
415                 tmp = align_to_level(start, level);
416                 if (tmp >= end || (tmp + level_size(level) > end))
417                         return;
418
419                 while (tmp < end) {
420                         pte = dma_addr_level_pte(domain, tmp, level);
421                         if (pte) {
422                                 free_pgtable_page(
423                                         phys_to_virt(dma_pte_addr(*pte)));
424                                 dma_clear_pte(*pte);
425                                 __iommu_flush_cache(domain->iommu,
426                                                 pte, sizeof(*pte));
427                         }
428                         tmp += level_size(level);
429                 }
430                 level++;
431         }
432         /* free pgd */
433         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
434                 free_pgtable_page(domain->pgd);
435                 domain->pgd = NULL;
436         }
437 }
438
439 /* iommu handling */
440 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
441 {
442         struct root_entry *root;
443         unsigned long flags;
444
445         root = (struct root_entry *)alloc_pgtable_page();
446         if (!root)
447                 return -ENOMEM;
448
449         __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
450
451         spin_lock_irqsave(&iommu->lock, flags);
452         iommu->root_entry = root;
453         spin_unlock_irqrestore(&iommu->lock, flags);
454
455         return 0;
456 }
457
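/*
 * Busy-wait helper for register-based commands: repeatedly read the given
 * register with 'op' until 'cond' is true, panicking if the hardware does
 * not respond within DMAR_OPERATION_TIMEOUT.
 */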
458 #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
459 {\
460         unsigned long start_time = jiffies;\
461         while (1) {\
462                 sts = op (iommu->reg + offset);\
463                 if (cond)\
464                         break;\
465                 if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
466                         panic("DMAR hardware is malfunctioning\n");\
467                 cpu_relax();\
468         }\
469 }
470
471 static void iommu_set_root_entry(struct intel_iommu *iommu)
472 {
473         void *addr;
474         u32 cmd, sts;
475         unsigned long flag;
476
477         addr = iommu->root_entry;
478
479         spin_lock_irqsave(&iommu->register_lock, flag);
480         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
481
482         cmd = iommu->gcmd | DMA_GCMD_SRTP;
483         writel(cmd, iommu->reg + DMAR_GCMD_REG);
484
485         /* Make sure hardware complete it */
486         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
487                 readl, (sts & DMA_GSTS_RTPS), sts);
488
489         spin_unlock_irqrestore(&iommu->register_lock, flag);
490 }
491
492 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
493 {
494         u32 val;
495         unsigned long flag;
496
497         if (!cap_rwbf(iommu->cap))
498                 return;
499         val = iommu->gcmd | DMA_GCMD_WBF;
500
501         spin_lock_irqsave(&iommu->register_lock, flag);
502         writel(val, iommu->reg + DMAR_GCMD_REG);
503
504         /* Make sure hardware complete it */
505         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
506                         readl, (!(val & DMA_GSTS_WBFS)), val);
507
508         spin_unlock_irqrestore(&iommu->register_lock, flag);
509 }
510
511 /* return value determines if we need a write buffer flush */
512 static int __iommu_flush_context(struct intel_iommu *iommu,
513         u16 did, u16 source_id, u8 function_mask, u64 type,
514         int non_present_entry_flush)
515 {
516         u64 val = 0;
517         unsigned long flag;
518
519         /*
520          * In the non-present entry flush case, if hardware doesn't cache
521          * non-present entry we do nothing and if hardware cache non-present
522          * entry, we flush entries of domain 0 (the domain id is used to cache
523          * any non-present entries)
524          */
525         if (non_present_entry_flush) {
526                 if (!cap_caching_mode(iommu->cap))
527                         return 1;
528                 else
529                         did = 0;
530         }
531
532         switch (type) {
533         case DMA_CCMD_GLOBAL_INVL:
534                 val = DMA_CCMD_GLOBAL_INVL;
535                 break;
536         case DMA_CCMD_DOMAIN_INVL:
537                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
538                 break;
539         case DMA_CCMD_DEVICE_INVL:
540                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
541                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
542                 break;
543         default:
544                 BUG();
545         }
546         val |= DMA_CCMD_ICC;
547
548         spin_lock_irqsave(&iommu->register_lock, flag);
549         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
550
551         /* Make sure hardware complete it */
552         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
553                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
554
555         spin_unlock_irqrestore(&iommu->register_lock, flag);
556
557         /* flushing a context entry implicitly flushes the write buffer */
558         return 0;
559 }
560
561 static int inline iommu_flush_context_global(struct intel_iommu *iommu,
562         int non_present_entry_flush)
563 {
564         return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
565                 non_present_entry_flush);
566 }
567
568 static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
569         int non_present_entry_flush)
570 {
571         return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
572                 non_present_entry_flush);
573 }
574
575 static int inline iommu_flush_context_device(struct intel_iommu *iommu,
576         u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
577 {
578         return __iommu_flush_context(iommu, did, source_id, function_mask,
579                 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
580 }
581
582 /* return value determines if we need a write buffer flush */
583 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
584         u64 addr, unsigned int size_order, u64 type,
585         int non_present_entry_flush)
586 {
587         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
588         u64 val = 0, val_iva = 0;
589         unsigned long flag;
590
591         /*
592          * In the non-present entry flush case, if hardware doesn't cache
593          * non-present entry we do nothing and if hardware cache non-present
594          * entry, we flush entries of domain 0 (the domain id is used to cache
595          * any non-present entries)
596          */
597         if (non_present_entry_flush) {
598                 if (!cap_caching_mode(iommu->cap))
599                         return 1;
600                 else
601                         did = 0;
602         }
603
604         switch (type) {
605         case DMA_TLB_GLOBAL_FLUSH:
606                 /* global flush doesn't need set IVA_REG */
607                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
608                 break;
609         case DMA_TLB_DSI_FLUSH:
610                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
611                 break;
612         case DMA_TLB_PSI_FLUSH:
613                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
614                 /* Note: always flush non-leaf currently */
615                 val_iva = size_order | addr;
616                 break;
617         default:
618                 BUG();
619         }
620         /* Note: set drain read/write */
621 #if 0
622         /*
623          * This is probably just being extra cautious; it looks like we can
624          * skip the read drain without any impact.
625          */
626         if (cap_read_drain(iommu->cap))
627                 val |= DMA_TLB_READ_DRAIN;
628 #endif
629         if (cap_write_drain(iommu->cap))
630                 val |= DMA_TLB_WRITE_DRAIN;
631
632         spin_lock_irqsave(&iommu->register_lock, flag);
633         /* Note: Only uses first TLB reg currently */
634         if (val_iva)
635                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
636         dmar_writeq(iommu->reg + tlb_offset + 8, val);
637
638         /* Make sure hardware complete it */
639         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
640                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
641
642         spin_unlock_irqrestore(&iommu->register_lock, flag);
643
644         /* check IOTLB invalidation granularity */
645         if (DMA_TLB_IAIG(val) == 0)
646                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
647         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
648                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
649                         DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
650         /* flushing the IOTLB implicitly flushes the write buffer */
651         return 0;
652 }
653
654 static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
655         int non_present_entry_flush)
656 {
657         return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
658                 non_present_entry_flush);
659 }
660
661 static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
662         int non_present_entry_flush)
663 {
664         return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
665                 non_present_entry_flush);
666 }
667
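/*
 * Return the number of low-order bits in which 'base' and the last page of
 * the range differ, i.e. the address mask order needed for a page-selective
 * invalidation covering [base, base + size).  For example, base pfn 0x100
 * with size 4 yields 2 (a 4-page aligned mask).
 */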
668 static int iommu_get_alignment(u64 base, unsigned int size)
669 {
670         int t = 0;
671         u64 end;
672
673         end = base + size - 1;
674         while (base != end) {
675                 t++;
676                 base >>= 1;
677                 end >>= 1;
678         }
679         return t;
680 }
681
682 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
683         u64 addr, unsigned int pages, int non_present_entry_flush)
684 {
685         unsigned int align;
686
687         BUG_ON(addr & (~PAGE_MASK_4K));
688         BUG_ON(pages == 0);
689
690         /* Fallback to domain selective flush if no PSI support */
691         if (!cap_pgsel_inv(iommu->cap))
692                 return iommu_flush_iotlb_dsi(iommu, did,
693                         non_present_entry_flush);
694
695         /*
696          * PSI requires the invalidation size to be a power-of-two number of
697          * pages, with the base address naturally aligned to that size
698          */
699         align = iommu_get_alignment(addr >> PAGE_SHIFT_4K, pages);
700         /* Fallback to domain selective flush if size is too big */
701         if (align > cap_max_amask_val(iommu->cap))
702                 return iommu_flush_iotlb_dsi(iommu, did,
703                         non_present_entry_flush);
704
705         addr >>= PAGE_SHIFT_4K + align;
706         addr <<= PAGE_SHIFT_4K + align;
707
708         return __iommu_flush_iotlb(iommu, did, addr, align,
709                 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
710 }
711
712 static int iommu_enable_translation(struct intel_iommu *iommu)
713 {
714         u32 sts;
715         unsigned long flags;
716
717         spin_lock_irqsave(&iommu->register_lock, flags);
718         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
719
720         /* Make sure hardware complete it */
721         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
722                 readl, (sts & DMA_GSTS_TES), sts);
723
724         iommu->gcmd |= DMA_GCMD_TE;
725         spin_unlock_irqrestore(&iommu->register_lock, flags);
726         return 0;
727 }
728
729 static int iommu_disable_translation(struct intel_iommu *iommu)
730 {
731         u32 sts;
732         unsigned long flag;
733
734         spin_lock_irqsave(&iommu->register_lock, flag);
735         iommu->gcmd &= ~DMA_GCMD_TE;
736         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
737
738         /* Make sure hardware complete it */
739         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
740                 readl, (!(sts & DMA_GSTS_TES)), sts);
741
742         spin_unlock_irqrestore(&iommu->register_lock, flag);
743         return 0;
744 }
745
746 /* iommu interrupt handling. Most of it is MSI-like. */
747
748 static char *fault_reason_strings[] =
749 {
750         "Software",
751         "Present bit in root entry is clear",
752         "Present bit in context entry is clear",
753         "Invalid context entry",
754         "Access beyond MGAW",
755         "PTE Write access is not set",
756         "PTE Read access is not set",
757         "Next page table ptr is invalid",
758         "Root table address invalid",
759         "Context table ptr is invalid",
760         "non-zero reserved fields in RTP",
761         "non-zero reserved fields in CTP",
762         "non-zero reserved fields in PTE",
763         "Unknown"
764 };
765 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
766
767 char *dmar_get_fault_reason(u8 fault_reason)
768 {
769         if (fault_reason >= MAX_FAULT_REASON_IDX)
770                 return fault_reason_strings[MAX_FAULT_REASON_IDX];
771         else
772                 return fault_reason_strings[fault_reason];
773 }
774
775 void dmar_msi_unmask(unsigned int irq)
776 {
777         struct intel_iommu *iommu = get_irq_data(irq);
778         unsigned long flag;
779
780         /* unmask it */
781         spin_lock_irqsave(&iommu->register_lock, flag);
782         writel(0, iommu->reg + DMAR_FECTL_REG);
783         /* Read a reg to force flush the post write */
784         readl(iommu->reg + DMAR_FECTL_REG);
785         spin_unlock_irqrestore(&iommu->register_lock, flag);
786 }
787
788 void dmar_msi_mask(unsigned int irq)
789 {
790         unsigned long flag;
791         struct intel_iommu *iommu = get_irq_data(irq);
792
793         /* mask it */
794         spin_lock_irqsave(&iommu->register_lock, flag);
795         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
796         /* Read a reg to force flush the post write */
797         readl(iommu->reg + DMAR_FECTL_REG);
798         spin_unlock_irqrestore(&iommu->register_lock, flag);
799 }
800
801 void dmar_msi_write(int irq, struct msi_msg *msg)
802 {
803         struct intel_iommu *iommu = get_irq_data(irq);
804         unsigned long flag;
805
806         spin_lock_irqsave(&iommu->register_lock, flag);
807         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
808         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
809         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
810         spin_unlock_irqrestore(&iommu->register_lock, flag);
811 }
812
813 void dmar_msi_read(int irq, struct msi_msg *msg)
814 {
815         struct intel_iommu *iommu = get_irq_data(irq);
816         unsigned long flag;
817
818         spin_lock_irqsave(&iommu->register_lock, flag);
819         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
820         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
821         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
822         spin_unlock_irqrestore(&iommu->register_lock, flag);
823 }
824
825 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
826                 u8 fault_reason, u16 source_id, u64 addr)
827 {
828         char *reason;
829
830         reason = dmar_get_fault_reason(fault_reason);
831
832         printk(KERN_ERR
833                 "DMAR:[%s] Request device [%02x:%02x.%d] "
834                 "fault addr %llx \n"
835                 "DMAR:[fault reason %02d] %s\n",
836                 (type ? "DMA Read" : "DMA Write"),
837                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
838                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
839         return 0;
840 }
841
842 #define PRIMARY_FAULT_REG_LEN (16)
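/*
 * Each primary fault recording register is 16 bytes: the faulting address
 * is read from offset 0, the source id from the dword at offset 8, and the
 * fault/present flag, fault reason and request type from the dword at
 * offset 12, as decoded below.
 */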
843 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
844 {
845         struct intel_iommu *iommu = dev_id;
846         int reg, fault_index;
847         u32 fault_status;
848         unsigned long flag;
849
850         spin_lock_irqsave(&iommu->register_lock, flag);
851         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
852
853         /* TBD: ignore advanced fault log currently */
854         if (!(fault_status & DMA_FSTS_PPF))
855                 goto clear_overflow;
856
857         fault_index = dma_fsts_fault_record_index(fault_status);
858         reg = cap_fault_reg_offset(iommu->cap);
859         while (1) {
860                 u8 fault_reason;
861                 u16 source_id;
862                 u64 guest_addr;
863                 int type;
864                 u32 data;
865
866                 /* highest 32 bits */
867                 data = readl(iommu->reg + reg +
868                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
869                 if (!(data & DMA_FRCD_F))
870                         break;
871
872                 fault_reason = dma_frcd_fault_reason(data);
873                 type = dma_frcd_type(data);
874
875                 data = readl(iommu->reg + reg +
876                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
877                 source_id = dma_frcd_source_id(data);
878
879                 guest_addr = dmar_readq(iommu->reg + reg +
880                                 fault_index * PRIMARY_FAULT_REG_LEN);
881                 guest_addr = dma_frcd_page_addr(guest_addr);
882                 /* clear the fault */
883                 writel(DMA_FRCD_F, iommu->reg + reg +
884                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
885
886                 spin_unlock_irqrestore(&iommu->register_lock, flag);
887
888                 iommu_page_fault_do_one(iommu, type, fault_reason,
889                                 source_id, guest_addr);
890
891                 fault_index++;
892                 if (fault_index >= cap_num_fault_regs(iommu->cap))
893                         fault_index = 0;
894                 spin_lock_irqsave(&iommu->register_lock, flag);
895         }
896 clear_overflow:
897         /* clear primary fault overflow */
898         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
899         if (fault_status & DMA_FSTS_PFO)
900                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
901
902         spin_unlock_irqrestore(&iommu->register_lock, flag);
903         return IRQ_HANDLED;
904 }
905
906 int dmar_set_interrupt(struct intel_iommu *iommu)
907 {
908         int irq, ret;
909
910         irq = create_irq();
911         if (!irq) {
912                 printk(KERN_ERR "IOMMU: no free vectors\n");
913                 return -EINVAL;
914         }
915
916         set_irq_data(irq, iommu);
917         iommu->irq = irq;
918
919         ret = arch_setup_dmar_msi(irq);
920         if (ret) {
921                 set_irq_data(irq, NULL);
922                 iommu->irq = 0;
923                 destroy_irq(irq);
924                         return ret;
925         }
926
927         /* Make sure any pending faults are cleared before requesting the irq */
928         iommu_page_fault(irq, iommu);
929
930         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
931         if (ret)
932                 printk(KERN_ERR "IOMMU: can't request irq\n");
933         return ret;
934 }
935
936 static int iommu_init_domains(struct intel_iommu *iommu)
937 {
938         unsigned long ndomains;
939         unsigned long nlongs;
940
941         ndomains = cap_ndoms(iommu->cap);
942         pr_debug("Number of Domains supported <%ld>\n", ndomains);
943         nlongs = BITS_TO_LONGS(ndomains);
944
945         /* TBD: there might be 64K domains,
946          * consider other allocation for future chip
947          */
948         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
949         if (!iommu->domain_ids) {
950                 printk(KERN_ERR "Allocating domain id array failed\n");
951                 return -ENOMEM;
952         }
953         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
954                         GFP_KERNEL);
955         if (!iommu->domains) {
956                 printk(KERN_ERR "Allocating domain array failed\n");
957                 kfree(iommu->domain_ids);
958                 return -ENOMEM;
959         }
960
961         /*
962          * if Caching mode is set, then invalid translations are tagged
963          * with domainid 0. Hence we need to pre-allocate it.
964          */
965         if (cap_caching_mode(iommu->cap))
966                 set_bit(0, iommu->domain_ids);
967         return 0;
968 }
969
970 static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
971 {
972         struct intel_iommu *iommu;
973         int ret;
974         int map_size;
975         u32 ver;
976
977         iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
978         if (!iommu)
979                 return NULL;
980         iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
981         if (!iommu->reg) {
982                 printk(KERN_ERR "IOMMU: can't map the region\n");
983                 goto error;
984         }
985         iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
986         iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
987
988         /* the registers might be more than one page */
989         map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
990                 cap_max_fault_reg_offset(iommu->cap));
991         map_size = PAGE_ALIGN_4K(map_size);
992         if (map_size > PAGE_SIZE_4K) {
993                 iounmap(iommu->reg);
994                 iommu->reg = ioremap(drhd->reg_base_addr, map_size);
995                 if (!iommu->reg) {
996                         printk(KERN_ERR "IOMMU: can't map the region\n");
997                         goto error;
998                 }
999         }
1000
1001         ver = readl(iommu->reg + DMAR_VER_REG);
1002         pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
1003                 drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
1004                 iommu->cap, iommu->ecap);
1005         ret = iommu_init_domains(iommu);
1006         if (ret)
1007                 goto error_unmap;
1008         spin_lock_init(&iommu->lock);
1009         spin_lock_init(&iommu->register_lock);
1010
1011         drhd->iommu = iommu;
1012         return iommu;
1013 error_unmap:
1014         iounmap(iommu->reg);
1015         iommu->reg = 0;
1016 error:
1017         kfree(iommu);
1018         return NULL;
1019 }
1020
1021 static void domain_exit(struct dmar_domain *domain);
1022 static void free_iommu(struct intel_iommu *iommu)
1023 {
1024         struct dmar_domain *domain;
1025         int i;
1026
1027         if (!iommu)
1028                 return;
1029
1030         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1031         for (; i < cap_ndoms(iommu->cap); ) {
1032                 domain = iommu->domains[i];
1033                 clear_bit(i, iommu->domain_ids);
1034                 domain_exit(domain);
1035                 i = find_next_bit(iommu->domain_ids,
1036                         cap_ndoms(iommu->cap), i+1);
1037         }
1038
1039         if (iommu->gcmd & DMA_GCMD_TE)
1040                 iommu_disable_translation(iommu);
1041
1042         if (iommu->irq) {
1043                 set_irq_data(iommu->irq, NULL);
1044                 /* This will mask the irq */
1045                 free_irq(iommu->irq, iommu);
1046                 destroy_irq(iommu->irq);
1047         }
1048
1049         kfree(iommu->domains);
1050         kfree(iommu->domain_ids);
1051
1052         /* free context mapping */
1053         free_context_table(iommu);
1054
1055         if (iommu->reg)
1056                 iounmap(iommu->reg);
1057         kfree(iommu);
1058 }
1059
1060 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1061 {
1062         unsigned long num;
1063         unsigned long ndomains;
1064         struct dmar_domain *domain;
1065         unsigned long flags;
1066
1067         domain = alloc_domain_mem();
1068         if (!domain)
1069                 return NULL;
1070
1071         ndomains = cap_ndoms(iommu->cap);
1072
1073         spin_lock_irqsave(&iommu->lock, flags);
1074         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1075         if (num >= ndomains) {
1076                 spin_unlock_irqrestore(&iommu->lock, flags);
1077                 free_domain_mem(domain);
1078                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1079                 return NULL;
1080         }
1081
1082         set_bit(num, iommu->domain_ids);
1083         domain->id = num;
1084         domain->iommu = iommu;
1085         iommu->domains[num] = domain;
1086         spin_unlock_irqrestore(&iommu->lock, flags);
1087
1088         return domain;
1089 }
1090
1091 static void iommu_free_domain(struct dmar_domain *domain)
1092 {
1093         unsigned long flags;
1094
1095         spin_lock_irqsave(&domain->iommu->lock, flags);
1096         clear_bit(domain->id, domain->iommu->domain_ids);
1097         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1098 }
1099
1100 static struct iova_domain reserved_iova_list;
1101
1102 static void dmar_init_reserved_ranges(void)
1103 {
1104         struct pci_dev *pdev = NULL;
1105         struct iova *iova;
1106         int i;
1107         u64 addr, size;
1108
1109         init_iova_domain(&reserved_iova_list);
1110
1111         /* IOAPIC ranges shouldn't be accessed by DMA */
1112         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1113                 IOVA_PFN(IOAPIC_RANGE_END));
1114         if (!iova)
1115                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1116
1117         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1118         for_each_pci_dev(pdev) {
1119                 struct resource *r;
1120
1121                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1122                         r = &pdev->resource[i];
1123                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1124                                 continue;
1125                         addr = r->start;
1126                         addr &= PAGE_MASK_4K;
1127                         size = r->end - addr;
1128                         size = PAGE_ALIGN_4K(size);
1129                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1130                                 IOVA_PFN(size + addr) - 1);
1131                         if (!iova)
1132                                 printk(KERN_ERR "Reserve iova failed\n");
1133                 }
1134         }
1135
1136 }
1137
1138 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1139 {
1140         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1141 }
1142
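/*
 * Round a guest address width up so that the bits above the 12-bit page
 * offset fill whole 9-bit page-table levels, capping the result at 64;
 * e.g. 39 and 48 stay unchanged, while 40 is rounded up to 48.
 */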
1143 static inline int guestwidth_to_adjustwidth(int gaw)
1144 {
1145         int agaw;
1146         int r = (gaw - 12) % 9;
1147
1148         if (r == 0)
1149                 agaw = gaw;
1150         else
1151                 agaw = gaw + 9 - r;
1152         if (agaw > 64)
1153                 agaw = 64;
1154         return agaw;
1155 }
1156
1157 static int domain_init(struct dmar_domain *domain, int guest_width)
1158 {
1159         struct intel_iommu *iommu;
1160         int adjust_width, agaw;
1161         unsigned long sagaw;
1162
1163         init_iova_domain(&domain->iovad);
1164         spin_lock_init(&domain->mapping_lock);
1165
1166         domain_reserve_special_ranges(domain);
1167
1168         /* calculate AGAW */
1169         iommu = domain->iommu;
1170         if (guest_width > cap_mgaw(iommu->cap))
1171                 guest_width = cap_mgaw(iommu->cap);
1172         domain->gaw = guest_width;
1173         adjust_width = guestwidth_to_adjustwidth(guest_width);
1174         agaw = width_to_agaw(adjust_width);
1175         sagaw = cap_sagaw(iommu->cap);
1176         if (!test_bit(agaw, &sagaw)) {
1177                 /* hardware doesn't support it, choose a bigger one */
1178                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1179                 agaw = find_next_bit(&sagaw, 5, agaw);
1180                 if (agaw >= 5)
1181                         return -ENODEV;
1182         }
1183         domain->agaw = agaw;
1184         INIT_LIST_HEAD(&domain->devices);
1185
1186         /* always allocate the top pgd */
1187         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1188         if (!domain->pgd)
1189                 return -ENOMEM;
1190         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1191         return 0;
1192 }
1193
1194 static void domain_exit(struct dmar_domain *domain)
1195 {
1196         u64 end;
1197
1198         /* Domain 0 is reserved, so don't process it */
1199         if (!domain)
1200                 return;
1201
1202         domain_remove_dev_info(domain);
1203         /* destroy iovas */
1204         put_iova_domain(&domain->iovad);
1205         end = DOMAIN_MAX_ADDR(domain->gaw);
1206         end = end & (~PAGE_MASK_4K);
1207
1208         /* clear ptes */
1209         dma_pte_clear_range(domain, 0, end);
1210
1211         /* free page tables */
1212         dma_pte_free_pagetable(domain, 0, end);
1213
1214         iommu_free_domain(domain);
1215         free_domain_mem(domain);
1216 }
1217
1218 static int domain_context_mapping_one(struct dmar_domain *domain,
1219                 u8 bus, u8 devfn)
1220 {
1221         struct context_entry *context;
1222         struct intel_iommu *iommu = domain->iommu;
1223         unsigned long flags;
1224
1225         pr_debug("Set context mapping for %02x:%02x.%d\n",
1226                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1227         BUG_ON(!domain->pgd);
1228         context = device_to_context_entry(iommu, bus, devfn);
1229         if (!context)
1230                 return -ENOMEM;
1231         spin_lock_irqsave(&iommu->lock, flags);
1232         if (context_present(*context)) {
1233                 spin_unlock_irqrestore(&iommu->lock, flags);
1234                 return 0;
1235         }
1236
1237         context_set_domain_id(*context, domain->id);
1238         context_set_address_width(*context, domain->agaw);
1239         context_set_address_root(*context, virt_to_phys(domain->pgd));
1240         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1241         context_set_fault_enable(*context);
1242         context_set_present(*context);
1243         __iommu_flush_cache(iommu, context, sizeof(*context));
1244
1245         /* it's a non-present to present mapping */
1246         if (iommu_flush_context_device(iommu, domain->id,
1247                         (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1248                 iommu_flush_write_buffer(iommu);
1249         else
1250                 iommu_flush_iotlb_dsi(iommu, 0, 0);
1251         spin_unlock_irqrestore(&iommu->lock, flags);
1252         return 0;
1253 }
1254
1255 static int
1256 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1257 {
1258         int ret;
1259         struct pci_dev *tmp, *parent;
1260
1261         ret = domain_context_mapping_one(domain, pdev->bus->number,
1262                 pdev->devfn);
1263         if (ret)
1264                 return ret;
1265
1266         /* dependent device mapping */
1267         tmp = pci_find_upstream_pcie_bridge(pdev);
1268         if (!tmp)
1269                 return 0;
1270         /* Secondary interface's bus number and devfn 0 */
1271         parent = pdev->bus->self;
1272         while (parent != tmp) {
1273                 ret = domain_context_mapping_one(domain, parent->bus->number,
1274                         parent->devfn);
1275                 if (ret)
1276                         return ret;
1277                 parent = parent->bus->self;
1278         }
1279         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1280                 return domain_context_mapping_one(domain,
1281                         tmp->subordinate->number, 0);
1282         else /* this is a legacy PCI bridge */
1283                 return domain_context_mapping_one(domain,
1284                         tmp->bus->number, tmp->devfn);
1285 }
1286
1287 static int domain_context_mapped(struct dmar_domain *domain,
1288         struct pci_dev *pdev)
1289 {
1290         int ret;
1291         struct pci_dev *tmp, *parent;
1292
1293         ret = device_context_mapped(domain->iommu,
1294                 pdev->bus->number, pdev->devfn);
1295         if (!ret)
1296                 return ret;
1297         /* dependent device mapping */
1298         tmp = pci_find_upstream_pcie_bridge(pdev);
1299         if (!tmp)
1300                 return ret;
1301         /* Secondary interface's bus number and devfn 0 */
1302         parent = pdev->bus->self;
1303         while (parent != tmp) {
1304                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1305                         parent->devfn);
1306                 if (!ret)
1307                         return ret;
1308                 parent = parent->bus->self;
1309         }
1310         if (tmp->is_pcie)
1311                 return device_context_mapped(domain->iommu,
1312                         tmp->subordinate->number, 0);
1313         else
1314                 return device_context_mapped(domain->iommu,
1315                         tmp->bus->number, tmp->devfn);
1316 }
1317
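/*
 * Map the host physical range [hpa, hpa + size) at IO virtual address
 * 'iova' using 4K leaf PTEs with the requested read/write protection;
 * each PTE is cache-flushed so the hardware sees it.
 */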
1318 static int
1319 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1320                         u64 hpa, size_t size, int prot)
1321 {
1322         u64 start_pfn, end_pfn;
1323         struct dma_pte *pte;
1324         int index;
1325
1326         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1327                 return -EINVAL;
1328         iova &= PAGE_MASK_4K;
1329         start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1330         end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1331         index = 0;
1332         while (start_pfn < end_pfn) {
1333                 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1334                 if (!pte)
1335                         return -ENOMEM;
1336                 /* We don't need lock here, nobody else
1337                  * touches the iova range
1338                  */
1339                 BUG_ON(dma_pte_addr(*pte));
1340                 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1341                 dma_set_pte_prot(*pte, prot);
1342                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1343                 start_pfn++;
1344                 index++;
1345         }
1346         return 0;
1347 }
1348
1349 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1350 {
1351         clear_context_table(domain->iommu, bus, devfn);
1352         iommu_flush_context_global(domain->iommu, 0);
1353         iommu_flush_iotlb_global(domain->iommu, 0);
1354 }
1355
1356 static void domain_remove_dev_info(struct dmar_domain *domain)
1357 {
1358         struct device_domain_info *info;
1359         unsigned long flags;
1360
1361         spin_lock_irqsave(&device_domain_lock, flags);
1362         while (!list_empty(&domain->devices)) {
1363                 info = list_entry(domain->devices.next,
1364                         struct device_domain_info, link);
1365                 list_del(&info->link);
1366                 list_del(&info->global);
1367                 if (info->dev)
1368                         info->dev->sysdata = NULL;
1369                 spin_unlock_irqrestore(&device_domain_lock, flags);
1370
1371                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1372                 free_devinfo_mem(info);
1373
1374                 spin_lock_irqsave(&device_domain_lock, flags);
1375         }
1376         spin_unlock_irqrestore(&device_domain_lock, flags);
1377 }
1378
1379 /*
1380  * find_domain
1381  * Note: we use struct pci_dev->sysdata to store the device_domain_info
1382  */
1383 struct dmar_domain *
1384 find_domain(struct pci_dev *pdev)
1385 {
1386         struct device_domain_info *info;
1387
1388         /* No lock here, assumes no domain exit in normal case */
1389         info = pdev->sysdata;
1390         if (info)
1391                 return info->domain;
1392         return NULL;
1393 }
1394
1395 static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
1396      struct pci_dev *dev)
1397 {
1398         int index;
1399
1400         while (dev) {
1401                 for (index = 0; index < cnt; index ++)
1402                         if (dev == devices[index])
1403                                 return 1;
1404
1405                 /* Check our parent */
1406                 dev = dev->bus->self;
1407         }
1408
1409         return 0;
1410 }
1411
1412 static struct dmar_drhd_unit *
1413 dmar_find_matched_drhd_unit(struct pci_dev *dev)
1414 {
1415         struct dmar_drhd_unit *drhd = NULL;
1416
1417         list_for_each_entry(drhd, &dmar_drhd_units, list) {
1418                 if (drhd->include_all || dmar_pci_device_match(drhd->devices,
1419                                                 drhd->devices_cnt, dev))
1420                         return drhd;
1421         }
1422
1423         return NULL;
1424 }
1425
1426 /* domain is initialized */
1427 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1428 {
1429         struct dmar_domain *domain, *found = NULL;
1430         struct intel_iommu *iommu;
1431         struct dmar_drhd_unit *drhd;
1432         struct device_domain_info *info, *tmp;
1433         struct pci_dev *dev_tmp;
1434         unsigned long flags;
1435         int bus = 0, devfn = 0;
1436
1437         domain = find_domain(pdev);
1438         if (domain)
1439                 return domain;
1440
1441         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1442         if (dev_tmp) {
1443                 if (dev_tmp->is_pcie) {
1444                         bus = dev_tmp->subordinate->number;
1445                         devfn = 0;
1446                 } else {
1447                         bus = dev_tmp->bus->number;
1448                         devfn = dev_tmp->devfn;
1449                 }
1450                 spin_lock_irqsave(&device_domain_lock, flags);
1451                 list_for_each_entry(info, &device_domain_list, global) {
1452                         if (info->bus == bus && info->devfn == devfn) {
1453                                 found = info->domain;
1454                                 break;
1455                         }
1456                 }
1457                 spin_unlock_irqrestore(&device_domain_lock, flags);
1458                 /* pcie-pci bridge already has a domain, use it */
1459                 if (found) {
1460                         domain = found;
1461                         goto found_domain;
1462                 }
1463         }
1464
1465         /* Allocate new domain for the device */
1466         drhd = dmar_find_matched_drhd_unit(pdev);
1467         if (!drhd) {
1468                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1469                         pci_name(pdev));
1470                 return NULL;
1471         }
1472         iommu = drhd->iommu;
1473
1474         domain = iommu_alloc_domain(iommu);
1475         if (!domain)
1476                 goto error;
1477
1478         if (domain_init(domain, gaw)) {
1479                 domain_exit(domain);
1480                 goto error;
1481         }
1482
1483         /* register pcie-to-pci device */
1484         if (dev_tmp) {
1485                 info = alloc_devinfo_mem();
1486                 if (!info) {
1487                         domain_exit(domain);
1488                         goto error;
1489                 }
1490                 info->bus = bus;
1491                 info->devfn = devfn;
1492                 info->dev = NULL;
1493                 info->domain = domain;
1494                 /* This domain is shared by devices under p2p bridge */
1495                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1496
1497                 /* pcie-to-pci bridge already has a domain, use it */
1498                 found = NULL;
1499                 spin_lock_irqsave(&device_domain_lock, flags);
1500                 list_for_each_entry(tmp, &device_domain_list, global) {
1501                         if (tmp->bus == bus && tmp->devfn == devfn) {
1502                                 found = tmp->domain;
1503                                 break;
1504                         }
1505                 }
1506                 if (found) {
1507                         free_devinfo_mem(info);
1508                         domain_exit(domain);
1509                         domain = found;
1510                 } else {
1511                         list_add(&info->link, &domain->devices);
1512                         list_add(&info->global, &device_domain_list);
1513                 }
1514                 spin_unlock_irqrestore(&device_domain_lock, flags);
1515         }
1516
1517 found_domain:
1518         info = alloc_devinfo_mem();
1519         if (!info)
1520                 goto error;
1521         info->bus = pdev->bus->number;
1522         info->devfn = pdev->devfn;
1523         info->dev = pdev;
1524         info->domain = domain;
1525         spin_lock_irqsave(&device_domain_lock, flags);
1526         /* somebody else raced us and set up the domain first */
1527         found = find_domain(pdev);
1528         if (found != NULL) {
1529                 spin_unlock_irqrestore(&device_domain_lock, flags);
1530                 if (found != domain) {
1531                         domain_exit(domain);
1532                         domain = found;
1533                 }
1534                 free_devinfo_mem(info);
1535                 return domain;
1536         }
1537         list_add(&info->link, &domain->devices);
1538         list_add(&info->global, &device_domain_list);
1539         pdev->sysdata = info;
1540         spin_unlock_irqrestore(&device_domain_lock, flags);
1541         return domain;
1542 error:
1543         /* recheck it here, maybe others set it */
1544         return find_domain(pdev);
1545 }
1546
1547 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1548 {
1549         struct dmar_domain *domain;
1550         unsigned long size;
1551         u64 base;
1552         int ret;
1553
1554         printk(KERN_INFO
1555                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1556                 pci_name(pdev), start, end);
1557         /* page table init */
1558         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1559         if (!domain)
1560                 return -ENOMEM;
1561
1562         /* The address might not be aligned */
1563         base = start & PAGE_MASK_4K;
1564         size = end - base;
1565         size = PAGE_ALIGN_4K(size);
1566         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1567                         IOVA_PFN(base + size) - 1)) {
1568                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1569                 ret = -ENOMEM;
1570                 goto error;
1571         }
1572
1573         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1574                 size, base, pci_name(pdev));
1575         /*
1576          * The RMRR range might overlap a physical memory range,
1577          * so clear it first
1578          */
1579         dma_pte_clear_range(domain, base, base + size);
1580
1581         ret = domain_page_mapping(domain, base, base, size,
1582                 DMA_PTE_READ|DMA_PTE_WRITE);
1583         if (ret)
1584                 goto error;
1585
1586         /* context entry init */
1587         ret = domain_context_mapping(domain, pdev);
1588         if (!ret)
1589                 return 0;
1590 error:
1591         domain_exit(domain);
1592         return ret;
1593
1594 }
1595
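/* Identity-map a device's RMRR (reserved memory region) unless the device
 * already bypasses translation. */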
1596 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1597         struct pci_dev *pdev)
1598 {
1599         if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
1600                 return 0;
1601         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1602                 rmrr->end_address + 1);
1603 }
1604
1605 #ifdef CONFIG_DMAR_GFX_WA
1606 extern int arch_get_ram_range(int slot, u64 *addr, u64 *size);
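/*
 * Graphics workaround: give each graphics device that is not already
 * bypassed a 1:1 mapping of every RAM range reported by
 * arch_get_ram_range().
 */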
1607 static void __init iommu_prepare_gfx_mapping(void)
1608 {
1609         struct pci_dev *pdev = NULL;
1610         u64 base, size;
1611         int slot;
1612         int ret;
1613
1614         for_each_pci_dev(pdev) {
1615                 if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO ||
1616                                 !IS_GFX_DEVICE(pdev))
1617                         continue;
1618                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1619                         pci_name(pdev));
1620                 slot = arch_get_ram_range(0, &base, &size);
1621                 while (slot >= 0) {
1622                         ret = iommu_prepare_identity_map(pdev,
1623                                         base, base + size);
1624                         if (ret)
1625                                 goto error;
1626                         slot = arch_get_ram_range(slot, &base, &size);
1627                 }
1628                 continue;
1629 error:
1630                 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1631         }
1632 }
1633 #endif
1634
1635 #ifdef CONFIG_DMAR_FLOPPY_WA
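/*
 * Floppy workaround: identity-map the low 16MB for the first ISA (LPC)
 * bridge so that legacy ISA DMA (the floppy driver in particular) keeps
 * working once translation is enabled.
 */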
1636 static inline void iommu_prepare_isa(void)
1637 {
1638         struct pci_dev *pdev;
1639         int ret;
1640
1641         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1642         if (!pdev)
1643                 return;
1644
1645         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1646         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1647
1648         if (ret)
1649                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1650                         "floppy might not work\n");
1651
1652 }
1653 #else
1654 static inline void iommu_prepare_isa(void)
1655 {
1656         return;
1657 }
1658 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1659
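/*
 * init_dmars() brings up all DMA remapping hardware units: allocate the
 * per-IOMMU structures and root entries, set up the RMRR, graphics and
 * ISA identity maps, then enable fault reporting and translation on each
 * unit.
 */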
1660 int __init init_dmars(void)
1661 {
1662         struct dmar_drhd_unit *drhd;
1663         struct dmar_rmrr_unit *rmrr;
1664         struct pci_dev *pdev;
1665         struct intel_iommu *iommu;
1666         int ret, unit = 0;
1667
1668         /*
1669          * for each drhd
1670          *    allocate root
1671          *    initialize and program root entry to not present
1672          * endfor
1673          */
1674         for_each_drhd_unit(drhd) {
1675                 if (drhd->ignored)
1676                         continue;
1677                 iommu = alloc_iommu(drhd);
1678                 if (!iommu) {
1679                         ret = -ENOMEM;
1680                         goto error;
1681                 }
1682
1683                 /*
1684                  * TBD:
1685                  * we could share the same root & context tables
1686                  * among all IOMMUs. Need to split it later.
1687                  */
1688                 ret = iommu_alloc_root_entry(iommu);
1689                 if (ret) {
1690                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1691                         goto error;
1692                 }
1693         }
1694
1695         /*
1696          * For each rmrr
1697          *   for each dev attached to rmrr
1698          *   do
1699          *     locate drhd for dev, alloc domain for dev
1700          *     allocate free domain
1701          *     allocate page table entries for rmrr
1702          *     if context not allocated for bus
1703          *           allocate and init context
1704          *           set present in root table for this bus
1705          *     init context with domain, translation etc
1706          *    endfor
1707          * endfor
1708          */
1709         for_each_rmrr_units(rmrr) {
1710                 int i;
1711                 for (i = 0; i < rmrr->devices_cnt; i++) {
1712                         pdev = rmrr->devices[i];
1713                         /* some BIOSes list non-existent devices in the DMAR table */
1714                         if (!pdev)
1715                                 continue;
1716                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1717                         if (ret)
1718                                 printk(KERN_ERR
1719                                  "IOMMU: mapping reserved region failed\n");
1720                 }
1721         }
1722
1723         iommu_prepare_gfx_mapping();
1724
1725         iommu_prepare_isa();
1726
1727         /*
1728          * for each drhd
1729          *   enable fault log
1730          *   global invalidate context cache
1731          *   global invalidate iotlb
1732          *   enable translation
1733          */
1734         for_each_drhd_unit(drhd) {
1735                 if (drhd->ignored)
1736                         continue;
1737                 iommu = drhd->iommu;
1738                 sprintf (iommu->name, "dmar%d", unit++);
1739
1740                 iommu_flush_write_buffer(iommu);
1741
1742                 ret = dmar_set_interrupt(iommu);
1743                 if (ret)
1744                         goto error;
1745
1746                 iommu_set_root_entry(iommu);
1747
1748                 iommu_flush_context_global(iommu, 0);
1749                 iommu_flush_iotlb_global(iommu, 0);
1750
1751                 ret = iommu_enable_translation(iommu);
1752                 if (ret)
1753                         goto error;
1754         }
1755
1756         return 0;
1757 error:
1758         for_each_drhd_unit(drhd) {
1759                 if (drhd->ignored)
1760                         continue;
1761                 iommu = drhd->iommu;
1762                 free_iommu(iommu);
1763         }
1764         return ret;
1765 }
1766
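/* Size of the 4K-aligned region needed to cover host_addr..host_addr+size,
 * including the offset of host_addr within its first page. */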
1767 static inline u64 aligned_size(u64 host_addr, size_t size)
1768 {
1769         u64 addr;
1770         addr = (host_addr & (~PAGE_MASK_4K)) + size;
1771         return PAGE_ALIGN_4K(addr);
1772 }
1773
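/*
 * Allocate I/O virtual address space covering 'size' bytes at 'host_addr'
 * from the domain's iova allocator, constrained to lie below
 * min(end, DOMAIN_MAX_ADDR(gaw)).
 */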
1774 struct iova *
1775 iommu_alloc_iova(struct dmar_domain *domain, void *host_addr, size_t size,
1776                 u64 start, u64 end)
1777 {
1778         u64 start_addr;
1779         struct iova *piova;
1780
1781         /* Make sure it's in range */
1782         if ((start > DOMAIN_MAX_ADDR(domain->gaw)) || end < start)
1783                 return NULL;
1784
1785         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1786         start_addr = PAGE_ALIGN_4K(start);
1787         size = aligned_size((u64)host_addr, size);
1788         if (!size || (start_addr + size > end))
1789                 return NULL;
1790
1791         piova = alloc_iova(&domain->iovad,
1792                         size >> PAGE_SHIFT_4K, IOVA_PFN(end));
1793
1794         return piova;
1795 }
1796
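/*
 * Core mapping path: find (or create) the device's domain, allocate an
 * IOVA range, make sure the context entry is programmed, and install the
 * page table entries.  Returns the DMA address and reports the mapped
 * region via flush_addr/flush_size so the caller can flush the IOTLB.
 */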
1797 static dma_addr_t __intel_map_single(struct device *dev, void *addr,
1798         size_t size, int dir, u64 *flush_addr, unsigned int *flush_size)
1799 {
1800         struct dmar_domain *domain;
1801         struct pci_dev *pdev = to_pci_dev(dev);
1802         int ret;
1803         int prot = 0;
1804         struct iova *iova = NULL;
1805         u64 start_addr;
1806
1807         addr = (void *)virt_to_phys(addr);
1808
1809         domain = get_domain_for_dev(pdev,
1810                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1811         if (!domain) {
1812                 printk(KERN_ERR
1813                         "Allocating domain for %s failed\n", pci_name(pdev));
1814                 return 0;
1815         }
1816
1817         start_addr = IOVA_START_ADDR;
1818
1819         if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1820                 iova = iommu_alloc_iova(domain, addr, size, start_addr,
1821                         pdev->dma_mask);
1822         } else  {
1823                 /*
1824                  * First try to allocate an I/O virtual address in
1825                  * DMA_32BIT_MASK and if that fails then try allocating
1826                  * from the higher range
1827                  */
1828                 iova = iommu_alloc_iova(domain, addr, size, start_addr,
1829                         DMA_32BIT_MASK);
1830                 if (!iova)
1831                         iova = iommu_alloc_iova(domain, addr, size, start_addr,
1832                         pdev->dma_mask);
1833         }
1834
1835         if (!iova) {
1836                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
1837                 return 0;
1838         }
1839
1840         /* make sure context mapping is ok */
1841         if (unlikely(!domain_context_mapped(domain, pdev))) {
1842                 ret = domain_context_mapping(domain, pdev);
1843                 if (ret)
1844                         goto error;
1845         }
1846
1847         /*
1848          * Check if DMAR supports zero-length reads on write-only
1849          * mappings.
1850          */
1851         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
1852                         !cap_zlr(domain->iommu->cap))
1853                 prot |= DMA_PTE_READ;
1854         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1855                 prot |= DMA_PTE_WRITE;
1856         /*
1857          * addr - (addr + size) might be a partial page; we should map the
1858          * whole page.  Note: if two parts of one page are mapped separately,
1859          * we might have two guest addresses mapping to the same host address,
1860          * but this is not a big problem.
1861          */
1862         ret = domain_page_mapping(domain, iova->pfn_lo << PAGE_SHIFT_4K,
1863                 ((u64)addr) & PAGE_MASK_4K,
1864                 (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K, prot);
1865         if (ret)
1866                 goto error;
1867
1868         pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1869                 pci_name(pdev), size, (u64)addr,
1870                 (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K,
1871                 (u64)(iova->pfn_lo << PAGE_SHIFT_4K), dir);
1872
1873         *flush_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1874         *flush_size = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K;
1875         return (iova->pfn_lo << PAGE_SHIFT_4K) + ((u64)addr & (~PAGE_MASK_4K));
1876 error:
1877         __free_iova(&domain->iovad, iova);
1878         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
1879                 pci_name(pdev), size, (u64)addr, dir);
1880         return 0;
1881 }
1882
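/*
 * dma_map_single() entry point.  Devices marked with
 * DUMMY_DEVICE_DOMAIN_INFO bypass translation; otherwise map the buffer
 * and flush the IOTLB for the new mapping, falling back to a write-buffer
 * flush when the flush helper indicates one is needed instead.
 */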
1883 static dma_addr_t intel_map_single(struct device *hwdev, void *addr,
1884         size_t size, int dir)
1885 {
1886         struct pci_dev *pdev = to_pci_dev(hwdev);
1887         dma_addr_t ret;
1888         struct dmar_domain *domain;
1889         u64 flush_addr;
1890         unsigned int flush_size;
1891
1892         BUG_ON(dir == DMA_NONE);
1893         if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
1894                 return virt_to_bus(addr);
1895
1896         ret = __intel_map_single(hwdev, addr, size,
1897                         dir, &flush_addr, &flush_size);
1898         if (ret) {
1899                 domain = find_domain(pdev);
1900                 /* it's a non-present to present mapping */
1901                 if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
1902                                 flush_addr, flush_size >> PAGE_SHIFT_4K, 1))
1903                         iommu_flush_write_buffer(domain->iommu);
1904         }
1905         return ret;
1906 }
1907
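/*
 * Core unmap path: look up the IOVA covering dev_addr, clear and free the
 * page table entries for the whole range, release the IOVA, and report
 * the cleared region via flush_addr/flush_size for the caller's IOTLB
 * flush.
 */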
1908 static void __intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1909         size_t size, int dir, u64 *flush_addr, unsigned int *flush_size)
1910 {
1911         struct dmar_domain *domain;
1912         struct pci_dev *pdev = to_pci_dev(dev);
1913         struct iova *iova;
1914
1915         domain = find_domain(pdev);
1916         BUG_ON(!domain);
1917
1918         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1919         if (!iova) {
1920                 *flush_size = 0;
1921                 return;
1922         }
1923         pr_debug("Device %s unmapping: %lx@%llx\n",
1924                 pci_name(pdev),
1925                 (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K,
1926                 (u64)(iova->pfn_lo << PAGE_SHIFT_4K));
1927
1928         *flush_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1929         *flush_size = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K;
1930         /*  clear the whole page, not just dev_addr - (dev_addr + size) */
1931         dma_pte_clear_range(domain, *flush_addr, *flush_addr + *flush_size);
1932         /* free page tables */
1933         dma_pte_free_pagetable(domain, *flush_addr, *flush_addr + *flush_size);
1934         /* free iova */
1935         __free_iova(&domain->iovad, iova);
1936 }
1937
1938 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1939         size_t size, int dir)
1940 {
1941         struct pci_dev *pdev = to_pci_dev(dev);
1942         struct dmar_domain *domain;
1943         u64 flush_addr;
1944         unsigned int flush_size;
1945
1946         if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
1947                 return;
1948
1949         domain = find_domain(pdev);
1950         __intel_unmap_single(dev, dev_addr, size,
1951                 dir, &flush_addr, &flush_size);
1952         if (flush_size == 0)
1953                 return;
1954         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, flush_addr,
1955                         flush_size >> PAGE_SHIFT_4K, 0))
1956                 iommu_flush_write_buffer(domain->iommu);
1957 }
1958
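/*
 * Coherent allocations are ordinary page allocations mapped
 * DMA_BIDIRECTIONAL through the IOMMU; GFP_DMA/GFP_DMA32 are dropped,
 * presumably because translation removes the device addressing limit.
 */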
1959 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
1960                        dma_addr_t *dma_handle, gfp_t flags)
1961 {
1962         void *vaddr;
1963         int order;
1964
1965         size = PAGE_ALIGN_4K(size);
1966         order = get_order(size);
1967         flags &= ~(GFP_DMA | GFP_DMA32);
1968
1969         vaddr = (void *)__get_free_pages(flags, order);
1970         if (!vaddr)
1971                 return NULL;
1972         memset(vaddr, 0, size);
1973
1974         *dma_handle = intel_map_single(hwdev, vaddr, size, DMA_BIDIRECTIONAL);
1975         if (*dma_handle)
1976                 return vaddr;
1977         free_pages((unsigned long)vaddr, order);
1978         return NULL;
1979 }
1980
1981 static void intel_free_coherent(struct device *hwdev, size_t size,
1982         void *vaddr, dma_addr_t dma_handle)
1983 {
1984         int order;
1985
1986         size = PAGE_ALIGN_4K(size);
1987         order = get_order(size);
1988
1989         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
1990         free_pages((unsigned long)vaddr, order);
1991 }
1992
1993 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sg,
1994         int nelems, int dir)
1995 {
1996         int i;
1997         struct pci_dev *pdev = to_pci_dev(hwdev);
1998         struct dmar_domain *domain;
1999         u64 flush_addr;
2000         unsigned int flush_size;
2001
2002         if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
2003                 return;
2004
2005         domain = find_domain(pdev);
2006         for (i = 0; i < nelems; i++, sg++)
2007                 __intel_unmap_single(hwdev, sg->dma_address,
2008                         sg->dma_length, dir, &flush_addr, &flush_size);
2009
2010         if (iommu_flush_iotlb_dsi(domain->iommu, domain->id, 0))
2011                 iommu_flush_write_buffer(domain->iommu);
2012 }
2013
2014 #define SG_ENT_VIRT_ADDRESS(sg) (page_address((sg)->page) + (sg)->offset)
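/* Scatterlist mapping for devices that bypass the IOMMU: just hand back
 * the bus address of each segment. */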
2015 static int intel_nontranslate_map_sg(struct device *hddev,
2016         struct scatterlist *sg, int nelems, int dir)
2017 {
2018         int i;
2019
2020         for (i = 0; i < nelems; i++) {
2021                 struct scatterlist *s = &sg[i];
2022                 BUG_ON(!s->page);
2023                 s->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(s));
2024                 s->dma_length = s->length;
2025         }
2026         return nelems;
2027 }
2028
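/*
 * dma_map_sg() entry point: map each segment through __intel_map_single()
 * and finish with a single domain-selective IOTLB flush.  If any segment
 * fails, the segments mapped so far are unmapped and 0 is returned.
 */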
2029 static int intel_map_sg(struct device *hwdev, struct scatterlist *sg,
2030         int nelems, int dir)
2031 {
2032         void *addr;
2033         int i;
2034         dma_addr_t dma_handle;
2035         struct pci_dev *pdev = to_pci_dev(hwdev);
2036         struct dmar_domain *domain;
2037         u64 flush_addr;
2038         unsigned int flush_size;
2039
2040         BUG_ON(dir == DMA_NONE);
2041         if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
2042                 return intel_nontranslate_map_sg(hwdev, sg, nelems, dir);
2043
2044         for (i = 0; i < nelems; i++, sg++) {
2045                 addr = SG_ENT_VIRT_ADDRESS(sg);
2046                 dma_handle = __intel_map_single(hwdev, addr,
2047                                 sg->length, dir, &flush_addr, &flush_size);
2048                 if (!dma_handle) {
2049                         intel_unmap_sg(hwdev, sg - i, i, dir);
2050                         sg[0].dma_length = 0;
2051                         return 0;
2052                 }
2053                 sg->dma_address = dma_handle;
2054                 sg->dma_length = sg->length;
2055         }
2056
2057         domain = find_domain(pdev);
2058
2059         /* it's a non-present to present mapping */
2060         if (iommu_flush_iotlb_dsi(domain->iommu, domain->id, 1))
2061                 iommu_flush_write_buffer(domain->iommu);
2062         return nelems;
2063 }
2064
2065 static struct dma_mapping_ops intel_dma_ops = {
2066         .alloc_coherent = intel_alloc_coherent,
2067         .free_coherent = intel_free_coherent,
2068         .map_single = intel_map_single,
2069         .unmap_single = intel_unmap_single,
2070         .map_sg = intel_map_sg,
2071         .unmap_sg = intel_unmap_sg,
2072 };
2073
2074 static inline int iommu_domain_cache_init(void)
2075 {
2076         int ret = 0;
2077
2078         iommu_domain_cache = kmem_cache_create("iommu_domain",
2079                                          sizeof(struct dmar_domain),
2080                                          0,
2081                                          SLAB_HWCACHE_ALIGN,
2082                                          NULL);
2083
2084         if (!iommu_domain_cache) {
2085                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2086                 ret = -ENOMEM;
2087         }
2088
2089         return ret;
2090 }
2091
2092 static inline int iommu_devinfo_cache_init(void)
2093 {
2094         int ret = 0;
2095
2096         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2097                                          sizeof(struct device_domain_info),
2098                                          0,
2099                                          SLAB_HWCACHE_ALIGN,
2100                                          NULL);
2101
2102         if (!iommu_devinfo_cache) {
2103                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2104                 ret = -ENOMEM;
2105         }
2106
2107         return ret;
2108 }
2109
2110 static inline int iommu_iova_cache_init(void)
2111 {
2112         int ret = 0;
2113
2114         iommu_iova_cache = kmem_cache_create("iommu_iova",
2115                                          sizeof(struct iova),
2116                                          0,
2117                                          SLAB_HWCACHE_ALIGN,
2118                                          NULL);
2119
2120         if (!iommu_iova_cache) {
2121                 printk(KERN_ERR "Couldn't create iova cache\n");
2122                 ret = -ENOMEM;
2123         }
2124
2125         return ret;
2126 }
2127
2128 static int __init iommu_init_mempool(void)
2129 {
2130         int ret;
2131         ret = iommu_iova_cache_init();
2132         if (ret)
2133                 return ret;
2134
2135         ret = iommu_domain_cache_init();
2136         if (ret)
2137                 goto domain_error;
2138
2139         ret = iommu_devinfo_cache_init();
2140         if (!ret)
2141                 return ret;
2142
2143         kmem_cache_destroy(iommu_domain_cache);
2144 domain_error:
2145         kmem_cache_destroy(iommu_iova_cache);
2146
2147         return -ENOMEM;
2148 }
2149
2150 static void __init iommu_exit_mempool(void)
2151 {
2152         kmem_cache_destroy(iommu_devinfo_cache);
2153         kmem_cache_destroy(iommu_domain_cache);
2154         kmem_cache_destroy(iommu_iova_cache);
2155
2156 }
2157
2158 void __init detect_intel_iommu(void)
2159 {
2160         if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2161                 return;
2162         if (early_dmar_detect()) {
2163                 iommu_detected = 1;
2164         }
2165 }
2166
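/*
 * Mark DRHD units that can be ignored: units whose device scope contains
 * no present PCI devices, and, when intel_iommu=igfx_off is given, units
 * that only cover graphics devices (their devices are then flagged to
 * bypass translation).
 */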
2167 static void __init init_no_remapping_devices(void)
2168 {
2169         struct dmar_drhd_unit *drhd;
2170
2171         for_each_drhd_unit(drhd) {
2172                 if (!drhd->include_all) {
2173                         int i;
2174                         for (i = 0; i < drhd->devices_cnt; i++)
2175                                 if (drhd->devices[i] != NULL)
2176                                         break;
2177                         /* ignore DMAR unit if no pci devices exist */
2178                         if (i == drhd->devices_cnt)
2179                                 drhd->ignored = 1;
2180                 }
2181         }
2182
2183         if (dmar_map_gfx)
2184                 return;
2185
2186         for_each_drhd_unit(drhd) {
2187                 int i;
2188                 if (drhd->ignored || drhd->include_all)
2189                         continue;
2190
2191                 for (i = 0; i < drhd->devices_cnt; i++)
2192                         if (drhd->devices[i] &&
2193                                 !IS_GFX_DEVICE(drhd->devices[i]))
2194                                 break;
2195
2196                 if (i < drhd->devices_cnt)
2197                         continue;
2198
2199                 /* bypass IOMMU if it is just for gfx devices */
2200                 drhd->ignored = 1;
2201                 for (i = 0; i < drhd->devices_cnt; i++) {
2202                         if (!drhd->devices[i])
2203                                 continue;
2204                         drhd->devices[i]->sysdata = DUMMY_DEVICE_DOMAIN_INFO;
2205                 }
2206         }
2207 }
2208
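/*
 * Main initialization entry point: parse the DMAR table, create the
 * mempools and reserved IOVA ranges, work out which units to ignore,
 * initialize the hardware via init_dmars() and finally install
 * intel_dma_ops as the DMA mapping operations.
 */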
2209 int __init intel_iommu_init(void)
2210 {
2211         int ret = 0;
2212
2213         if (no_iommu || swiotlb || dmar_disabled)
2214                 return -ENODEV;
2215
2216         if (dmar_table_init())
2217                 return  -ENODEV;
2218
2219         iommu_init_mempool();
2220         dmar_init_reserved_ranges();
2221
2222         init_no_remapping_devices();
2223
2224         ret = init_dmars();
2225         if (ret) {
2226                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2227                 put_iova_domain(&reserved_iova_list);
2228                 iommu_exit_mempool();
2229                 return ret;
2230         }
2231         printk(KERN_INFO
2232         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2233
2234         force_iommu = 1;
2235         dma_ops = &intel_dma_ops;
2236         return 0;
2237 }
2238