1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
40 #include "pci.h"
41
42 #define ROOT_SIZE               VTD_PAGE_SIZE
43 #define CONTEXT_SIZE            VTD_PAGE_SIZE
44
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
47
48 #define IOAPIC_RANGE_START      (0xfee00000)
49 #define IOAPIC_RANGE_END        (0xfeefffff)
50 #define IOVA_START_ADDR         (0x1000)
51
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
55
56 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
57 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
58 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
59
60 /* global iommu list, set NULL for ignored DMAR units */
61 static struct intel_iommu **g_iommus;
62
63 /*
64  * 0: Present
65  * 1-11: Reserved
66  * 12-63: Context Ptr (12 - (haw-1))
67  * 64-127: Reserved
68  */
69 struct root_entry {
70         u64     val;
71         u64     rsvd1;
72 };
73 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
74 static inline bool root_present(struct root_entry *root)
75 {
76         return (root->val & 1);
77 }
78 static inline void set_root_present(struct root_entry *root)
79 {
80         root->val |= 1;
81 }
82 static inline void set_root_value(struct root_entry *root, unsigned long value)
83 {
84         root->val |= value & VTD_PAGE_MASK;
85 }
86
87 static inline struct context_entry *
88 get_context_addr_from_root(struct root_entry *root)
89 {
90         return (struct context_entry *)
91                 (root_present(root)?phys_to_virt(
92                 root->val & VTD_PAGE_MASK) :
93                 NULL);
94 }
95
96 /*
97  * low 64 bits:
98  * 0: present
99  * 1: fault processing disable
100  * 2-3: translation type
101  * 12-63: address space root
102  * high 64 bits:
103  * 0-2: address width
104  * 3-6: available
105  * 8-23: domain id
106  */
107 struct context_entry {
108         u64 lo;
109         u64 hi;
110 };
111
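/*
 * The helpers below read and set individual context-entry fields.  Note
 * that most of the setters OR their value into the entry rather than
 * masking the field first, so they are meant to be used on a freshly
 * zeroed entry (see context_clear_entry() and domain_context_mapping_one()).
 */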
112 static inline bool context_present(struct context_entry *context)
113 {
114         return (context->lo & 1);
115 }
116 static inline void context_set_present(struct context_entry *context)
117 {
118         context->lo |= 1;
119 }
120
121 static inline void context_set_fault_enable(struct context_entry *context)
122 {
123         context->lo &= (((u64)-1) << 2) | 1;
124 }
125
126 #define CONTEXT_TT_MULTI_LEVEL 0
127
128 static inline void context_set_translation_type(struct context_entry *context,
129                                                 unsigned long value)
130 {
131         context->lo &= (((u64)-1) << 4) | 3;
132         context->lo |= (value & 3) << 2;
133 }
134
135 static inline void context_set_address_root(struct context_entry *context,
136                                             unsigned long value)
137 {
138         context->lo |= value & VTD_PAGE_MASK;
139 }
140
141 static inline void context_set_address_width(struct context_entry *context,
142                                              unsigned long value)
143 {
144         context->hi |= value & 7;
145 }
146
147 static inline void context_set_domain_id(struct context_entry *context,
148                                          unsigned long value)
149 {
150         context->hi |= (value & ((1 << 16) - 1)) << 8;
151 }
152
153 static inline void context_clear_entry(struct context_entry *context)
154 {
155         context->lo = 0;
156         context->hi = 0;
157 }
158
159 /*
160  * 0: readable
161  * 1: writable
162  * 2-6: reserved
163  * 7: super page
164  * 8-11: available
165  * 12-63: Host physical address
166  */
167 struct dma_pte {
168         u64 val;
169 };
170
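/*
 * Software page-table entry helpers.  A PTE counts as present when either
 * the read or the write permission bit is set; dma_set_pte_prot() rewrites
 * both permission bits at once, while dma_set_pte_addr() only ORs in the
 * page-aligned host physical address.
 */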
171 static inline void dma_clear_pte(struct dma_pte *pte)
172 {
173         pte->val = 0;
174 }
175
176 static inline void dma_set_pte_readable(struct dma_pte *pte)
177 {
178         pte->val |= DMA_PTE_READ;
179 }
180
181 static inline void dma_set_pte_writable(struct dma_pte *pte)
182 {
183         pte->val |= DMA_PTE_WRITE;
184 }
185
186 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
187 {
188         pte->val = (pte->val & ~3) | (prot & 3);
189 }
190
191 static inline u64 dma_pte_addr(struct dma_pte *pte)
192 {
193         return (pte->val & VTD_PAGE_MASK);
194 }
195
196 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
197 {
198         pte->val |= (addr & VTD_PAGE_MASK);
199 }
200
201 static inline bool dma_pte_present(struct dma_pte *pte)
202 {
203         return (pte->val & 3) != 0;
204 }
205
206 /* devices under the same p2p bridge are owned in one domain */
207 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
208
209 struct dmar_domain {
210         int     id;                     /* domain id */
211         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
212
213         struct list_head devices;       /* all devices' list */
214         struct iova_domain iovad;       /* iova's that belong to this domain */
215
216         struct dma_pte  *pgd;           /* virtual address */
217         spinlock_t      mapping_lock;   /* page table lock */
218         int             gaw;            /* max guest address width */
219
220         /* adjusted guest address width, 0 is level 2 30-bit */
221         int             agaw;
222
223         int             flags;          /* flags to find out type of domain */
224
225         int             iommu_coherency;/* indicate coherency of iommu access */
226 };
227
228 /* PCI domain-device relationship */
229 struct device_domain_info {
230         struct list_head link;  /* link to domain siblings */
231         struct list_head global; /* link to global list */
232         u8 bus;                 /* PCI bus number */
233         u8 devfn;               /* PCI devfn number */
234         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
235         struct dmar_domain *domain; /* pointer to domain */
236 };
237
238 static void flush_unmaps_timeout(unsigned long data);
239
240 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
241
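/*
 * State for the deferred (batched) IOTLB flush path: unmapped IOVAs are
 * queued in these tables and flushed either from unmap_timer or once
 * HIGH_WATER_MARK entries have accumulated (unless intel_iommu_strict
 * disables batching).
 */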
242 #define HIGH_WATER_MARK 250
243 struct deferred_flush_tables {
244         int next;
245         struct iova *iova[HIGH_WATER_MARK];
246         struct dmar_domain *domain[HIGH_WATER_MARK];
247 };
248
249 static struct deferred_flush_tables *deferred_flush;
250
251 /* number of registered iommus; used as the size of the iommu indexing bitmaps */
252 static int g_num_of_iommus;
253
254 static DEFINE_SPINLOCK(async_umap_flush_lock);
255 static LIST_HEAD(unmaps_to_do);
256
257 static int timer_on;
258 static long list_size;
259
260 static void domain_remove_dev_info(struct dmar_domain *domain);
261
262 int dmar_disabled;
263 static int __initdata dmar_map_gfx = 1;
264 static int dmar_forcedac;
265 static int intel_iommu_strict;
266
267 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
268 static DEFINE_SPINLOCK(device_domain_lock);
269 static LIST_HEAD(device_domain_list);
270
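/*
 * Parse the "intel_iommu=" kernel command-line parameter.  Recognized
 * comma-separated options: "off" (disable the IOMMU), "igfx_off" (don't map
 * graphics devices), "forcedac" (force DAC, i.e. don't prefer 32-bit DMA
 * addresses) and "strict" (disable batched IOTLB flushing).
 */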
271 static int __init intel_iommu_setup(char *str)
272 {
273         if (!str)
274                 return -EINVAL;
275         while (*str) {
276                 if (!strncmp(str, "off", 3)) {
277                         dmar_disabled = 1;
278                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
279                 } else if (!strncmp(str, "igfx_off", 8)) {
280                         dmar_map_gfx = 0;
281                         printk(KERN_INFO
282                                 "Intel-IOMMU: disable GFX device mapping\n");
283                 } else if (!strncmp(str, "forcedac", 8)) {
284                         printk(KERN_INFO
285                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
286                         dmar_forcedac = 1;
287                 } else if (!strncmp(str, "strict", 6)) {
288                         printk(KERN_INFO
289                                 "Intel-IOMMU: disable batched IOTLB flush\n");
290                         intel_iommu_strict = 1;
291                 }
292
293                 str += strcspn(str, ",");
294                 while (*str == ',')
295                         str++;
296         }
297         return 0;
298 }
299 __setup("intel_iommu=", intel_iommu_setup);
300
301 static struct kmem_cache *iommu_domain_cache;
302 static struct kmem_cache *iommu_devinfo_cache;
303 static struct kmem_cache *iommu_iova_cache;
304
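/*
 * Allocation helpers used on the DMA map/unmap paths, which may run in
 * atomic context.  They temporarily set PF_MEMALLOC so the GFP_ATOMIC
 * allocation can dip into the memory reserves, then restore the caller's
 * original PF_MEMALLOC state.
 */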
305 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
306 {
307         unsigned int flags;
308         void *vaddr;
309
310         /* trying to avoid low memory issues */
311         flags = current->flags & PF_MEMALLOC;
312         current->flags |= PF_MEMALLOC;
313         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
314         current->flags &= (~PF_MEMALLOC | flags);
315         return vaddr;
316 }
317
318
319 static inline void *alloc_pgtable_page(void)
320 {
321         unsigned int flags;
322         void *vaddr;
323
324         /* trying to avoid low memory issues */
325         flags = current->flags & PF_MEMALLOC;
326         current->flags |= PF_MEMALLOC;
327         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
328         current->flags &= (~PF_MEMALLOC | flags);
329         return vaddr;
330 }
331
332 static inline void free_pgtable_page(void *vaddr)
333 {
334         free_page((unsigned long)vaddr);
335 }
336
337 static inline void *alloc_domain_mem(void)
338 {
339         return iommu_kmem_cache_alloc(iommu_domain_cache);
340 }
341
342 static void free_domain_mem(void *vaddr)
343 {
344         kmem_cache_free(iommu_domain_cache, vaddr);
345 }
346
347 static inline void * alloc_devinfo_mem(void)
348 {
349         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
350 }
351
352 static inline void free_devinfo_mem(void *vaddr)
353 {
354         kmem_cache_free(iommu_devinfo_cache, vaddr);
355 }
356
357 struct iova *alloc_iova_mem(void)
358 {
359         return iommu_kmem_cache_alloc(iommu_iova_cache);
360 }
361
362 void free_iova_mem(struct iova *iova)
363 {
364         kmem_cache_free(iommu_iova_cache, iova);
365 }
366
367
368 static inline int width_to_agaw(int width);
369
370 /* calculate agaw for each iommu.
371  * "SAGAW" may be different across iommus, use a default agaw, and
372  * get a supported less agaw for iommus that don't support the default agaw.
373  */
374 int iommu_calculate_agaw(struct intel_iommu *iommu)
375 {
376         unsigned long sagaw;
377         int agaw = -1;
378
379         sagaw = cap_sagaw(iommu->cap);
380         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
381              agaw >= 0; agaw--) {
382                 if (test_bit(agaw, &sagaw))
383                         break;
384         }
385
386         return agaw;
387 }
388
389 /* in native case, each domain is related to only one iommu */
390 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
391 {
392         int iommu_id;
393
394         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
395         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
396                 return NULL;
397
398         return g_iommus[iommu_id];
399 }
400
401 /* "Coherency" capability may be different across iommus */
402 static void domain_update_iommu_coherency(struct dmar_domain *domain)
403 {
404         int i;
405
406         domain->iommu_coherency = 1;
407
408         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
409         for (; i < g_num_of_iommus; ) {
410                 if (!ecap_coherent(g_iommus[i]->ecap)) {
411                         domain->iommu_coherency = 0;
412                         break;
413                 }
414                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
415         }
416 }
417
418 /* Gets context entry for a given bus and devfn */
419 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
420                 u8 bus, u8 devfn)
421 {
422         struct root_entry *root;
423         struct context_entry *context;
424         unsigned long phy_addr;
425         unsigned long flags;
426
427         spin_lock_irqsave(&iommu->lock, flags);
428         root = &iommu->root_entry[bus];
429         context = get_context_addr_from_root(root);
430         if (!context) {
431                 context = (struct context_entry *)alloc_pgtable_page();
432                 if (!context) {
433                         spin_unlock_irqrestore(&iommu->lock, flags);
434                         return NULL;
435                 }
436                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
437                 phy_addr = virt_to_phys((void *)context);
438                 set_root_value(root, phy_addr);
439                 set_root_present(root);
440                 __iommu_flush_cache(iommu, root, sizeof(*root));
441         }
442         spin_unlock_irqrestore(&iommu->lock, flags);
443         return &context[devfn];
444 }
445
446 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
447 {
448         struct root_entry *root;
449         struct context_entry *context;
450         int ret;
451         unsigned long flags;
452
453         spin_lock_irqsave(&iommu->lock, flags);
454         root = &iommu->root_entry[bus];
455         context = get_context_addr_from_root(root);
456         if (!context) {
457                 ret = 0;
458                 goto out;
459         }
460         ret = context_present(&context[devfn]);
461 out:
462         spin_unlock_irqrestore(&iommu->lock, flags);
463         return ret;
464 }
465
466 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
467 {
468         struct root_entry *root;
469         struct context_entry *context;
470         unsigned long flags;
471
472         spin_lock_irqsave(&iommu->lock, flags);
473         root = &iommu->root_entry[bus];
474         context = get_context_addr_from_root(root);
475         if (context) {
476                 context_clear_entry(&context[devfn]);
477                 __iommu_flush_cache(iommu, &context[devfn], \
478                         sizeof(*context));
479         }
480         spin_unlock_irqrestore(&iommu->lock, flags);
481 }
482
483 static void free_context_table(struct intel_iommu *iommu)
484 {
485         struct root_entry *root;
486         int i;
487         unsigned long flags;
488         struct context_entry *context;
489
490         spin_lock_irqsave(&iommu->lock, flags);
491         if (!iommu->root_entry) {
492                 goto out;
493         }
494         for (i = 0; i < ROOT_ENTRY_NR; i++) {
495                 root = &iommu->root_entry[i];
496                 context = get_context_addr_from_root(root);
497                 if (context)
498                         free_pgtable_page(context);
499         }
500         free_pgtable_page(iommu->root_entry);
501         iommu->root_entry = NULL;
502 out:
503         spin_unlock_irqrestore(&iommu->lock, flags);
504 }
505
506 /* page table handling */
507 #define LEVEL_STRIDE            (9)
508 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
509
510 static inline int agaw_to_level(int agaw)
511 {
512         return agaw + 2;
513 }
514
515 static inline int agaw_to_width(int agaw)
516 {
517         return 30 + agaw * LEVEL_STRIDE;
518
519 }
520
521 static inline int width_to_agaw(int width)
522 {
523         return (width - 30) / LEVEL_STRIDE;
524 }
525
526 static inline unsigned int level_to_offset_bits(int level)
527 {
528         return (12 + (level - 1) * LEVEL_STRIDE);
529 }
530
531 static inline int address_level_offset(u64 addr, int level)
532 {
533         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
534 }
535
536 static inline u64 level_mask(int level)
537 {
538         return ((u64)-1 << level_to_offset_bits(level));
539 }
540
541 static inline u64 level_size(int level)
542 {
543         return ((u64)1 << level_to_offset_bits(level));
544 }
545
546 static inline u64 align_to_level(u64 addr, int level)
547 {
548         return ((addr + level_size(level) - 1) & level_mask(level));
549 }
550
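/*
 * Walk the domain's page table from the top level down to the 4KiB leaf
 * for @addr, allocating and flushing intermediate page-table pages as
 * needed.  Returns the level-1 PTE, or NULL if a page-table page could
 * not be allocated.
 */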
551 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
552 {
553         int addr_width = agaw_to_width(domain->agaw);
554         struct dma_pte *parent, *pte = NULL;
555         int level = agaw_to_level(domain->agaw);
556         int offset;
557         unsigned long flags;
558         struct intel_iommu *iommu = domain_get_iommu(domain);
559
560         BUG_ON(!domain->pgd);
561
562         addr &= (((u64)1) << addr_width) - 1;
563         parent = domain->pgd;
564
565         spin_lock_irqsave(&domain->mapping_lock, flags);
566         while (level > 0) {
567                 void *tmp_page;
568
569                 offset = address_level_offset(addr, level);
570                 pte = &parent[offset];
571                 if (level == 1)
572                         break;
573
574                 if (!dma_pte_present(pte)) {
575                         tmp_page = alloc_pgtable_page();
576
577                         if (!tmp_page) {
578                                 spin_unlock_irqrestore(&domain->mapping_lock,
579                                         flags);
580                                 return NULL;
581                         }
582                         __iommu_flush_cache(iommu, tmp_page,
583                                         PAGE_SIZE);
584                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
585                         /*
586                          * high level table always sets r/w, last level page
587                          * table control read/write
588                          */
589                         dma_set_pte_readable(pte);
590                         dma_set_pte_writable(pte);
591                         __iommu_flush_cache(iommu, pte, sizeof(*pte));
592                 }
593                 parent = phys_to_virt(dma_pte_addr(pte));
594                 level--;
595         }
596
597         spin_unlock_irqrestore(&domain->mapping_lock, flags);
598         return pte;
599 }
600
601 /* return address's pte at specific level */
602 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
603                 int level)
604 {
605         struct dma_pte *parent, *pte = NULL;
606         int total = agaw_to_level(domain->agaw);
607         int offset;
608
609         parent = domain->pgd;
610         while (level <= total) {
611                 offset = address_level_offset(addr, total);
612                 pte = &parent[offset];
613                 if (level == total)
614                         return pte;
615
616                 if (!dma_pte_present(pte))
617                         break;
618                 parent = phys_to_virt(dma_pte_addr(pte));
619                 total--;
620         }
621         return NULL;
622 }
623
624 /* clear one page's page table */
625 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
626 {
627         struct dma_pte *pte = NULL;
628         struct intel_iommu *iommu = domain_get_iommu(domain);
629
630         /* get last level pte */
631         pte = dma_addr_level_pte(domain, addr, 1);
632
633         if (pte) {
634                 dma_clear_pte(pte);
635                 __iommu_flush_cache(iommu, pte, sizeof(*pte));
636         }
637 }
638
639 /* clear last level pte, a tlb flush should be followed */
640 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
641 {
642         int addr_width = agaw_to_width(domain->agaw);
643
644         start &= (((u64)1) << addr_width) - 1;
645         end &= (((u64)1) << addr_width) - 1;
646         /* in case it's a partial page */
647         start = PAGE_ALIGN(start);
648         end &= PAGE_MASK;
649
650         /* we don't need lock here, nobody else touches the iova range */
651         while (start < end) {
652                 dma_pte_clear_one(domain, start);
653                 start += VTD_PAGE_SIZE;
654         }
655 }
656
657 /* free page table pages. last level pte should already be cleared */
658 static void dma_pte_free_pagetable(struct dmar_domain *domain,
659         u64 start, u64 end)
660 {
661         int addr_width = agaw_to_width(domain->agaw);
662         struct dma_pte *pte;
663         int total = agaw_to_level(domain->agaw);
664         int level;
665         u64 tmp;
666         struct intel_iommu *iommu = domain_get_iommu(domain);
667
668         start &= (((u64)1) << addr_width) - 1;
669         end &= (((u64)1) << addr_width) - 1;
670
671         /* we don't need lock here, nobody else touches the iova range */
672         level = 2;
673         while (level <= total) {
674                 tmp = align_to_level(start, level);
675                 if (tmp >= end || (tmp + level_size(level) > end))
676                         return;
677
678                 while (tmp < end) {
679                         pte = dma_addr_level_pte(domain, tmp, level);
680                         if (pte) {
681                                 free_pgtable_page(
682                                         phys_to_virt(dma_pte_addr(pte)));
683                                 dma_clear_pte(pte);
684                                 __iommu_flush_cache(iommu,
685                                                 pte, sizeof(*pte));
686                         }
687                         tmp += level_size(level);
688                 }
689                 level++;
690         }
691         /* free pgd */
692         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
693                 free_pgtable_page(domain->pgd);
694                 domain->pgd = NULL;
695         }
696 }
697
698 /* iommu handling */
699 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
700 {
701         struct root_entry *root;
702         unsigned long flags;
703
704         root = (struct root_entry *)alloc_pgtable_page();
705         if (!root)
706                 return -ENOMEM;
707
708         __iommu_flush_cache(iommu, root, ROOT_SIZE);
709
710         spin_lock_irqsave(&iommu->lock, flags);
711         iommu->root_entry = root;
712         spin_unlock_irqrestore(&iommu->lock, flags);
713
714         return 0;
715 }
716
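/*
 * Program the physical address of the root-entry table into DMAR_RTADDR_REG
 * and issue the "Set Root Table Pointer" command, spinning until the status
 * register reports completion.
 */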
717 static void iommu_set_root_entry(struct intel_iommu *iommu)
718 {
719         void *addr;
720         u32 cmd, sts;
721         unsigned long flag;
722
723         addr = iommu->root_entry;
724
725         spin_lock_irqsave(&iommu->register_lock, flag);
726         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
727
728         cmd = iommu->gcmd | DMA_GCMD_SRTP;
729         writel(cmd, iommu->reg + DMAR_GCMD_REG);
730
731         /* Make sure hardware completes it */
732         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
733                 readl, (sts & DMA_GSTS_RTPS), sts);
734
735         spin_unlock_irqrestore(&iommu->register_lock, flag);
736 }
737
738 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
739 {
740         u32 val;
741         unsigned long flag;
742
743         if (!cap_rwbf(iommu->cap))
744                 return;
745         val = iommu->gcmd | DMA_GCMD_WBF;
746
747         spin_lock_irqsave(&iommu->register_lock, flag);
748         writel(val, iommu->reg + DMAR_GCMD_REG);
749
750         /* Make sure hardware completes it */
751         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
752                         readl, (!(val & DMA_GSTS_WBFS)), val);
753
754         spin_unlock_irqrestore(&iommu->register_lock, flag);
755 }
756
757 /* return value determines whether we need a write buffer flush */
758 static int __iommu_flush_context(struct intel_iommu *iommu,
759         u16 did, u16 source_id, u8 function_mask, u64 type,
760         int non_present_entry_flush)
761 {
762         u64 val = 0;
763         unsigned long flag;
764
765         /*
766          * In the non-present entry flush case: if the hardware doesn't cache
767          * non-present entries we do nothing; if it does cache them, we flush
768          * the entries of domain 0 (the domain id used to cache any non-present
769          * entries).
770          */
771         if (non_present_entry_flush) {
772                 if (!cap_caching_mode(iommu->cap))
773                         return 1;
774                 else
775                         did = 0;
776         }
777
778         switch (type) {
779         case DMA_CCMD_GLOBAL_INVL:
780                 val = DMA_CCMD_GLOBAL_INVL;
781                 break;
782         case DMA_CCMD_DOMAIN_INVL:
783                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
784                 break;
785         case DMA_CCMD_DEVICE_INVL:
786                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
787                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
788                 break;
789         default:
790                 BUG();
791         }
792         val |= DMA_CCMD_ICC;
793
794         spin_lock_irqsave(&iommu->register_lock, flag);
795         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
796
797         /* Make sure hardware completes it */
798         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
799                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
800
801         spin_unlock_irqrestore(&iommu->register_lock, flag);
802
803         /* flush context entry will implicitly flush write buffer */
804         return 0;
805 }
806
807 /* return value determines whether we need a write buffer flush */
808 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
809         u64 addr, unsigned int size_order, u64 type,
810         int non_present_entry_flush)
811 {
812         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
813         u64 val = 0, val_iva = 0;
814         unsigned long flag;
815
816         /*
817          * In the non-present entry flush case: if the hardware doesn't cache
818          * non-present entries we do nothing; if it does cache them, we flush
819          * the entries of domain 0 (the domain id used to cache any non-present
820          * entries).
821          */
822         if (non_present_entry_flush) {
823                 if (!cap_caching_mode(iommu->cap))
824                         return 1;
825                 else
826                         did = 0;
827         }
828
829         switch (type) {
830         case DMA_TLB_GLOBAL_FLUSH:
831                 /* global flush doesn't need to set IVA_REG */
832                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
833                 break;
834         case DMA_TLB_DSI_FLUSH:
835                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
836                 break;
837         case DMA_TLB_PSI_FLUSH:
838                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
839                 /* Note: always flush non-leaf currently */
840                 val_iva = size_order | addr;
841                 break;
842         default:
843                 BUG();
844         }
845         /* Note: set drain read/write */
846 #if 0
847         /*
848          * This is probably only needed to be extra safe; it looks like we
849          * can ignore it without any impact.
850          */
851         if (cap_read_drain(iommu->cap))
852                 val |= DMA_TLB_READ_DRAIN;
853 #endif
854         if (cap_write_drain(iommu->cap))
855                 val |= DMA_TLB_WRITE_DRAIN;
856
857         spin_lock_irqsave(&iommu->register_lock, flag);
858         /* Note: Only uses first TLB reg currently */
859         if (val_iva)
860                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
861         dmar_writeq(iommu->reg + tlb_offset + 8, val);
862
863         /* Make sure hardware completes it */
864         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
865                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
866
867         spin_unlock_irqrestore(&iommu->register_lock, flag);
868
869         /* check IOTLB invalidation granularity */
870         if (DMA_TLB_IAIG(val) == 0)
871                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
872         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
873                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
874                         (unsigned long long)DMA_TLB_IIRG(type),
875                         (unsigned long long)DMA_TLB_IAIG(val));
876         /* flush iotlb entry will implicitly flush write buffer */
877         return 0;
878 }
879
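/*
 * Page-selective IOTLB invalidation for @pages pages starting at @addr.
 * Falls back to a domain-selective flush when the hardware lacks PSI
 * support or when the (power-of-two rounded) range exceeds the maximum
 * address mask the hardware can handle.
 */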
880 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
881         u64 addr, unsigned int pages, int non_present_entry_flush)
882 {
883         unsigned int mask;
884
885         BUG_ON(addr & (~VTD_PAGE_MASK));
886         BUG_ON(pages == 0);
887
888         /* Fall back to domain-selective flush if no PSI support */
889         if (!cap_pgsel_inv(iommu->cap))
890                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
891                                                 DMA_TLB_DSI_FLUSH,
892                                                 non_present_entry_flush);
893
894         /*
895          * PSI requires page size to be 2 ^ x, and the base address is naturally
896          * aligned to the size
897          */
898         mask = ilog2(__roundup_pow_of_two(pages));
899         /* Fall back to domain-selective flush if the size is too big */
900         if (mask > cap_max_amask_val(iommu->cap))
901                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
902                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
903
904         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
905                                         DMA_TLB_PSI_FLUSH,
906                                         non_present_entry_flush);
907 }
908
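/*
 * Clear the "Enable Protected Memory" bit and wait for the Protected Region
 * Status bit to drop, so the protected memory regions no longer block DMA
 * once translation is enabled.
 */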
909 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
910 {
911         u32 pmen;
912         unsigned long flags;
913
914         spin_lock_irqsave(&iommu->register_lock, flags);
915         pmen = readl(iommu->reg + DMAR_PMEN_REG);
916         pmen &= ~DMA_PMEN_EPM;
917         writel(pmen, iommu->reg + DMAR_PMEN_REG);
918
919         /* wait for the protected region status bit to clear */
920         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
921                 readl, !(pmen & DMA_PMEN_PRS), pmen);
922
923         spin_unlock_irqrestore(&iommu->register_lock, flags);
924 }
925
926 static int iommu_enable_translation(struct intel_iommu *iommu)
927 {
928         u32 sts;
929         unsigned long flags;
930
931         spin_lock_irqsave(&iommu->register_lock, flags);
932         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
933
934         /* Make sure hardware completes it */
935         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
936                 readl, (sts & DMA_GSTS_TES), sts);
937
938         iommu->gcmd |= DMA_GCMD_TE;
939         spin_unlock_irqrestore(&iommu->register_lock, flags);
940         return 0;
941 }
942
943 static int iommu_disable_translation(struct intel_iommu *iommu)
944 {
945         u32 sts;
946         unsigned long flag;
947
948         spin_lock_irqsave(&iommu->register_lock, flag);
949         iommu->gcmd &= ~DMA_GCMD_TE;
950         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
951
952         /* Make sure hardware completes it */
953         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
954                 readl, (!(sts & DMA_GSTS_TES)), sts);
955
956         spin_unlock_irqrestore(&iommu->register_lock, flag);
957         return 0;
958 }
959
960 /* iommu interrupt handling. Most of it is MSI-like. */
961
962 static const char *fault_reason_strings[] =
963 {
964         "Software",
965         "Present bit in root entry is clear",
966         "Present bit in context entry is clear",
967         "Invalid context entry",
968         "Access beyond MGAW",
969         "PTE Write access is not set",
970         "PTE Read access is not set",
971         "Next page table ptr is invalid",
972         "Root table address invalid",
973         "Context table ptr is invalid",
974         "non-zero reserved fields in RTP",
975         "non-zero reserved fields in CTP",
976         "non-zero reserved fields in PTE",
977 };
978 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
979
980 const char *dmar_get_fault_reason(u8 fault_reason)
981 {
982         if (fault_reason > MAX_FAULT_REASON_IDX)
983                 return "Unknown";
984         else
985                 return fault_reason_strings[fault_reason];
986 }
987
988 void dmar_msi_unmask(unsigned int irq)
989 {
990         struct intel_iommu *iommu = get_irq_data(irq);
991         unsigned long flag;
992
993         /* unmask it */
994         spin_lock_irqsave(&iommu->register_lock, flag);
995         writel(0, iommu->reg + DMAR_FECTL_REG);
996         /* Read a reg to force flush the post write */
997         readl(iommu->reg + DMAR_FECTL_REG);
998         spin_unlock_irqrestore(&iommu->register_lock, flag);
999 }
1000
1001 void dmar_msi_mask(unsigned int irq)
1002 {
1003         unsigned long flag;
1004         struct intel_iommu *iommu = get_irq_data(irq);
1005
1006         /* mask it */
1007         spin_lock_irqsave(&iommu->register_lock, flag);
1008         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1009         /* Read a reg to force flush the post write */
1010         readl(iommu->reg + DMAR_FECTL_REG);
1011         spin_unlock_irqrestore(&iommu->register_lock, flag);
1012 }
1013
1014 void dmar_msi_write(int irq, struct msi_msg *msg)
1015 {
1016         struct intel_iommu *iommu = get_irq_data(irq);
1017         unsigned long flag;
1018
1019         spin_lock_irqsave(&iommu->register_lock, flag);
1020         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1021         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1022         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1023         spin_unlock_irqrestore(&iommu->register_lock, flag);
1024 }
1025
1026 void dmar_msi_read(int irq, struct msi_msg *msg)
1027 {
1028         struct intel_iommu *iommu = get_irq_data(irq);
1029         unsigned long flag;
1030
1031         spin_lock_irqsave(&iommu->register_lock, flag);
1032         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1033         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1034         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1035         spin_unlock_irqrestore(&iommu->register_lock, flag);
1036 }
1037
1038 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1039                 u8 fault_reason, u16 source_id, unsigned long long addr)
1040 {
1041         const char *reason;
1042
1043         reason = dmar_get_fault_reason(fault_reason);
1044
1045         printk(KERN_ERR
1046                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1047                 "fault addr %llx \n"
1048                 "DMAR:[fault reason %02d] %s\n",
1049                 (type ? "DMA Read" : "DMA Write"),
1050                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1051                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1052         return 0;
1053 }
1054
1055 #define PRIMARY_FAULT_REG_LEN (16)
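/*
 * Primary fault interrupt handler: starting from the index reported in the
 * fault status register, walk the 16-byte fault recording registers, log
 * each pending fault, clear its Fault bit, and finally clear any primary
 * fault overflow condition.
 */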
1056 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1057 {
1058         struct intel_iommu *iommu = dev_id;
1059         int reg, fault_index;
1060         u32 fault_status;
1061         unsigned long flag;
1062
1063         spin_lock_irqsave(&iommu->register_lock, flag);
1064         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1065
1066         /* TBD: ignore advanced fault log currently */
1067         if (!(fault_status & DMA_FSTS_PPF))
1068                 goto clear_overflow;
1069
1070         fault_index = dma_fsts_fault_record_index(fault_status);
1071         reg = cap_fault_reg_offset(iommu->cap);
1072         while (1) {
1073                 u8 fault_reason;
1074                 u16 source_id;
1075                 u64 guest_addr;
1076                 int type;
1077                 u32 data;
1078
1079                 /* highest 32 bits */
1080                 data = readl(iommu->reg + reg +
1081                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1082                 if (!(data & DMA_FRCD_F))
1083                         break;
1084
1085                 fault_reason = dma_frcd_fault_reason(data);
1086                 type = dma_frcd_type(data);
1087
1088                 data = readl(iommu->reg + reg +
1089                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1090                 source_id = dma_frcd_source_id(data);
1091
1092                 guest_addr = dmar_readq(iommu->reg + reg +
1093                                 fault_index * PRIMARY_FAULT_REG_LEN);
1094                 guest_addr = dma_frcd_page_addr(guest_addr);
1095                 /* clear the fault */
1096                 writel(DMA_FRCD_F, iommu->reg + reg +
1097                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1098
1099                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1100
1101                 iommu_page_fault_do_one(iommu, type, fault_reason,
1102                                 source_id, guest_addr);
1103
1104                 fault_index++;
1105                 if (fault_index > cap_num_fault_regs(iommu->cap))
1106                         fault_index = 0;
1107                 spin_lock_irqsave(&iommu->register_lock, flag);
1108         }
1109 clear_overflow:
1110         /* clear primary fault overflow */
1111         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1112         if (fault_status & DMA_FSTS_PFO)
1113                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1114
1115         spin_unlock_irqrestore(&iommu->register_lock, flag);
1116         return IRQ_HANDLED;
1117 }
1118
1119 int dmar_set_interrupt(struct intel_iommu *iommu)
1120 {
1121         int irq, ret;
1122
1123         irq = create_irq();
1124         if (!irq) {
1125                 printk(KERN_ERR "IOMMU: no free vectors\n");
1126                 return -EINVAL;
1127         }
1128
1129         set_irq_data(irq, iommu);
1130         iommu->irq = irq;
1131
1132         ret = arch_setup_dmar_msi(irq);
1133         if (ret) {
1134                 set_irq_data(irq, NULL);
1135                 iommu->irq = 0;
1136                 destroy_irq(irq);
1137                 return ret;
1138         }
1139
1140         /* Make sure any pending faults are cleared */
1141         iommu_page_fault(irq, iommu);
1142
1143         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1144         if (ret)
1145                 printk(KERN_ERR "IOMMU: can't request irq\n");
1146         return ret;
1147 }
1148
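/*
 * Allocate the per-iommu domain-id bitmap and the array of domain pointers,
 * sized from the capability register.  When caching mode is set, domain id 0
 * is pre-allocated because the hardware uses it to tag non-present entries.
 */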
1149 static int iommu_init_domains(struct intel_iommu *iommu)
1150 {
1151         unsigned long ndomains;
1152         unsigned long nlongs;
1153
1154         ndomains = cap_ndoms(iommu->cap);
1155         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1156         nlongs = BITS_TO_LONGS(ndomains);
1157
1158         /* TBD: there might be 64K domains,
1159          * consider other allocation for future chip
1160          */
1161         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1162         if (!iommu->domain_ids) {
1163                 printk(KERN_ERR "Allocating domain id array failed\n");
1164                 return -ENOMEM;
1165         }
1166         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1167                         GFP_KERNEL);
1168         if (!iommu->domains) {
1169                 printk(KERN_ERR "Allocating domain array failed\n");
1170                 kfree(iommu->domain_ids);
1171                 return -ENOMEM;
1172         }
1173
1174         spin_lock_init(&iommu->lock);
1175
1176         /*
1177          * if Caching mode is set, then invalid translations are tagged
1178          * with domainid 0. Hence we need to pre-allocate it.
1179          */
1180         if (cap_caching_mode(iommu->cap))
1181                 set_bit(0, iommu->domain_ids);
1182         return 0;
1183 }
1184
1185
1186 static void domain_exit(struct dmar_domain *domain);
1187
1188 void free_dmar_iommu(struct intel_iommu *iommu)
1189 {
1190         struct dmar_domain *domain;
1191         int i;
1192
1193         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1194         for (; i < cap_ndoms(iommu->cap); ) {
1195                 domain = iommu->domains[i];
1196                 clear_bit(i, iommu->domain_ids);
1197                 domain_exit(domain);
1198                 i = find_next_bit(iommu->domain_ids,
1199                         cap_ndoms(iommu->cap), i+1);
1200         }
1201
1202         if (iommu->gcmd & DMA_GCMD_TE)
1203                 iommu_disable_translation(iommu);
1204
1205         if (iommu->irq) {
1206                 set_irq_data(iommu->irq, NULL);
1207                 /* This will mask the irq */
1208                 free_irq(iommu->irq, iommu);
1209                 destroy_irq(iommu->irq);
1210         }
1211
1212         kfree(iommu->domains);
1213         kfree(iommu->domain_ids);
1214
1215         g_iommus[iommu->seq_id] = NULL;
1216
1217         /* if all iommus are freed, free g_iommus */
1218         for (i = 0; i < g_num_of_iommus; i++) {
1219                 if (g_iommus[i])
1220                         break;
1221         }
1222
1223         if (i == g_num_of_iommus)
1224                 kfree(g_iommus);
1225
1226         /* free context mapping */
1227         free_context_table(iommu);
1228 }
1229
1230 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1231 {
1232         unsigned long num;
1233         unsigned long ndomains;
1234         struct dmar_domain *domain;
1235         unsigned long flags;
1236
1237         domain = alloc_domain_mem();
1238         if (!domain)
1239                 return NULL;
1240
1241         ndomains = cap_ndoms(iommu->cap);
1242
1243         spin_lock_irqsave(&iommu->lock, flags);
1244         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1245         if (num >= ndomains) {
1246                 spin_unlock_irqrestore(&iommu->lock, flags);
1247                 free_domain_mem(domain);
1248                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1249                 return NULL;
1250         }
1251
1252         set_bit(num, iommu->domain_ids);
1253         domain->id = num;
1254         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1255         set_bit(iommu->seq_id, &domain->iommu_bmp);
1256         domain->flags = 0;
1257         iommu->domains[num] = domain;
1258         spin_unlock_irqrestore(&iommu->lock, flags);
1259
1260         return domain;
1261 }
1262
1263 static void iommu_free_domain(struct dmar_domain *domain)
1264 {
1265         unsigned long flags;
1266         struct intel_iommu *iommu;
1267
1268         iommu = domain_get_iommu(domain);
1269
1270         spin_lock_irqsave(&iommu->lock, flags);
1271         clear_bit(domain->id, iommu->domain_ids);
1272         spin_unlock_irqrestore(&iommu->lock, flags);
1273 }
1274
1275 static struct iova_domain reserved_iova_list;
1276 static struct lock_class_key reserved_alloc_key;
1277 static struct lock_class_key reserved_rbtree_key;
1278
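/*
 * Build the global reserved-IOVA list: the IOAPIC MMIO window and every PCI
 * device's memory BARs are reserved so they are never handed out as DMA
 * addresses (avoiding unintended peer-to-peer accesses).
 */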
1279 static void dmar_init_reserved_ranges(void)
1280 {
1281         struct pci_dev *pdev = NULL;
1282         struct iova *iova;
1283         int i;
1284         u64 addr, size;
1285
1286         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1287
1288         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1289                 &reserved_alloc_key);
1290         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1291                 &reserved_rbtree_key);
1292
1293         /* IOAPIC ranges shouldn't be accessed by DMA */
1294         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1295                 IOVA_PFN(IOAPIC_RANGE_END));
1296         if (!iova)
1297                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1298
1299         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1300         for_each_pci_dev(pdev) {
1301                 struct resource *r;
1302
1303                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1304                         r = &pdev->resource[i];
1305                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1306                                 continue;
1307                         addr = r->start;
1308                         addr &= PAGE_MASK;
1309                         size = r->end - addr;
1310                         size = PAGE_ALIGN(size);
1311                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1312                                 IOVA_PFN(size + addr) - 1);
1313                         if (!iova)
1314                                 printk(KERN_ERR "Reserve iova failed\n");
1315                 }
1316         }
1317
1318 }
1319
1320 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1321 {
1322         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1323 }
1324
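/*
 * Round the guest address width up so that (gaw - 12) is a whole number of
 * 9-bit page-table levels, capped at 64.  For example, a 36-bit guest width
 * becomes 39 (12 + 3 * 9).
 */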
1325 static inline int guestwidth_to_adjustwidth(int gaw)
1326 {
1327         int agaw;
1328         int r = (gaw - 12) % 9;
1329
1330         if (r == 0)
1331                 agaw = gaw;
1332         else
1333                 agaw = gaw + 9 - r;
1334         if (agaw > 64)
1335                 agaw = 64;
1336         return agaw;
1337 }
1338
1339 static int domain_init(struct dmar_domain *domain, int guest_width)
1340 {
1341         struct intel_iommu *iommu;
1342         int adjust_width, agaw;
1343         unsigned long sagaw;
1344
1345         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1346         spin_lock_init(&domain->mapping_lock);
1347
1348         domain_reserve_special_ranges(domain);
1349
1350         /* calculate AGAW */
1351         iommu = domain_get_iommu(domain);
1352         if (guest_width > cap_mgaw(iommu->cap))
1353                 guest_width = cap_mgaw(iommu->cap);
1354         domain->gaw = guest_width;
1355         adjust_width = guestwidth_to_adjustwidth(guest_width);
1356         agaw = width_to_agaw(adjust_width);
1357         sagaw = cap_sagaw(iommu->cap);
1358         if (!test_bit(agaw, &sagaw)) {
1359                 /* hardware doesn't support it, choose a bigger one */
1360                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1361                 agaw = find_next_bit(&sagaw, 5, agaw);
1362                 if (agaw >= 5)
1363                         return -ENODEV;
1364         }
1365         domain->agaw = agaw;
1366         INIT_LIST_HEAD(&domain->devices);
1367
1368         if (ecap_coherent(iommu->ecap))
1369                 domain->iommu_coherency = 1;
1370         else
1371                 domain->iommu_coherency = 0;
1372
1373         /* always allocate the top pgd */
1374         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1375         if (!domain->pgd)
1376                 return -ENOMEM;
1377         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1378         return 0;
1379 }
1380
1381 static void domain_exit(struct dmar_domain *domain)
1382 {
1383         u64 end;
1384
1385         /* Domain 0 is reserved, so don't process it */
1386         if (!domain)
1387                 return;
1388
1389         domain_remove_dev_info(domain);
1390         /* destroy iovas */
1391         put_iova_domain(&domain->iovad);
1392         end = DOMAIN_MAX_ADDR(domain->gaw);
1393         end = end & (~PAGE_MASK);
1394
1395         /* clear ptes */
1396         dma_pte_clear_range(domain, 0, end);
1397
1398         /* free page tables */
1399         dma_pte_free_pagetable(domain, 0, end);
1400
1401         iommu_free_domain(domain);
1402         free_domain_mem(domain);
1403 }
1404
1405 static int domain_context_mapping_one(struct dmar_domain *domain,
1406                 u8 bus, u8 devfn)
1407 {
1408         struct context_entry *context;
1409         struct intel_iommu *iommu = domain_get_iommu(domain);
1410         unsigned long flags;
1411
1412         pr_debug("Set context mapping for %02x:%02x.%d\n",
1413                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1414         BUG_ON(!domain->pgd);
1415         context = device_to_context_entry(iommu, bus, devfn);
1416         if (!context)
1417                 return -ENOMEM;
1418         spin_lock_irqsave(&iommu->lock, flags);
1419         if (context_present(context)) {
1420                 spin_unlock_irqrestore(&iommu->lock, flags);
1421                 return 0;
1422         }
1423
1424         context_set_domain_id(context, domain->id);
1425         context_set_address_width(context, domain->agaw);
1426         context_set_address_root(context, virt_to_phys(domain->pgd));
1427         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1428         context_set_fault_enable(context);
1429         context_set_present(context);
1430         __iommu_flush_cache(iommu, context, sizeof(*context));
1431
1432         /* it's a non-present to present mapping */
1433         if (iommu->flush.flush_context(iommu, domain->id,
1434                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1435                 DMA_CCMD_DEVICE_INVL, 1))
1436                 iommu_flush_write_buffer(iommu);
1437         else
1438                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1439
1440         spin_unlock_irqrestore(&iommu->lock, flags);
1441         return 0;
1442 }
1443
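/*
 * Install context entries for @pdev and, if the device sits behind a
 * PCIe-to-PCI bridge, for every bridge on the path and for the bridge
 * itself, since DMA from devices behind such a bridge is tagged with the
 * bridge's source id.
 */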
1444 static int
1445 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1446 {
1447         int ret;
1448         struct pci_dev *tmp, *parent;
1449
1450         ret = domain_context_mapping_one(domain, pdev->bus->number,
1451                 pdev->devfn);
1452         if (ret)
1453                 return ret;
1454
1455         /* dependent device mapping */
1456         tmp = pci_find_upstream_pcie_bridge(pdev);
1457         if (!tmp)
1458                 return 0;
1459         /* Secondary interface's bus number and devfn 0 */
1460         parent = pdev->bus->self;
1461         while (parent != tmp) {
1462                 ret = domain_context_mapping_one(domain, parent->bus->number,
1463                         parent->devfn);
1464                 if (ret)
1465                         return ret;
1466                 parent = parent->bus->self;
1467         }
1468         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1469                 return domain_context_mapping_one(domain,
1470                         tmp->subordinate->number, 0);
1471         else /* this is a legacy PCI bridge */
1472                 return domain_context_mapping_one(domain,
1473                         tmp->bus->number, tmp->devfn);
1474 }
1475
1476 static int domain_context_mapped(struct dmar_domain *domain,
1477         struct pci_dev *pdev)
1478 {
1479         int ret;
1480         struct pci_dev *tmp, *parent;
1481         struct intel_iommu *iommu = domain_get_iommu(domain);
1482
1483         ret = device_context_mapped(iommu,
1484                 pdev->bus->number, pdev->devfn);
1485         if (!ret)
1486                 return ret;
1487         /* dependent device mapping */
1488         tmp = pci_find_upstream_pcie_bridge(pdev);
1489         if (!tmp)
1490                 return ret;
1491         /* Secondary interface's bus number and devfn 0 */
1492         parent = pdev->bus->self;
1493         while (parent != tmp) {
1494                 ret = device_context_mapped(iommu, parent->bus->number,
1495                         parent->devfn);
1496                 if (!ret)
1497                         return ret;
1498                 parent = parent->bus->self;
1499         }
1500         if (tmp->is_pcie)
1501                 return device_context_mapped(iommu,
1502                         tmp->subordinate->number, 0);
1503         else
1504                 return device_context_mapped(iommu,
1505                         tmp->bus->number, tmp->devfn);
1506 }
1507
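/*
 * Map the host physical range [hpa, hpa + size) at IOVA @iova, one 4KiB
 * page at a time.  Each leaf PTE must be previously unused (BUG_ON
 * otherwise); the caller is responsible for the subsequent IOTLB flush.
 */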
1508 static int
1509 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1510                         u64 hpa, size_t size, int prot)
1511 {
1512         u64 start_pfn, end_pfn;
1513         struct dma_pte *pte;
1514         int index;
1515         int addr_width = agaw_to_width(domain->agaw);
1516         struct intel_iommu *iommu = domain_get_iommu(domain);
1517
1518         hpa &= (((u64)1) << addr_width) - 1;
1519
1520         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1521                 return -EINVAL;
1522         iova &= PAGE_MASK;
1523         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1524         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1525         index = 0;
1526         while (start_pfn < end_pfn) {
1527                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1528                 if (!pte)
1529                         return -ENOMEM;
1530                 /* We don't need lock here, nobody else
1531                  * touches the iova range
1532                  */
1533                 BUG_ON(dma_pte_addr(pte));
1534                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1535                 dma_set_pte_prot(pte, prot);
1536                 __iommu_flush_cache(iommu, pte, sizeof(*pte));
1537                 start_pfn++;
1538                 index++;
1539         }
1540         return 0;
1541 }
1542
1543 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1544 {
1545         struct intel_iommu *iommu = domain_get_iommu(domain);
1546
1547         clear_context_table(iommu, bus, devfn);
1548         iommu->flush.flush_context(iommu, 0, 0, 0,
1549                                            DMA_CCMD_GLOBAL_INVL, 0);
1550         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1551                                          DMA_TLB_GLOBAL_FLUSH, 0);
1552 }
1553
1554 static void domain_remove_dev_info(struct dmar_domain *domain)
1555 {
1556         struct device_domain_info *info;
1557         unsigned long flags;
1558
1559         spin_lock_irqsave(&device_domain_lock, flags);
1560         while (!list_empty(&domain->devices)) {
1561                 info = list_entry(domain->devices.next,
1562                         struct device_domain_info, link);
1563                 list_del(&info->link);
1564                 list_del(&info->global);
1565                 if (info->dev)
1566                         info->dev->dev.archdata.iommu = NULL;
1567                 spin_unlock_irqrestore(&device_domain_lock, flags);
1568
1569                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1570                 free_devinfo_mem(info);
1571
1572                 spin_lock_irqsave(&device_domain_lock, flags);
1573         }
1574         spin_unlock_irqrestore(&device_domain_lock, flags);
1575 }
1576
1577 /*
1578  * find_domain
1579  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1580  */
1581 static struct dmar_domain *
1582 find_domain(struct pci_dev *pdev)
1583 {
1584         struct device_domain_info *info;
1585
1586         /* No lock here, assumes no domain exit in normal case */
1587         info = pdev->dev.archdata.iommu;
1588         if (info)
1589                 return info->domain;
1590         return NULL;
1591 }
1592
1593 /* domain is initialized */
1594 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1595 {
1596         struct dmar_domain *domain, *found = NULL;
1597         struct intel_iommu *iommu;
1598         struct dmar_drhd_unit *drhd;
1599         struct device_domain_info *info, *tmp;
1600         struct pci_dev *dev_tmp;
1601         unsigned long flags;
1602         int bus = 0, devfn = 0;
1603
1604         domain = find_domain(pdev);
1605         if (domain)
1606                 return domain;
1607
1608         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1609         if (dev_tmp) {
1610                 if (dev_tmp->is_pcie) {
1611                         bus = dev_tmp->subordinate->number;
1612                         devfn = 0;
1613                 } else {
1614                         bus = dev_tmp->bus->number;
1615                         devfn = dev_tmp->devfn;
1616                 }
1617                 spin_lock_irqsave(&device_domain_lock, flags);
1618                 list_for_each_entry(info, &device_domain_list, global) {
1619                         if (info->bus == bus && info->devfn == devfn) {
1620                                 found = info->domain;
1621                                 break;
1622                         }
1623                 }
1624                 spin_unlock_irqrestore(&device_domain_lock, flags);
1625                 /* pcie-pci bridge already has a domain, use it */
1626                 if (found) {
1627                         domain = found;
1628                         goto found_domain;
1629                 }
1630         }
1631
1632         /* Allocate new domain for the device */
1633         drhd = dmar_find_matched_drhd_unit(pdev);
1634         if (!drhd) {
1635                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1636                         pci_name(pdev));
1637                 return NULL;
1638         }
1639         iommu = drhd->iommu;
1640
1641         domain = iommu_alloc_domain(iommu);
1642         if (!domain)
1643                 goto error;
1644
1645         if (domain_init(domain, gaw)) {
1646                 domain_exit(domain);
1647                 goto error;
1648         }
1649
1650         /* register pcie-to-pci device */
1651         if (dev_tmp) {
1652                 info = alloc_devinfo_mem();
1653                 if (!info) {
1654                         domain_exit(domain);
1655                         goto error;
1656                 }
1657                 info->bus = bus;
1658                 info->devfn = devfn;
1659                 info->dev = NULL;
1660                 info->domain = domain;
1661                 /* This domain is shared by devices under p2p bridge */
1662                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1663
1664                 /* pcie-to-pci bridge already has a domain, use it */
1665                 found = NULL;
1666                 spin_lock_irqsave(&device_domain_lock, flags);
1667                 list_for_each_entry(tmp, &device_domain_list, global) {
1668                         if (tmp->bus == bus && tmp->devfn == devfn) {
1669                                 found = tmp->domain;
1670                                 break;
1671                         }
1672                 }
1673                 if (found) {
1674                         free_devinfo_mem(info);
1675                         domain_exit(domain);
1676                         domain = found;
1677                 } else {
1678                         list_add(&info->link, &domain->devices);
1679                         list_add(&info->global, &device_domain_list);
1680                 }
1681                 spin_unlock_irqrestore(&device_domain_lock, flags);
1682         }
1683
1684 found_domain:
1685         info = alloc_devinfo_mem();
1686         if (!info)
1687                 goto error;
1688         info->bus = pdev->bus->number;
1689         info->devfn = pdev->devfn;
1690         info->dev = pdev;
1691         info->domain = domain;
1692         spin_lock_irqsave(&device_domain_lock, flags);
1693         /* somebody else was faster and already set up the domain */
1694         found = find_domain(pdev);
1695         if (found != NULL) {
1696                 spin_unlock_irqrestore(&device_domain_lock, flags);
1697                 if (found != domain) {
1698                         domain_exit(domain);
1699                         domain = found;
1700                 }
1701                 free_devinfo_mem(info);
1702                 return domain;
1703         }
1704         list_add(&info->link, &domain->devices);
1705         list_add(&info->global, &device_domain_list);
1706         pdev->dev.archdata.iommu = info;
1707         spin_unlock_irqrestore(&device_domain_lock, flags);
1708         return domain;
1709 error:
1710         /* recheck it here; somebody else may have set it up meanwhile */
1711         return find_domain(pdev);
1712 }
1713
1714 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1715                                       unsigned long long start,
1716                                       unsigned long long end)
1717 {
1718         struct dmar_domain *domain;
1719         unsigned long size;
1720         unsigned long long base;
1721         int ret;
1722
1723         printk(KERN_INFO
1724                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1725                 pci_name(pdev), start, end);
1726         /* page table init */
1727         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1728         if (!domain)
1729                 return -ENOMEM;
1730
1731         /* The address might not be aligned */
1732         base = start & PAGE_MASK;
1733         size = end - base;
1734         size = PAGE_ALIGN(size);
1735         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1736                         IOVA_PFN(base + size) - 1)) {
1737                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1738                 ret = -ENOMEM;
1739                 goto error;
1740         }
1741
1742         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1743                 size, base, pci_name(pdev));
1744         /*
1745          * The RMRR range might overlap with a physical memory range;
1746          * clear it first.
1747          */
1748         dma_pte_clear_range(domain, base, base + size);
1749
1750         ret = domain_page_mapping(domain, base, base, size,
1751                 DMA_PTE_READ|DMA_PTE_WRITE);
1752         if (ret)
1753                 goto error;
1754
1755         /* context entry init */
1756         ret = domain_context_mapping(domain, pdev);
1757         if (!ret)
1758                 return 0;
1759 error:
1760         domain_exit(domain);
1761         return ret;
1762
1763 }
1764
1765 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1766         struct pci_dev *pdev)
1767 {
1768         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1769                 return 0;
1770         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1771                 rmrr->end_address + 1);
1772 }
1773
1774 #ifdef CONFIG_DMAR_GFX_WA
1775 struct iommu_prepare_data {
1776         struct pci_dev *pdev;
1777         int ret;
1778 };
1779
1780 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1781                                          unsigned long end_pfn, void *datax)
1782 {
1783         struct iommu_prepare_data *data;
1784
1785         data = (struct iommu_prepare_data *)datax;
1786
1787         data->ret = iommu_prepare_identity_map(data->pdev,
1788                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1789         return data->ret;
1790
1791 }
1792
1793 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1794 {
1795         int nid;
1796         struct iommu_prepare_data data;
1797
1798         data.pdev = pdev;
1799         data.ret = 0;
1800
1801         for_each_online_node(nid) {
1802                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1803                 if (data.ret)
1804                         return data.ret;
1805         }
1806         return data.ret;
1807 }
1808
1809 static void __init iommu_prepare_gfx_mapping(void)
1810 {
1811         struct pci_dev *pdev = NULL;
1812         int ret;
1813
1814         for_each_pci_dev(pdev) {
1815                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1816                                 !IS_GFX_DEVICE(pdev))
1817                         continue;
1818                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1819                         pci_name(pdev));
1820                 ret = iommu_prepare_with_active_regions(pdev);
1821                 if (ret)
1822                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1823         }
1824 }
1825 #else /* !CONFIG_DMAR_GFX_WA */
1826 static inline void iommu_prepare_gfx_mapping(void)
1827 {
1828         return;
1829 }
1830 #endif
1831
1832 #ifdef CONFIG_DMAR_FLOPPY_WA
1833 static inline void iommu_prepare_isa(void)
1834 {
1835         struct pci_dev *pdev;
1836         int ret;
1837
1838         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1839         if (!pdev)
1840                 return;
1841
1842         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1843         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1844
1845         if (ret)
1846                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1847                         "floppy might not work\n");
1848
1849 }
1850 #else
1851 static inline void iommu_prepare_isa(void)
1852 {
1853         return;
1854 }
1855 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1856
1857 static int __init init_dmars(void)
1858 {
1859         struct dmar_drhd_unit *drhd;
1860         struct dmar_rmrr_unit *rmrr;
1861         struct pci_dev *pdev;
1862         struct intel_iommu *iommu;
1863         int i, ret, unit = 0;
1864
1865         /*
1866          * for each drhd
1867          *    allocate root
1868          *    initialize and program root entry to not present
1869          * endfor
1870          */
1871         for_each_drhd_unit(drhd) {
1872                 g_num_of_iommus++;
1873                 /*
1874                  * No lock needed: this is only incremented in the single-
1875                  * threaded kernel __init code path; all other accesses are
1876                  * read-only.
1877                  */
1878         }
1879
1880         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1881                         GFP_KERNEL);
1882         if (!g_iommus) {
1883                 printk(KERN_ERR "Allocating global iommu array failed\n");
1884                 ret = -ENOMEM;
1885                 goto error;
1886         }
1887
1888         deferred_flush = kzalloc(g_num_of_iommus *
1889                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1890         if (!deferred_flush) {
1891                 kfree(g_iommus);
1892                 ret = -ENOMEM;
1893                 goto error;
1894         }
1895
1896         for_each_drhd_unit(drhd) {
1897                 if (drhd->ignored)
1898                         continue;
1899
1900                 iommu = drhd->iommu;
1901                 g_iommus[iommu->seq_id] = iommu;
1902
1903                 ret = iommu_init_domains(iommu);
1904                 if (ret)
1905                         goto error;
1906
1907                 /*
1908                  * TBD:
1909                  * we could share the same root & context tables
1910                  * among all IOMMUs. Need to split it later.
1911                  */
1912                 ret = iommu_alloc_root_entry(iommu);
1913                 if (ret) {
1914                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1915                         goto error;
1916                 }
1917         }
1918
1919         for_each_drhd_unit(drhd) {
1920                 if (drhd->ignored)
1921                         continue;
1922
1923                 iommu = drhd->iommu;
1924                 if (dmar_enable_qi(iommu)) {
1925                         /*
1926                          * Queued Invalidation could not be enabled, use
1927                          * Register Based Invalidation instead.
1928                          */
1929                         iommu->flush.flush_context = __iommu_flush_context;
1930                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1931                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1932                                "invalidation\n",
1933                                (unsigned long long)drhd->reg_base_addr);
1934                 } else {
1935                         iommu->flush.flush_context = qi_flush_context;
1936                         iommu->flush.flush_iotlb = qi_flush_iotlb;
1937                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1938                                "invalidation\n",
1939                                (unsigned long long)drhd->reg_base_addr);
1940                 }
1941         }
1942
1943         /*
1944          * For each rmrr
1945          *   for each dev attached to rmrr
1946          *   do
1947          *     locate drhd for dev, alloc domain for dev
1948          *     allocate free domain
1949          *     allocate page table entries for rmrr
1950          *     if context not allocated for bus
1951          *           allocate and init context
1952          *           set present in root table for this bus
1953          *     init context with domain, translation etc
1954          *    endfor
1955          * endfor
1956          */
1957         for_each_rmrr_units(rmrr) {
1958                 for (i = 0; i < rmrr->devices_cnt; i++) {
1959                         pdev = rmrr->devices[i];
1960                         /* some BIOSes list non-existent devices in the DMAR table */
1961                         if (!pdev)
1962                                 continue;
1963                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1964                         if (ret)
1965                                 printk(KERN_ERR
1966                                  "IOMMU: mapping reserved region failed\n");
1967                 }
1968         }
1969
1970         iommu_prepare_gfx_mapping();
1971
1972         iommu_prepare_isa();
1973
1974         /*
1975          * for each drhd
1976          *   enable fault log
1977          *   global invalidate context cache
1978          *   global invalidate iotlb
1979          *   enable translation
1980          */
1981         for_each_drhd_unit(drhd) {
1982                 if (drhd->ignored)
1983                         continue;
1984                 iommu = drhd->iommu;
1985                 sprintf(iommu->name, "dmar%d", unit++);
1986
1987                 iommu_flush_write_buffer(iommu);
1988
1989                 ret = dmar_set_interrupt(iommu);
1990                 if (ret)
1991                         goto error;
1992
1993                 iommu_set_root_entry(iommu);
1994
1995                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
1996                                            0);
1997                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
1998                                          0);
1999                 iommu_disable_protect_mem_regions(iommu);
2000
2001                 ret = iommu_enable_translation(iommu);
2002                 if (ret)
2003                         goto error;
2004         }
2005
2006         return 0;
2007 error:
2008         for_each_drhd_unit(drhd) {
2009                 if (drhd->ignored)
2010                         continue;
2011                 iommu = drhd->iommu;
2012                 free_iommu(iommu);
2013         }
2014         kfree(g_iommus);
2015         return ret;
2016 }
2017
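/*
 * Round @size up so that [host_addr, host_addr + size) covers whole
 * pages.  For example, with 4KiB pages, host_addr = 0x1234 and
 * size = 0x100 gives 0x1000: the 0x334 bytes actually touched still
 * fit within a single page.
 */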
2018 static inline u64 aligned_size(u64 host_addr, size_t size)
2019 {
2020         u64 addr;
2021         addr = (host_addr & (~PAGE_MASK)) + size;
2022         return PAGE_ALIGN(addr);
2023 }
2024
2025 struct iova *
2026 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2027 {
2028         struct iova *piova;
2029
2030         /* Make sure it's in range */
2031         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2032         if (!size || (IOVA_START_ADDR + size > end))
2033                 return NULL;
2034
2035         piova = alloc_iova(&domain->iovad,
2036                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2037         return piova;
2038 }
2039
2040 static struct iova *
2041 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2042                    size_t size, u64 dma_mask)
2043 {
2044         struct pci_dev *pdev = to_pci_dev(dev);
2045         struct iova *iova = NULL;
2046
2047         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2048                 iova = iommu_alloc_iova(domain, size, dma_mask);
2049         else {
2050                 /*
2051                  * First try to allocate an io virtual address in
2052                  * DMA_32BIT_MASK and if that fails then try allocating
2053                  * from higher range
2054                  */
2055                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2056                 if (!iova)
2057                         iova = iommu_alloc_iova(domain, size, dma_mask);
2058         }
2059
2060         if (!iova) {
2061                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2062                 return NULL;
2063         }
2064
2065         return iova;
2066 }
2067
2068 static struct dmar_domain *
2069 get_valid_domain_for_dev(struct pci_dev *pdev)
2070 {
2071         struct dmar_domain *domain;
2072         int ret;
2073
2074         domain = get_domain_for_dev(pdev,
2075                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2076         if (!domain) {
2077                 printk(KERN_ERR
2078                         "Allocating domain for %s failed\n", pci_name(pdev));
2079                 return NULL;
2080         }
2081
2082         /* make sure context mapping is ok */
2083         if (unlikely(!domain_context_mapped(domain, pdev))) {
2084                 ret = domain_context_mapping(domain, pdev);
2085                 if (ret) {
2086                         printk(KERN_ERR
2087                                 "Domain context map for %s failed\n",
2088                                 pci_name(pdev));
2089                         return NULL;
2090                 }
2091         }
2092
2093         return domain;
2094 }
2095
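/*
 * Core single-mapping path: look up (or set up) the device's domain,
 * allocate an iova big enough for the page-aligned request, install the
 * page table entries and flush the IOTLB for the new mapping.  Devices
 * marked with DUMMY_DEVICE_DOMAIN_INFO bypass translation and simply get
 * @paddr back.  Returns the bus address for the device, or 0 on failure.
 */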
2096 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2097                                      size_t size, int dir, u64 dma_mask)
2098 {
2099         struct pci_dev *pdev = to_pci_dev(hwdev);
2100         struct dmar_domain *domain;
2101         phys_addr_t start_paddr;
2102         struct iova *iova;
2103         int prot = 0;
2104         int ret;
2105         struct intel_iommu *iommu;
2106
2107         BUG_ON(dir == DMA_NONE);
2108         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2109                 return paddr;
2110
2111         domain = get_valid_domain_for_dev(pdev);
2112         if (!domain)
2113                 return 0;
2114
2115         iommu = domain_get_iommu(domain);
2116         size = aligned_size((u64)paddr, size);
2117
2118         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2119         if (!iova)
2120                 goto error;
2121
2122         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2123
2124         /*
2125          * Check if the DMAR supports zero-length reads on write-only
2126          * mappings.
2127          */
2128         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2129                         !cap_zlr(iommu->cap))
2130                 prot |= DMA_PTE_READ;
2131         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2132                 prot |= DMA_PTE_WRITE;
2133         /*
2134          * paddr to (paddr + size) might cover only part of a page, but we
2135          * should map the whole page.  Note: if two parts of one page are
2136          * separately mapped, we might end up with two guest addresses
2137          * mapping to the same host paddr, but this is not a big problem.
2138          */
2139         ret = domain_page_mapping(domain, start_paddr,
2140                 ((u64)paddr) & PAGE_MASK, size, prot);
2141         if (ret)
2142                 goto error;
2143
2144         /* it's a non-present to present mapping */
2145         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2146                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2147         if (ret)
2148                 iommu_flush_write_buffer(iommu);
2149
2150         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2151
2152 error:
2153         if (iova)
2154                 __free_iova(&domain->iovad, iova);
2155         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2156                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2157         return 0;
2158 }
2159
2160 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2161                             size_t size, int dir)
2162 {
2163         return __intel_map_single(hwdev, paddr, size, dir,
2164                                   to_pci_dev(hwdev)->dma_mask);
2165 }
2166
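/*
 * Deferred-unmap machinery: instead of flushing the IOTLB on every
 * unmap, add_unmap() queues the freed iova per IOMMU and flush_unmaps()
 * later performs one global IOTLB flush per IOMMU before releasing the
 * queued iovas.  A 10ms timer (unmap_timer) and HIGH_WATER_MARK bound
 * how long and how many entries may stay queued.
 */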
2167 static void flush_unmaps(void)
2168 {
2169         int i, j;
2170
2171         timer_on = 0;
2172
2173         /* just flush them all */
2174         for (i = 0; i < g_num_of_iommus; i++) {
2175                 struct intel_iommu *iommu = g_iommus[i];
2176                 if (!iommu)
2177                         continue;
2178
2179                 if (deferred_flush[i].next) {
2180                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2181                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2182                         for (j = 0; j < deferred_flush[i].next; j++) {
2183                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2184                                                 deferred_flush[i].iova[j]);
2185                         }
2186                         deferred_flush[i].next = 0;
2187                 }
2188         }
2189
2190         list_size = 0;
2191 }
2192
2193 static void flush_unmaps_timeout(unsigned long data)
2194 {
2195         unsigned long flags;
2196
2197         spin_lock_irqsave(&async_umap_flush_lock, flags);
2198         flush_unmaps();
2199         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2200 }
2201
2202 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2203 {
2204         unsigned long flags;
2205         int next, iommu_id;
2206         struct intel_iommu *iommu;
2207
2208         spin_lock_irqsave(&async_umap_flush_lock, flags);
2209         if (list_size == HIGH_WATER_MARK)
2210                 flush_unmaps();
2211
2212         iommu = domain_get_iommu(dom);
2213         iommu_id = iommu->seq_id;
2214
2215         next = deferred_flush[iommu_id].next;
2216         deferred_flush[iommu_id].domain[next] = dom;
2217         deferred_flush[iommu_id].iova[next] = iova;
2218         deferred_flush[iommu_id].next++;
2219
2220         if (!timer_on) {
2221                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2222                 timer_on = 1;
2223         }
2224         list_size++;
2225         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2226 }
2227
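/*
 * Unmap a single mapping: clear and free the page table range, then
 * either flush the IOTLB right away (intel_iommu_strict) or hand the
 * iova to the batched flush path above.
 */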
2228 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2229                         int dir)
2230 {
2231         struct pci_dev *pdev = to_pci_dev(dev);
2232         struct dmar_domain *domain;
2233         unsigned long start_addr;
2234         struct iova *iova;
2235         struct intel_iommu *iommu;
2236
2237         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2238                 return;
2239         domain = find_domain(pdev);
2240         BUG_ON(!domain);
2241
2242         iommu = domain_get_iommu(domain);
2243
2244         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2245         if (!iova)
2246                 return;
2247
2248         start_addr = iova->pfn_lo << PAGE_SHIFT;
2249         size = aligned_size((u64)dev_addr, size);
2250
2251         pr_debug("Device %s unmapping: %lx@%llx\n",
2252                 pci_name(pdev), size, (unsigned long long)start_addr);
2253
2254         /*  clear the whole page */
2255         dma_pte_clear_range(domain, start_addr, start_addr + size);
2256         /* free page tables */
2257         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2258         if (intel_iommu_strict) {
2259                 if (iommu_flush_iotlb_psi(iommu,
2260                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2261                         iommu_flush_write_buffer(iommu);
2262                 /* free iova */
2263                 __free_iova(&domain->iovad, iova);
2264         } else {
2265                 add_unmap(domain, iova);
2266                 /*
2267                  * queue up the release of the unmap to save the ~1/6th of the
2268                  * CPU time used up by the iotlb flush operation...
2269                  */
2270         }
2271 }
2272
2273 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2274                            dma_addr_t *dma_handle, gfp_t flags)
2275 {
2276         void *vaddr;
2277         int order;
2278
2279         size = PAGE_ALIGN(size);
2280         order = get_order(size);
2281         flags &= ~(GFP_DMA | GFP_DMA32);
2282
2283         vaddr = (void *)__get_free_pages(flags, order);
2284         if (!vaddr)
2285                 return NULL;
2286         memset(vaddr, 0, size);
2287
2288         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2289                                          DMA_BIDIRECTIONAL,
2290                                          hwdev->coherent_dma_mask);
2291         if (*dma_handle)
2292                 return vaddr;
2293         free_pages((unsigned long)vaddr, order);
2294         return NULL;
2295 }
2296
2297 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2298                          dma_addr_t dma_handle)
2299 {
2300         int order;
2301
2302         size = PAGE_ALIGN(size);
2303         order = get_order(size);
2304
2305         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2306         free_pages((unsigned long)vaddr, order);
2307 }
2308
2309 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2310
2311 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2312                     int nelems, int dir)
2313 {
2314         int i;
2315         struct pci_dev *pdev = to_pci_dev(hwdev);
2316         struct dmar_domain *domain;
2317         unsigned long start_addr;
2318         struct iova *iova;
2319         size_t size = 0;
2320         void *addr;
2321         struct scatterlist *sg;
2322         struct intel_iommu *iommu;
2323
2324         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2325                 return;
2326
2327         domain = find_domain(pdev);
2328         BUG_ON(!domain);
2329
2330         iommu = domain_get_iommu(domain);
2331
2332         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2333         if (!iova)
2334                 return;
2335         for_each_sg(sglist, sg, nelems, i) {
2336                 addr = SG_ENT_VIRT_ADDRESS(sg);
2337                 size += aligned_size((u64)addr, sg->length);
2338         }
2339
2340         start_addr = iova->pfn_lo << PAGE_SHIFT;
2341
2342         /*  clear the whole page */
2343         dma_pte_clear_range(domain, start_addr, start_addr + size);
2344         /* free page tables */
2345         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2346
2347         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2348                         size >> VTD_PAGE_SHIFT, 0))
2349                 iommu_flush_write_buffer(iommu);
2350
2351         /* free iova */
2352         __free_iova(&domain->iovad, iova);
2353 }
2354
2355 static int intel_nontranslate_map_sg(struct device *hddev,
2356         struct scatterlist *sglist, int nelems, int dir)
2357 {
2358         int i;
2359         struct scatterlist *sg;
2360
2361         for_each_sg(sglist, sg, nelems, i) {
2362                 BUG_ON(!sg_page(sg));
2363                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2364                 sg->dma_length = sg->length;
2365         }
2366         return nelems;
2367 }
2368
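/*
 * Map a scatterlist into one contiguous iova region: the total aligned
 * length is allocated up front and each segment is then mapped at an
 * increasing offset within that region.  On failure, everything mapped
 * so far is torn down and 0 is returned.
 */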
2369 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2370                  int dir)
2371 {
2372         void *addr;
2373         int i;
2374         struct pci_dev *pdev = to_pci_dev(hwdev);
2375         struct dmar_domain *domain;
2376         size_t size = 0;
2377         int prot = 0;
2378         size_t offset = 0;
2379         struct iova *iova = NULL;
2380         int ret;
2381         struct scatterlist *sg;
2382         unsigned long start_addr;
2383         struct intel_iommu *iommu;
2384
2385         BUG_ON(dir == DMA_NONE);
2386         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2387                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2388
2389         domain = get_valid_domain_for_dev(pdev);
2390         if (!domain)
2391                 return 0;
2392
2393         iommu = domain_get_iommu(domain);
2394
2395         for_each_sg(sglist, sg, nelems, i) {
2396                 addr = SG_ENT_VIRT_ADDRESS(sg);
2397                 addr = (void *)virt_to_phys(addr);
2398                 size += aligned_size((u64)addr, sg->length);
2399         }
2400
2401         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2402         if (!iova) {
2403                 sglist->dma_length = 0;
2404                 return 0;
2405         }
2406
2407         /*
2408          * Check if the DMAR supports zero-length reads on write-only
2409          * mappings.
2410          */
2411         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2412                         !cap_zlr(iommu->cap))
2413                 prot |= DMA_PTE_READ;
2414         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2415                 prot |= DMA_PTE_WRITE;
2416
2417         start_addr = iova->pfn_lo << PAGE_SHIFT;
2418         offset = 0;
2419         for_each_sg(sglist, sg, nelems, i) {
2420                 addr = SG_ENT_VIRT_ADDRESS(sg);
2421                 addr = (void *)virt_to_phys(addr);
2422                 size = aligned_size((u64)addr, sg->length);
2423                 ret = domain_page_mapping(domain, start_addr + offset,
2424                         ((u64)addr) & PAGE_MASK,
2425                         size, prot);
2426                 if (ret) {
2427                         /*  clear the page */
2428                         dma_pte_clear_range(domain, start_addr,
2429                                   start_addr + offset);
2430                         /* free page tables */
2431                         dma_pte_free_pagetable(domain, start_addr,
2432                                   start_addr + offset);
2433                         /* free iova */
2434                         __free_iova(&domain->iovad, iova);
2435                         return 0;
2436                 }
2437                 sg->dma_address = start_addr + offset +
2438                                 ((u64)addr & (~PAGE_MASK));
2439                 sg->dma_length = sg->length;
2440                 offset += size;
2441         }
2442
2443         /* it's a non-present to present mapping */
2444         if (iommu_flush_iotlb_psi(iommu, domain->id,
2445                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2446                 iommu_flush_write_buffer(iommu);
2447         return nelems;
2448 }
2449
2450 static struct dma_mapping_ops intel_dma_ops = {
2451         .alloc_coherent = intel_alloc_coherent,
2452         .free_coherent = intel_free_coherent,
2453         .map_single = intel_map_single,
2454         .unmap_single = intel_unmap_single,
2455         .map_sg = intel_map_sg,
2456         .unmap_sg = intel_unmap_sg,
2457 };
2458
2459 static inline int iommu_domain_cache_init(void)
2460 {
2461         int ret = 0;
2462
2463         iommu_domain_cache = kmem_cache_create("iommu_domain",
2464                                          sizeof(struct dmar_domain),
2465                                          0,
2466                                          SLAB_HWCACHE_ALIGN,
2468                                          NULL);
2469         if (!iommu_domain_cache) {
2470                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2471                 ret = -ENOMEM;
2472         }
2473
2474         return ret;
2475 }
2476
2477 static inline int iommu_devinfo_cache_init(void)
2478 {
2479         int ret = 0;
2480
2481         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2482                                          sizeof(struct device_domain_info),
2483                                          0,
2484                                          SLAB_HWCACHE_ALIGN,
2485                                          NULL);
2486         if (!iommu_devinfo_cache) {
2487                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2488                 ret = -ENOMEM;
2489         }
2490
2491         return ret;
2492 }
2493
2494 static inline int iommu_iova_cache_init(void)
2495 {
2496         int ret = 0;
2497
2498         iommu_iova_cache = kmem_cache_create("iommu_iova",
2499                                          sizeof(struct iova),
2500                                          0,
2501                                          SLAB_HWCACHE_ALIGN,
2502                                          NULL);
2503         if (!iommu_iova_cache) {
2504                 printk(KERN_ERR "Couldn't create iova cache\n");
2505                 ret = -ENOMEM;
2506         }
2507
2508         return ret;
2509 }
2510
2511 static int __init iommu_init_mempool(void)
2512 {
2513         int ret;
2514         ret = iommu_iova_cache_init();
2515         if (ret)
2516                 return ret;
2517
2518         ret = iommu_domain_cache_init();
2519         if (ret)
2520                 goto domain_error;
2521
2522         ret = iommu_devinfo_cache_init();
2523         if (!ret)
2524                 return ret;
2525
2526         kmem_cache_destroy(iommu_domain_cache);
2527 domain_error:
2528         kmem_cache_destroy(iommu_iova_cache);
2529
2530         return -ENOMEM;
2531 }
2532
2533 static void __init iommu_exit_mempool(void)
2534 {
2535         kmem_cache_destroy(iommu_devinfo_cache);
2536         kmem_cache_destroy(iommu_domain_cache);
2537         kmem_cache_destroy(iommu_iova_cache);
2538
2539 }
2540
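/*
 * Mark DRHD units that cover no PCI devices as ignored and, unless
 * dmar_map_gfx is set, also bypass units that cover only graphics
 * devices (their devices get DUMMY_DEVICE_DOMAIN_INFO).
 */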
2541 static void __init init_no_remapping_devices(void)
2542 {
2543         struct dmar_drhd_unit *drhd;
2544
2545         for_each_drhd_unit(drhd) {
2546                 if (!drhd->include_all) {
2547                         int i;
2548                         for (i = 0; i < drhd->devices_cnt; i++)
2549                                 if (drhd->devices[i] != NULL)
2550                                         break;
2551                         /* ignore DMAR unit if no pci devices exist */
2552                         if (i == drhd->devices_cnt)
2553                                 drhd->ignored = 1;
2554                 }
2555         }
2556
2557         if (dmar_map_gfx)
2558                 return;
2559
2560         for_each_drhd_unit(drhd) {
2561                 int i;
2562                 if (drhd->ignored || drhd->include_all)
2563                         continue;
2564
2565                 for (i = 0; i < drhd->devices_cnt; i++)
2566                         if (drhd->devices[i] &&
2567                                 !IS_GFX_DEVICE(drhd->devices[i]))
2568                                 break;
2569
2570                 if (i < drhd->devices_cnt)
2571                         continue;
2572
2573                 /* bypass IOMMU if it is just for gfx devices */
2574                 drhd->ignored = 1;
2575                 for (i = 0; i < drhd->devices_cnt; i++) {
2576                         if (!drhd->devices[i])
2577                                 continue;
2578                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2579                 }
2580         }
2581 }
2582
2583 int __init intel_iommu_init(void)
2584 {
2585         int ret = 0;
2586
2587         if (dmar_table_init())
2588                 return  -ENODEV;
2589
2590         if (dmar_dev_scope_init())
2591                 return  -ENODEV;
2592
2593         /*
2594          * Check the need for DMA-remapping initialization now.
2595          * The initialization above is also used by Interrupt-remapping.
2596          */
2597         if (no_iommu || swiotlb || dmar_disabled)
2598                 return -ENODEV;
2599
2600         iommu_init_mempool();
2601         dmar_init_reserved_ranges();
2602
2603         init_no_remapping_devices();
2604
2605         ret = init_dmars();
2606         if (ret) {
2607                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2608                 put_iova_domain(&reserved_iova_list);
2609                 iommu_exit_mempool();
2610                 return ret;
2611         }
2612         printk(KERN_INFO
2613         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2614
2615         init_timer(&unmap_timer);
2616         force_iommu = 1;
2617         dma_ops = &intel_dma_ops;
2618         return 0;
2619 }
2620
2621 void intel_iommu_domain_exit(struct dmar_domain *domain)
2622 {
2623         u64 end;
2624
2625         /* Domain 0 is reserved, so don't process it */
2626         if (!domain)
2627                 return;
2628
2629         end = DOMAIN_MAX_ADDR(domain->gaw);
2630         end = end & (~VTD_PAGE_MASK);
2631
2632         /* clear ptes */
2633         dma_pte_clear_range(domain, 0, end);
2634
2635         /* free page tables */
2636         dma_pte_free_pagetable(domain, 0, end);
2637
2638         iommu_free_domain(domain);
2639         free_domain_mem(domain);
2640 }
2641 EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2642
2643 struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2644 {
2645         struct dmar_drhd_unit *drhd;
2646         struct dmar_domain *domain;
2647         struct intel_iommu *iommu;
2648
2649         drhd = dmar_find_matched_drhd_unit(pdev);
2650         if (!drhd) {
2651                 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2652                 return NULL;
2653         }
2654
2655         iommu = drhd->iommu;
2656         if (!iommu) {
2657                 printk(KERN_ERR
2658                         "intel_iommu_domain_alloc: iommu == NULL\n");
2659                 return NULL;
2660         }
2661         domain = iommu_alloc_domain(iommu);
2662         if (!domain) {
2663                 printk(KERN_ERR
2664                         "intel_iommu_domain_alloc: domain == NULL\n");
2665                 return NULL;
2666         }
2667         if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2668                 printk(KERN_ERR
2669                         "intel_iommu_domain_alloc: domain_init() failed\n");
2670                 intel_iommu_domain_exit(domain);
2671                 return NULL;
2672         }
2673         return domain;
2674 }
2675 EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2676
2677 int intel_iommu_context_mapping(
2678         struct dmar_domain *domain, struct pci_dev *pdev)
2679 {
2680         int rc;
2681         rc = domain_context_mapping(domain, pdev);
2682         return rc;
2683 }
2684 EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2685
2686 int intel_iommu_page_mapping(
2687         struct dmar_domain *domain, dma_addr_t iova,
2688         u64 hpa, size_t size, int prot)
2689 {
2690         int rc;
2691         rc = domain_page_mapping(domain, iova, hpa, size, prot);
2692         return rc;
2693 }
2694 EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
2695
2696 void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2697 {
2698         detach_domain_for_dev(domain, bus, devfn);
2699 }
2700 EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2701
2702 struct dmar_domain *
2703 intel_iommu_find_domain(struct pci_dev *pdev)
2704 {
2705         return find_domain(pdev);
2706 }
2707 EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2708
2709 int intel_iommu_found(void)
2710 {
2711         return g_num_of_iommus;
2712 }
2713 EXPORT_SYMBOL_GPL(intel_iommu_found);
2714
2715 u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2716 {
2717         struct dma_pte *pte;
2718         u64 pfn;
2719
2720         pfn = 0;
2721         pte = addr_to_dma_pte(domain, iova);
2722
2723         if (pte)
2724                 pfn = dma_pte_addr(pte);
2725
2726         return pfn >> VTD_PAGE_SHIFT;
2727 }
2728 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);
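/*
 * Example usage of the exported intel_iommu_* helpers from another
 * kernel module.  This is an illustrative sketch only, not part of the
 * driver: "pdev" is an assigned PCI device and "gpa"/"hpa" are
 * hypothetical guest and host addresses supplied by the caller.
 *
 *	struct dmar_domain *domain;
 *	int ret;
 *
 *	if (!intel_iommu_found())
 *		return -ENODEV;
 *
 *	domain = intel_iommu_domain_alloc(pdev);
 *	if (!domain)
 *		return -ENOMEM;
 *
 *	ret = intel_iommu_context_mapping(domain, pdev);
 *	if (!ret)
 *		ret = intel_iommu_page_mapping(domain, gpa, hpa, VTD_PAGE_SIZE,
 *					       DMA_PTE_READ | DMA_PTE_WRITE);
 *	if (ret) {
 *		intel_iommu_detach_dev(domain, pdev->bus->number, pdev->devfn);
 *		intel_iommu_domain_exit(domain);
 *		return ret;
 *	}
 */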