intel-iommu: Fix address wrap on 32-bit kernel.
[pandora-kernel.git] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
41 #include "pci.h"
42
43 #define ROOT_SIZE               VTD_PAGE_SIZE
44 #define CONTEXT_SIZE            VTD_PAGE_SIZE
45
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48
49 #define IOAPIC_RANGE_START      (0xfee00000)
50 #define IOAPIC_RANGE_END        (0xfeefffff)
51 #define IOVA_START_ADDR         (0x1000)
52
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56
57 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
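/*
 * Worked example (illustrative, assuming the usual 4 KiB pages, i.e.
 * PAGE_SHIFT == 12): IOVA_PFN(0xfee00000) == 0xfee00 and
 * DMA_32BIT_PFN == 0xfffff, so the iova allocator below works in units
 * of 4 KiB pages and DMA_32BIT_PFN marks the top of the 32-bit DMA window.
 */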
60
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
63
64 static int rwbf_quirk;
65
66 /*
67  * 0: Present
68  * 1-11: Reserved
69  * 12-63: Context Ptr (12 - (haw-1))
70  * 64-127: Reserved
71  */
72 struct root_entry {
73         u64     val;
74         u64     rsvd1;
75 };
76 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
77 static inline bool root_present(struct root_entry *root)
78 {
79         return (root->val & 1);
80 }
81 static inline void set_root_present(struct root_entry *root)
82 {
83         root->val |= 1;
84 }
85 static inline void set_root_value(struct root_entry *root, unsigned long value)
86 {
87         root->val |= value & VTD_PAGE_MASK;
88 }
89
90 static inline struct context_entry *
91 get_context_addr_from_root(struct root_entry *root)
92 {
93         return (struct context_entry *)
94                 (root_present(root)?phys_to_virt(
95                 root->val & VTD_PAGE_MASK) :
96                 NULL);
97 }
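/*
 * Illustrative packing: if device_to_context_entry() below allocates a
 * context table at, say, physical address 0x12345000, the root entry for
 * that bus ends up with val == 0x12345001 (context-table pointer in bits
 * 12-63, present bit 0 set), which is exactly what set_root_value() and
 * set_root_present() produce on a zeroed entry.
 */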
98
99 /*
100  * low 64 bits:
101  * 0: present
102  * 1: fault processing disable
103  * 2-3: translation type
104  * 12-63: address space root
105  * high 64 bits:
106  * 0-2: address width
107  * 3-6: avail
108  * 8-23: domain id
109  */
110 struct context_entry {
111         u64 lo;
112         u64 hi;
113 };
114
115 static inline bool context_present(struct context_entry *context)
116 {
117         return (context->lo & 1);
118 }
119 static inline void context_set_present(struct context_entry *context)
120 {
121         context->lo |= 1;
122 }
123
124 static inline void context_set_fault_enable(struct context_entry *context)
125 {
126         context->lo &= (((u64)-1) << 2) | 1;
127 }
128
129 #define CONTEXT_TT_MULTI_LEVEL 0
130
131 static inline void context_set_translation_type(struct context_entry *context,
132                                                 unsigned long value)
133 {
134         context->lo &= (((u64)-1) << 4) | 3;
135         context->lo |= (value & 3) << 2;
136 }
137
138 static inline void context_set_address_root(struct context_entry *context,
139                                             unsigned long value)
140 {
141         context->lo |= value & VTD_PAGE_MASK;
142 }
143
144 static inline void context_set_address_width(struct context_entry *context,
145                                              unsigned long value)
146 {
147         context->hi |= value & 7;
148 }
149
150 static inline void context_set_domain_id(struct context_entry *context,
151                                          unsigned long value)
152 {
153         context->hi |= (value & ((1 << 16) - 1)) << 8;
154 }
155
156 static inline void context_clear_entry(struct context_entry *context)
157 {
158         context->lo = 0;
159         context->hi = 0;
160 }
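/*
 * Illustrative packing: for a domain using a 4-level page table (address
 * width value 2) and domain id 5, the setters above yield
 * hi == (5 << 8) | 2 == 0x502, while lo holds the page-aligned physical
 * address of the top-level page table, translation type 0
 * (CONTEXT_TT_MULTI_LEVEL) in bits 2-3 and the present bit 0.
 */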
161
162 /*
163  * 0: readable
164  * 1: writable
165  * 2-6: reserved
166  * 7: super page
167  * 8-10: available
168  * 11: snoop behavior
169  * 12-63: Host physical address
170  */
171 struct dma_pte {
172         u64 val;
173 };
174
175 static inline void dma_clear_pte(struct dma_pte *pte)
176 {
177         pte->val = 0;
178 }
179
180 static inline void dma_set_pte_readable(struct dma_pte *pte)
181 {
182         pte->val |= DMA_PTE_READ;
183 }
184
185 static inline void dma_set_pte_writable(struct dma_pte *pte)
186 {
187         pte->val |= DMA_PTE_WRITE;
188 }
189
190 static inline void dma_set_pte_snp(struct dma_pte *pte)
191 {
192         pte->val |= DMA_PTE_SNP;
193 }
194
195 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
196 {
197         pte->val = (pte->val & ~3) | (prot & 3);
198 }
199
200 static inline u64 dma_pte_addr(struct dma_pte *pte)
201 {
202         return (pte->val & VTD_PAGE_MASK);
203 }
204
205 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
206 {
207         pte->val |= (addr & VTD_PAGE_MASK);
208 }
209
210 static inline bool dma_pte_present(struct dma_pte *pte)
211 {
212         return (pte->val & 3) != 0;
213 }
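/*
 * Illustrative PTE value: mapping a page at physical address 0x12345000
 * with read and write permission gives val == 0x12345003
 * (DMA_PTE_READ | DMA_PTE_WRITE in bits 0-1, host address in bits 12-63),
 * so dma_pte_present() returns true and dma_pte_addr() returns 0x12345000.
 */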
214
215 /* devices under the same p2p bridge are owned by one domain */
216 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
217
218 /* domain represents a virtual machine; more than one device
219  * across iommus may be owned by one domain, e.g. a kvm guest.
220  */
221 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
222
223 struct dmar_domain {
224         int     id;                     /* domain id */
225         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
226
227         struct list_head devices;       /* all devices' list */
228         struct iova_domain iovad;       /* iova's that belong to this domain */
229
230         struct dma_pte  *pgd;           /* virtual address */
231         spinlock_t      mapping_lock;   /* page table lock */
232         int             gaw;            /* max guest address width */
233
234         /* adjusted guest address width, 0 is level 2 30-bit */
235         int             agaw;
236
237         int             flags;          /* flags to find out type of domain */
238
239         int             iommu_coherency;/* indicate coherency of iommu access */
240         int             iommu_snooping; /* indicate snooping control feature*/
241         int             iommu_count;    /* reference count of iommu */
242         spinlock_t      iommu_lock;     /* protect iommu set in domain */
243         u64             max_addr;       /* maximum mapped address */
244 };
245
246 /* PCI domain-device relationship */
247 struct device_domain_info {
248         struct list_head link;  /* link to domain siblings */
249         struct list_head global; /* link to global list */
250         u8 bus;                 /* PCI bus number */
251         u8 devfn;               /* PCI devfn number */
252         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
253         struct dmar_domain *domain; /* pointer to domain */
254 };
255
256 static void flush_unmaps_timeout(unsigned long data);
257
258 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
259
260 #define HIGH_WATER_MARK 250
261 struct deferred_flush_tables {
262         int next;
263         struct iova *iova[HIGH_WATER_MARK];
264         struct dmar_domain *domain[HIGH_WATER_MARK];
265 };
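/*
 * The tables above batch IOTLB invalidations for unmaps: up to
 * HIGH_WATER_MARK (iova, domain) pairs are queued and flushed later from
 * the unmap_timer callback, unless the user boots with
 * "intel_iommu=strict" (see intel_iommu_setup() below), which disables
 * the batching.
 */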
266
267 static struct deferred_flush_tables *deferred_flush;
268
269 /* number of iommus; bounds the bitmaps and arrays indexed by iommu seq_id */
270 static int g_num_of_iommus;
271
272 static DEFINE_SPINLOCK(async_umap_flush_lock);
273 static LIST_HEAD(unmaps_to_do);
274
275 static int timer_on;
276 static long list_size;
277
278 static void domain_remove_dev_info(struct dmar_domain *domain);
279
280 #ifdef CONFIG_DMAR_DEFAULT_ON
281 int dmar_disabled = 0;
282 #else
283 int dmar_disabled = 1;
284 #endif /*CONFIG_DMAR_DEFAULT_ON*/
285
286 static int __initdata dmar_map_gfx = 1;
287 static int dmar_forcedac;
288 static int intel_iommu_strict;
289
290 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
291 static DEFINE_SPINLOCK(device_domain_lock);
292 static LIST_HEAD(device_domain_list);
293
294 static struct iommu_ops intel_iommu_ops;
295
296 static int __init intel_iommu_setup(char *str)
297 {
298         if (!str)
299                 return -EINVAL;
300         while (*str) {
301                 if (!strncmp(str, "on", 2)) {
302                         dmar_disabled = 0;
303                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
304                 } else if (!strncmp(str, "off", 3)) {
305                         dmar_disabled = 1;
306                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
307                 } else if (!strncmp(str, "igfx_off", 8)) {
308                         dmar_map_gfx = 0;
309                         printk(KERN_INFO
310                                 "Intel-IOMMU: disable GFX device mapping\n");
311                 } else if (!strncmp(str, "forcedac", 8)) {
312                         printk(KERN_INFO
313                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
314                         dmar_forcedac = 1;
315                 } else if (!strncmp(str, "strict", 6)) {
316                         printk(KERN_INFO
317                                 "Intel-IOMMU: disable batched IOTLB flush\n");
318                         intel_iommu_strict = 1;
319                 }
320
321                 str += strcspn(str, ",");
322                 while (*str == ',')
323                         str++;
324         }
325         return 0;
326 }
327 __setup("intel_iommu=", intel_iommu_setup);
328
329 static struct kmem_cache *iommu_domain_cache;
330 static struct kmem_cache *iommu_devinfo_cache;
331 static struct kmem_cache *iommu_iova_cache;
332
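/*
 * iommu_kmem_cache_alloc() and alloc_pgtable_page() below temporarily set
 * PF_MEMALLOC around an atomic allocation so that mapping-path allocations
 * can dip into reserves under memory pressure, then restore the caller's
 * original PF_MEMALLOC bit saved in 'flags'.
 */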
333 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
334 {
335         unsigned int flags;
336         void *vaddr;
337
338         /* trying to avoid low memory issues */
339         flags = current->flags & PF_MEMALLOC;
340         current->flags |= PF_MEMALLOC;
341         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
342         current->flags &= (~PF_MEMALLOC | flags);
343         return vaddr;
344 }
345
346
347 static inline void *alloc_pgtable_page(void)
348 {
349         unsigned int flags;
350         void *vaddr;
351
352         /* trying to avoid low memory issues */
353         flags = current->flags & PF_MEMALLOC;
354         current->flags |= PF_MEMALLOC;
355         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
356         current->flags &= (~PF_MEMALLOC | flags);
357         return vaddr;
358 }
359
360 static inline void free_pgtable_page(void *vaddr)
361 {
362         free_page((unsigned long)vaddr);
363 }
364
365 static inline void *alloc_domain_mem(void)
366 {
367         return iommu_kmem_cache_alloc(iommu_domain_cache);
368 }
369
370 static void free_domain_mem(void *vaddr)
371 {
372         kmem_cache_free(iommu_domain_cache, vaddr);
373 }
374
375 static inline void * alloc_devinfo_mem(void)
376 {
377         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
378 }
379
380 static inline void free_devinfo_mem(void *vaddr)
381 {
382         kmem_cache_free(iommu_devinfo_cache, vaddr);
383 }
384
385 struct iova *alloc_iova_mem(void)
386 {
387         return iommu_kmem_cache_alloc(iommu_iova_cache);
388 }
389
390 void free_iova_mem(struct iova *iova)
391 {
392         kmem_cache_free(iommu_iova_cache, iova);
393 }
394
395
396 static inline int width_to_agaw(int width);
397
398 /* calculate agaw for each iommu.
399  * "SAGAW" may be different across iommus; use a default agaw, and
400  * fall back to a smaller supported agaw for iommus that don't support the default.
401  */
402 int iommu_calculate_agaw(struct intel_iommu *iommu)
403 {
404         unsigned long sagaw;
405         int agaw = -1;
406
407         sagaw = cap_sagaw(iommu->cap);
408         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
409              agaw >= 0; agaw--) {
410                 if (test_bit(agaw, &sagaw))
411                         break;
412         }
413
414         return agaw;
415 }
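/*
 * Worked example: DEFAULT_DOMAIN_ADDRESS_WIDTH is 48, so the search starts
 * at width_to_agaw(48) == 2 (a 4-level table).  If an iommu's SAGAW field
 * only advertises bit 1 (39-bit, 3-level), the loop falls back to agaw 1;
 * if no bit at or below the default is set, -1 is returned.
 */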
416
417 /* in the native case, each domain is associated with only one iommu */
418 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
419 {
420         int iommu_id;
421
422         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
423
424         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
425         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
426                 return NULL;
427
428         return g_iommus[iommu_id];
429 }
430
431 static void domain_update_iommu_coherency(struct dmar_domain *domain)
432 {
433         int i;
434
435         domain->iommu_coherency = 1;
436
437         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
438         for (; i < g_num_of_iommus; ) {
439                 if (!ecap_coherent(g_iommus[i]->ecap)) {
440                         domain->iommu_coherency = 0;
441                         break;
442                 }
443                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
444         }
445 }
446
447 static void domain_update_iommu_snooping(struct dmar_domain *domain)
448 {
449         int i;
450
451         domain->iommu_snooping = 1;
452
453         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
454         for (; i < g_num_of_iommus; ) {
455                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
456                         domain->iommu_snooping = 0;
457                         break;
458                 }
459                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
460         }
461 }
462
463 /* Some capabilities may be different across iommus */
464 static void domain_update_iommu_cap(struct dmar_domain *domain)
465 {
466         domain_update_iommu_coherency(domain);
467         domain_update_iommu_snooping(domain);
468 }
469
470 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
471 {
472         struct dmar_drhd_unit *drhd = NULL;
473         int i;
474
475         for_each_drhd_unit(drhd) {
476                 if (drhd->ignored)
477                         continue;
478
479                 for (i = 0; i < drhd->devices_cnt; i++)
480                         if (drhd->devices[i] &&
481                             drhd->devices[i]->bus->number == bus &&
482                             drhd->devices[i]->devfn == devfn)
483                                 return drhd->iommu;
484
485                 if (drhd->include_all)
486                         return drhd->iommu;
487         }
488
489         return NULL;
490 }
491
492 static void domain_flush_cache(struct dmar_domain *domain,
493                                void *addr, int size)
494 {
495         if (!domain->iommu_coherency)
496                 clflush_cache_range(addr, size);
497 }
498
499 /* Gets context entry for a given bus and devfn */
500 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
501                 u8 bus, u8 devfn)
502 {
503         struct root_entry *root;
504         struct context_entry *context;
505         unsigned long phy_addr;
506         unsigned long flags;
507
508         spin_lock_irqsave(&iommu->lock, flags);
509         root = &iommu->root_entry[bus];
510         context = get_context_addr_from_root(root);
511         if (!context) {
512                 context = (struct context_entry *)alloc_pgtable_page();
513                 if (!context) {
514                         spin_unlock_irqrestore(&iommu->lock, flags);
515                         return NULL;
516                 }
517                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
518                 phy_addr = virt_to_phys((void *)context);
519                 set_root_value(root, phy_addr);
520                 set_root_present(root);
521                 __iommu_flush_cache(iommu, root, sizeof(*root));
522         }
523         spin_unlock_irqrestore(&iommu->lock, flags);
524         return &context[devfn];
525 }
526
527 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
528 {
529         struct root_entry *root;
530         struct context_entry *context;
531         int ret;
532         unsigned long flags;
533
534         spin_lock_irqsave(&iommu->lock, flags);
535         root = &iommu->root_entry[bus];
536         context = get_context_addr_from_root(root);
537         if (!context) {
538                 ret = 0;
539                 goto out;
540         }
541         ret = context_present(&context[devfn]);
542 out:
543         spin_unlock_irqrestore(&iommu->lock, flags);
544         return ret;
545 }
546
547 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
548 {
549         struct root_entry *root;
550         struct context_entry *context;
551         unsigned long flags;
552
553         spin_lock_irqsave(&iommu->lock, flags);
554         root = &iommu->root_entry[bus];
555         context = get_context_addr_from_root(root);
556         if (context) {
557                 context_clear_entry(&context[devfn]);
558                 __iommu_flush_cache(iommu, &context[devfn], \
559                         sizeof(*context));
560         }
561         spin_unlock_irqrestore(&iommu->lock, flags);
562 }
563
564 static void free_context_table(struct intel_iommu *iommu)
565 {
566         struct root_entry *root;
567         int i;
568         unsigned long flags;
569         struct context_entry *context;
570
571         spin_lock_irqsave(&iommu->lock, flags);
572         if (!iommu->root_entry) {
573                 goto out;
574         }
575         for (i = 0; i < ROOT_ENTRY_NR; i++) {
576                 root = &iommu->root_entry[i];
577                 context = get_context_addr_from_root(root);
578                 if (context)
579                         free_pgtable_page(context);
580         }
581         free_pgtable_page(iommu->root_entry);
582         iommu->root_entry = NULL;
583 out:
584         spin_unlock_irqrestore(&iommu->lock, flags);
585 }
586
587 /* page table handling */
588 #define LEVEL_STRIDE            (9)
589 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
590
591 static inline int agaw_to_level(int agaw)
592 {
593         return agaw + 2;
594 }
595
596 static inline int agaw_to_width(int agaw)
597 {
598         return 30 + agaw * LEVEL_STRIDE;
599
600 }
601
602 static inline int width_to_agaw(int width)
603 {
604         return (width - 30) / LEVEL_STRIDE;
605 }
606
607 static inline unsigned int level_to_offset_bits(int level)
608 {
609         return (12 + (level - 1) * LEVEL_STRIDE);
610 }
611
612 static inline int address_level_offset(u64 addr, int level)
613 {
614         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
615 }
616
617 static inline u64 level_mask(int level)
618 {
619         return ((u64)-1 << level_to_offset_bits(level));
620 }
621
622 static inline u64 level_size(int level)
623 {
624         return ((u64)1 << level_to_offset_bits(level));
625 }
626
627 static inline u64 align_to_level(u64 addr, int level)
628 {
629         return ((addr + level_size(level) - 1) & level_mask(level));
630 }
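/*
 * Worked example of the level arithmetic above for a 48-bit domain
 * (agaw 2, 4 levels): level_to_offset_bits() gives 39/30/21/12 for levels
 * 4/3/2/1, and each level indexes 9 bits (LEVEL_MASK == 0x1ff).  The
 * address 0x18140e09000 therefore decomposes into table indices 3, 5, 7
 * and 9 at levels 4, 3, 2 and 1 respectively, which is what
 * address_level_offset() returns during the walk in addr_to_dma_pte()
 * below.
 */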
631
632 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
633 {
634         int addr_width = agaw_to_width(domain->agaw);
635         struct dma_pte *parent, *pte = NULL;
636         int level = agaw_to_level(domain->agaw);
637         int offset;
638         unsigned long flags;
639
640         BUG_ON(!domain->pgd);
641
642         addr &= (((u64)1) << addr_width) - 1;
643         parent = domain->pgd;
644
645         spin_lock_irqsave(&domain->mapping_lock, flags);
646         while (level > 0) {
647                 void *tmp_page;
648
649                 offset = address_level_offset(addr, level);
650                 pte = &parent[offset];
651                 if (level == 1)
652                         break;
653
654                 if (!dma_pte_present(pte)) {
655                         tmp_page = alloc_pgtable_page();
656
657                         if (!tmp_page) {
658                                 spin_unlock_irqrestore(&domain->mapping_lock,
659                                         flags);
660                                 return NULL;
661                         }
662                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
663                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
664                         /*
665                          * higher-level tables always set r/w; the last-level page
666                          * table controls read/write
667                          */
668                         dma_set_pte_readable(pte);
669                         dma_set_pte_writable(pte);
670                         domain_flush_cache(domain, pte, sizeof(*pte));
671                 }
672                 parent = phys_to_virt(dma_pte_addr(pte));
673                 level--;
674         }
675
676         spin_unlock_irqrestore(&domain->mapping_lock, flags);
677         return pte;
678 }
679
680 /* return address's pte at specific level */
681 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
682                 int level)
683 {
684         struct dma_pte *parent, *pte = NULL;
685         int total = agaw_to_level(domain->agaw);
686         int offset;
687
688         parent = domain->pgd;
689         while (level <= total) {
690                 offset = address_level_offset(addr, total);
691                 pte = &parent[offset];
692                 if (level == total)
693                         return pte;
694
695                 if (!dma_pte_present(pte))
696                         break;
697                 parent = phys_to_virt(dma_pte_addr(pte));
698                 total--;
699         }
700         return NULL;
701 }
702
703 /* clear one page's page table */
704 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
705 {
706         struct dma_pte *pte = NULL;
707
708         /* get last level pte */
709         pte = dma_addr_level_pte(domain, addr, 1);
710
711         if (pte) {
712                 dma_clear_pte(pte);
713                 domain_flush_cache(domain, pte, sizeof(*pte));
714         }
715 }
716
717 /* clear last level pte; a tlb flush should follow */
718 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
719 {
720         int addr_width = agaw_to_width(domain->agaw);
721         int npages;
722
723         start &= (((u64)1) << addr_width) - 1;
724         end &= (((u64)1) << addr_width) - 1;
725         /* in case it's a partial page */
726         start = PAGE_ALIGN(start);
727         end &= PAGE_MASK;
728         npages = (end - start) / VTD_PAGE_SIZE;
729
730         /* we don't need lock here, nobody else touches the iova range */
731         while (npages--) {
732                 dma_pte_clear_one(domain, start);
733                 start += VTD_PAGE_SIZE;
734         }
735 }
736
737 /* free page table pages. last level pte should already be cleared */
738 static void dma_pte_free_pagetable(struct dmar_domain *domain,
739         u64 start, u64 end)
740 {
741         int addr_width = agaw_to_width(domain->agaw);
742         struct dma_pte *pte;
743         int total = agaw_to_level(domain->agaw);
744         int level;
745         u64 tmp;
746
747         start &= (((u64)1) << addr_width) - 1;
748         end &= (((u64)1) << addr_width) - 1;
749
750         /* we don't need lock here, nobody else touches the iova range */
751         level = 2;
752         while (level <= total) {
753                 tmp = align_to_level(start, level);
754                 if (tmp >= end || (tmp + level_size(level) > end))
755                         return;
756
757                 while (tmp < end) {
758                         pte = dma_addr_level_pte(domain, tmp, level);
759                         if (pte) {
760                                 free_pgtable_page(
761                                         phys_to_virt(dma_pte_addr(pte)));
762                                 dma_clear_pte(pte);
763                                 domain_flush_cache(domain, pte, sizeof(*pte));
764                         }
765                         tmp += level_size(level);
766                 }
767                 level++;
768         }
769         /* free pgd */
770         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
771                 free_pgtable_page(domain->pgd);
772                 domain->pgd = NULL;
773         }
774 }
775
776 /* iommu handling */
777 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
778 {
779         struct root_entry *root;
780         unsigned long flags;
781
782         root = (struct root_entry *)alloc_pgtable_page();
783         if (!root)
784                 return -ENOMEM;
785
786         __iommu_flush_cache(iommu, root, ROOT_SIZE);
787
788         spin_lock_irqsave(&iommu->lock, flags);
789         iommu->root_entry = root;
790         spin_unlock_irqrestore(&iommu->lock, flags);
791
792         return 0;
793 }
794
795 static void iommu_set_root_entry(struct intel_iommu *iommu)
796 {
797         void *addr;
798         u32 cmd, sts;
799         unsigned long flag;
800
801         addr = iommu->root_entry;
802
803         spin_lock_irqsave(&iommu->register_lock, flag);
804         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
805
806         cmd = iommu->gcmd | DMA_GCMD_SRTP;
807         writel(cmd, iommu->reg + DMAR_GCMD_REG);
808
809         /* Make sure hardware completes it */
810         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
811                 readl, (sts & DMA_GSTS_RTPS), sts);
812
813         spin_unlock_irqrestore(&iommu->register_lock, flag);
814 }
815
816 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
817 {
818         u32 val;
819         unsigned long flag;
820
821         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
822                 return;
823         val = iommu->gcmd | DMA_GCMD_WBF;
824
825         spin_lock_irqsave(&iommu->register_lock, flag);
826         writel(val, iommu->reg + DMAR_GCMD_REG);
827
828         /* Make sure hardware completes it */
829         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
830                         readl, (!(val & DMA_GSTS_WBFS)), val);
831
832         spin_unlock_irqrestore(&iommu->register_lock, flag);
833 }
834
835 /* return value determines whether we need a write buffer flush */
836 static int __iommu_flush_context(struct intel_iommu *iommu,
837         u16 did, u16 source_id, u8 function_mask, u64 type,
838         int non_present_entry_flush)
839 {
840         u64 val = 0;
841         unsigned long flag;
842
843         /*
844          * In the non-present entry flush case, if hardware doesn't cache
845          * non-present entries we do nothing; if hardware does cache non-present
846          * entries, we flush entries of domain 0 (the domain id is used to cache
847          * any non-present entries)
848          */
849         if (non_present_entry_flush) {
850                 if (!cap_caching_mode(iommu->cap))
851                         return 1;
852                 else
853                         did = 0;
854         }
855
856         switch (type) {
857         case DMA_CCMD_GLOBAL_INVL:
858                 val = DMA_CCMD_GLOBAL_INVL;
859                 break;
860         case DMA_CCMD_DOMAIN_INVL:
861                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
862                 break;
863         case DMA_CCMD_DEVICE_INVL:
864                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
865                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
866                 break;
867         default:
868                 BUG();
869         }
870         val |= DMA_CCMD_ICC;
871
872         spin_lock_irqsave(&iommu->register_lock, flag);
873         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
874
875         /* Make sure hardware completes it */
876         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
877                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
878
879         spin_unlock_irqrestore(&iommu->register_lock, flag);
880
881         /* flushing the context entry will implicitly flush the write buffer */
882         return 0;
883 }
884
885 /* return value determines whether we need a write buffer flush */
886 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
887         u64 addr, unsigned int size_order, u64 type,
888         int non_present_entry_flush)
889 {
890         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
891         u64 val = 0, val_iva = 0;
892         unsigned long flag;
893
894         /*
895          * In the non-present entry flush case, if hardware doesn't cache
896          * non-present entries we do nothing; if hardware does cache non-present
897          * entries, we flush entries of domain 0 (the domain id is used to cache
898          * any non-present entries)
899          */
900         if (non_present_entry_flush) {
901                 if (!cap_caching_mode(iommu->cap))
902                         return 1;
903                 else
904                         did = 0;
905         }
906
907         switch (type) {
908         case DMA_TLB_GLOBAL_FLUSH:
909                 /* global flush doesn't need set IVA_REG */
910                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
911                 break;
912         case DMA_TLB_DSI_FLUSH:
913                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
914                 break;
915         case DMA_TLB_PSI_FLUSH:
916                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
917                 /* Note: always flush non-leaf currently */
918                 val_iva = size_order | addr;
919                 break;
920         default:
921                 BUG();
922         }
923         /* Note: set drain read/write */
924 #if 0
925         /*
926          * This is probably only needed to be extra safe; it looks like we
927          * can ignore it without any impact.
928          */
929         if (cap_read_drain(iommu->cap))
930                 val |= DMA_TLB_READ_DRAIN;
931 #endif
932         if (cap_write_drain(iommu->cap))
933                 val |= DMA_TLB_WRITE_DRAIN;
934
935         spin_lock_irqsave(&iommu->register_lock, flag);
936         /* Note: Only uses first TLB reg currently */
937         if (val_iva)
938                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
939         dmar_writeq(iommu->reg + tlb_offset + 8, val);
940
941         /* Make sure hardware completes it */
942         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
943                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
944
945         spin_unlock_irqrestore(&iommu->register_lock, flag);
946
947         /* check IOTLB invalidation granularity */
948         if (DMA_TLB_IAIG(val) == 0)
949                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
950         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
951                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
952                         (unsigned long long)DMA_TLB_IIRG(type),
953                         (unsigned long long)DMA_TLB_IAIG(val));
954         /* flushing the iotlb entry will implicitly flush the write buffer */
955         return 0;
956 }
957
958 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
959         u64 addr, unsigned int pages, int non_present_entry_flush)
960 {
961         unsigned int mask;
962
963         BUG_ON(addr & (~VTD_PAGE_MASK));
964         BUG_ON(pages == 0);
965
966         /* Fallback to domain selective flush if no PSI support */
967         if (!cap_pgsel_inv(iommu->cap))
968                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
969                                                 DMA_TLB_DSI_FLUSH,
970                                                 non_present_entry_flush);
971
972         /*
973          * PSI requires the number of pages to be 2 ^ x, and the base address to be
974          * naturally aligned to that size
975          */
976         mask = ilog2(__roundup_pow_of_two(pages));
977         /* Fallback to domain selective flush if size is too big */
978         if (mask > cap_max_amask_val(iommu->cap))
979                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
980                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
981
982         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
983                                         DMA_TLB_PSI_FLUSH,
984                                         non_present_entry_flush);
985 }
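/*
 * Worked example: for a 5-page flush request, __roundup_pow_of_two(5) == 8
 * and mask == ilog2(8) == 3, so the hardware is asked to invalidate a
 * naturally aligned 8-page (32 KiB) region; if that mask exceeds
 * cap_max_amask_val(), the code above already fell back to a
 * domain-selective flush.
 */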
986
987 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
988 {
989         u32 pmen;
990         unsigned long flags;
991
992         spin_lock_irqsave(&iommu->register_lock, flags);
993         pmen = readl(iommu->reg + DMAR_PMEN_REG);
994         pmen &= ~DMA_PMEN_EPM;
995         writel(pmen, iommu->reg + DMAR_PMEN_REG);
996
997         /* wait for the protected region status bit to clear */
998         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
999                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1000
1001         spin_unlock_irqrestore(&iommu->register_lock, flags);
1002 }
1003
1004 static int iommu_enable_translation(struct intel_iommu *iommu)
1005 {
1006         u32 sts;
1007         unsigned long flags;
1008
1009         spin_lock_irqsave(&iommu->register_lock, flags);
1010         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
1011
1012         /* Make sure hardware completes it */
1013         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1014                 readl, (sts & DMA_GSTS_TES), sts);
1015
1016         iommu->gcmd |= DMA_GCMD_TE;
1017         spin_unlock_irqrestore(&iommu->register_lock, flags);
1018         return 0;
1019 }
1020
1021 static int iommu_disable_translation(struct intel_iommu *iommu)
1022 {
1023         u32 sts;
1024         unsigned long flag;
1025
1026         spin_lock_irqsave(&iommu->register_lock, flag);
1027         iommu->gcmd &= ~DMA_GCMD_TE;
1028         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1029
1030         /* Make sure hardware completes it */
1031         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1032                 readl, (!(sts & DMA_GSTS_TES)), sts);
1033
1034         spin_unlock_irqrestore(&iommu->register_lock, flag);
1035         return 0;
1036 }
1037
1038 /* iommu interrupt handling. Most of it is MSI-like. */
1039
1040 static const char *fault_reason_strings[] =
1041 {
1042         "Software",
1043         "Present bit in root entry is clear",
1044         "Present bit in context entry is clear",
1045         "Invalid context entry",
1046         "Access beyond MGAW",
1047         "PTE Write access is not set",
1048         "PTE Read access is not set",
1049         "Next page table ptr is invalid",
1050         "Root table address invalid",
1051         "Context table ptr is invalid",
1052         "non-zero reserved fields in RTP",
1053         "non-zero reserved fields in CTP",
1054         "non-zero reserved fields in PTE",
1055 };
1056 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
1057
1058 const char *dmar_get_fault_reason(u8 fault_reason)
1059 {
1060         if (fault_reason > MAX_FAULT_REASON_IDX)
1061                 return "Unknown";
1062         else
1063                 return fault_reason_strings[fault_reason];
1064 }
1065
1066 void dmar_msi_unmask(unsigned int irq)
1067 {
1068         struct intel_iommu *iommu = get_irq_data(irq);
1069         unsigned long flag;
1070
1071         /* unmask it */
1072         spin_lock_irqsave(&iommu->register_lock, flag);
1073         writel(0, iommu->reg + DMAR_FECTL_REG);
1074         /* Read a reg to force-flush the posted write */
1075         readl(iommu->reg + DMAR_FECTL_REG);
1076         spin_unlock_irqrestore(&iommu->register_lock, flag);
1077 }
1078
1079 void dmar_msi_mask(unsigned int irq)
1080 {
1081         unsigned long flag;
1082         struct intel_iommu *iommu = get_irq_data(irq);
1083
1084         /* mask it */
1085         spin_lock_irqsave(&iommu->register_lock, flag);
1086         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1087         /* Read a reg to force-flush the posted write */
1088         readl(iommu->reg + DMAR_FECTL_REG);
1089         spin_unlock_irqrestore(&iommu->register_lock, flag);
1090 }
1091
1092 void dmar_msi_write(int irq, struct msi_msg *msg)
1093 {
1094         struct intel_iommu *iommu = get_irq_data(irq);
1095         unsigned long flag;
1096
1097         spin_lock_irqsave(&iommu->register_lock, flag);
1098         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1099         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1100         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1101         spin_unlock_irqrestore(&iommu->register_lock, flag);
1102 }
1103
1104 void dmar_msi_read(int irq, struct msi_msg *msg)
1105 {
1106         struct intel_iommu *iommu = get_irq_data(irq);
1107         unsigned long flag;
1108
1109         spin_lock_irqsave(&iommu->register_lock, flag);
1110         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1111         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1112         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1113         spin_unlock_irqrestore(&iommu->register_lock, flag);
1114 }
1115
1116 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1117                 u8 fault_reason, u16 source_id, unsigned long long addr)
1118 {
1119         const char *reason;
1120
1121         reason = dmar_get_fault_reason(fault_reason);
1122
1123         printk(KERN_ERR
1124                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1125                 "fault addr %llx \n"
1126                 "DMAR:[fault reason %02d] %s\n",
1127                 (type ? "DMA Read" : "DMA Write"),
1128                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1129                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1130         return 0;
1131 }
1132
1133 #define PRIMARY_FAULT_REG_LEN (16)
1134 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1135 {
1136         struct intel_iommu *iommu = dev_id;
1137         int reg, fault_index;
1138         u32 fault_status;
1139         unsigned long flag;
1140
1141         spin_lock_irqsave(&iommu->register_lock, flag);
1142         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1143
1144         /* TBD: ignore advanced fault log currently */
1145         if (!(fault_status & DMA_FSTS_PPF))
1146                 goto clear_overflow;
1147
1148         fault_index = dma_fsts_fault_record_index(fault_status);
1149         reg = cap_fault_reg_offset(iommu->cap);
1150         while (1) {
1151                 u8 fault_reason;
1152                 u16 source_id;
1153                 u64 guest_addr;
1154                 int type;
1155                 u32 data;
1156
1157                 /* highest 32 bits */
1158                 data = readl(iommu->reg + reg +
1159                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1160                 if (!(data & DMA_FRCD_F))
1161                         break;
1162
1163                 fault_reason = dma_frcd_fault_reason(data);
1164                 type = dma_frcd_type(data);
1165
1166                 data = readl(iommu->reg + reg +
1167                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1168                 source_id = dma_frcd_source_id(data);
1169
1170                 guest_addr = dmar_readq(iommu->reg + reg +
1171                                 fault_index * PRIMARY_FAULT_REG_LEN);
1172                 guest_addr = dma_frcd_page_addr(guest_addr);
1173                 /* clear the fault */
1174                 writel(DMA_FRCD_F, iommu->reg + reg +
1175                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1176
1177                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1178
1179                 iommu_page_fault_do_one(iommu, type, fault_reason,
1180                                 source_id, guest_addr);
1181
1182                 fault_index++;
1183                 if (fault_index > cap_num_fault_regs(iommu->cap))
1184                         fault_index = 0;
1185                 spin_lock_irqsave(&iommu->register_lock, flag);
1186         }
1187 clear_overflow:
1188         /* clear primary fault overflow */
1189         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1190         if (fault_status & DMA_FSTS_PFO)
1191                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1192
1193         spin_unlock_irqrestore(&iommu->register_lock, flag);
1194         return IRQ_HANDLED;
1195 }
1196
1197 int dmar_set_interrupt(struct intel_iommu *iommu)
1198 {
1199         int irq, ret;
1200
1201         irq = create_irq();
1202         if (!irq) {
1203                 printk(KERN_ERR "IOMMU: no free vectors\n");
1204                 return -EINVAL;
1205         }
1206
1207         set_irq_data(irq, iommu);
1208         iommu->irq = irq;
1209
1210         ret = arch_setup_dmar_msi(irq);
1211         if (ret) {
1212                 set_irq_data(irq, NULL);
1213                 iommu->irq = 0;
1214                 destroy_irq(irq);
1215                 return 0;
1216         }
1217
1218         /* Make sure the fault registers are cleared */
1219         iommu_page_fault(irq, iommu);
1220
1221         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1222         if (ret)
1223                 printk(KERN_ERR "IOMMU: can't request irq\n");
1224         return ret;
1225 }
1226
1227 static int iommu_init_domains(struct intel_iommu *iommu)
1228 {
1229         unsigned long ndomains;
1230         unsigned long nlongs;
1231
1232         ndomains = cap_ndoms(iommu->cap);
1233         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1234         nlongs = BITS_TO_LONGS(ndomains);
1235
1236         /* TBD: there might be 64K domains,
1237          * consider a different allocation scheme for future chips
1238          */
1239         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1240         if (!iommu->domain_ids) {
1241                 printk(KERN_ERR "Allocating domain id array failed\n");
1242                 return -ENOMEM;
1243         }
1244         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1245                         GFP_KERNEL);
1246         if (!iommu->domains) {
1247                 printk(KERN_ERR "Allocating domain array failed\n");
1248                 kfree(iommu->domain_ids);
1249                 return -ENOMEM;
1250         }
1251
1252         spin_lock_init(&iommu->lock);
1253
1254         /*
1255          * if Caching mode is set, then invalid translations are tagged
1256          * with domain id 0. Hence we need to pre-allocate it.
1257          */
1258         if (cap_caching_mode(iommu->cap))
1259                 set_bit(0, iommu->domain_ids);
1260         return 0;
1261 }
1262
1263
1264 static void domain_exit(struct dmar_domain *domain);
1265 static void vm_domain_exit(struct dmar_domain *domain);
1266
1267 void free_dmar_iommu(struct intel_iommu *iommu)
1268 {
1269         struct dmar_domain *domain;
1270         int i;
1271         unsigned long flags;
1272
1273         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1274         for (; i < cap_ndoms(iommu->cap); ) {
1275                 domain = iommu->domains[i];
1276                 clear_bit(i, iommu->domain_ids);
1277
1278                 spin_lock_irqsave(&domain->iommu_lock, flags);
1279                 if (--domain->iommu_count == 0) {
1280                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1281                                 vm_domain_exit(domain);
1282                         else
1283                                 domain_exit(domain);
1284                 }
1285                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1286
1287                 i = find_next_bit(iommu->domain_ids,
1288                         cap_ndoms(iommu->cap), i+1);
1289         }
1290
1291         if (iommu->gcmd & DMA_GCMD_TE)
1292                 iommu_disable_translation(iommu);
1293
1294         if (iommu->irq) {
1295                 set_irq_data(iommu->irq, NULL);
1296                 /* This will mask the irq */
1297                 free_irq(iommu->irq, iommu);
1298                 destroy_irq(iommu->irq);
1299         }
1300
1301         kfree(iommu->domains);
1302         kfree(iommu->domain_ids);
1303
1304         g_iommus[iommu->seq_id] = NULL;
1305
1306         /* if all iommus are freed, free g_iommus */
1307         for (i = 0; i < g_num_of_iommus; i++) {
1308                 if (g_iommus[i])
1309                         break;
1310         }
1311
1312         if (i == g_num_of_iommus)
1313                 kfree(g_iommus);
1314
1315         /* free context mapping */
1316         free_context_table(iommu);
1317 }
1318
1319 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1320 {
1321         unsigned long num;
1322         unsigned long ndomains;
1323         struct dmar_domain *domain;
1324         unsigned long flags;
1325
1326         domain = alloc_domain_mem();
1327         if (!domain)
1328                 return NULL;
1329
1330         ndomains = cap_ndoms(iommu->cap);
1331
1332         spin_lock_irqsave(&iommu->lock, flags);
1333         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1334         if (num >= ndomains) {
1335                 spin_unlock_irqrestore(&iommu->lock, flags);
1336                 free_domain_mem(domain);
1337                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1338                 return NULL;
1339         }
1340
1341         set_bit(num, iommu->domain_ids);
1342         domain->id = num;
1343         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1344         set_bit(iommu->seq_id, &domain->iommu_bmp);
1345         domain->flags = 0;
1346         iommu->domains[num] = domain;
1347         spin_unlock_irqrestore(&iommu->lock, flags);
1348
1349         return domain;
1350 }
1351
1352 static void iommu_free_domain(struct dmar_domain *domain)
1353 {
1354         unsigned long flags;
1355         struct intel_iommu *iommu;
1356
1357         iommu = domain_get_iommu(domain);
1358
1359         spin_lock_irqsave(&iommu->lock, flags);
1360         clear_bit(domain->id, iommu->domain_ids);
1361         spin_unlock_irqrestore(&iommu->lock, flags);
1362 }
1363
1364 static struct iova_domain reserved_iova_list;
1365 static struct lock_class_key reserved_alloc_key;
1366 static struct lock_class_key reserved_rbtree_key;
1367
1368 static void dmar_init_reserved_ranges(void)
1369 {
1370         struct pci_dev *pdev = NULL;
1371         struct iova *iova;
1372         int i;
1373         u64 addr, size;
1374
1375         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1376
1377         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1378                 &reserved_alloc_key);
1379         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1380                 &reserved_rbtree_key);
1381
1382         /* IOAPIC ranges shouldn't be accessed by DMA */
1383         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1384                 IOVA_PFN(IOAPIC_RANGE_END));
1385         if (!iova)
1386                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1387
1388         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1389         for_each_pci_dev(pdev) {
1390                 struct resource *r;
1391
1392                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1393                         r = &pdev->resource[i];
1394                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1395                                 continue;
1396                         addr = r->start;
1397                         addr &= PAGE_MASK;
1398                         size = r->end - addr;
1399                         size = PAGE_ALIGN(size);
1400                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1401                                 IOVA_PFN(size + addr) - 1);
1402                         if (!iova)
1403                                 printk(KERN_ERR "Reserve iova failed\n");
1404                 }
1405         }
1406
1407 }
1408
1409 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1410 {
1411         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1412 }
1413
1414 static inline int guestwidth_to_adjustwidth(int gaw)
1415 {
1416         int agaw;
1417         int r = (gaw - 12) % 9;
1418
1419         if (r == 0)
1420                 agaw = gaw;
1421         else
1422                 agaw = gaw + 9 - r;
1423         if (agaw > 64)
1424                 agaw = 64;
1425         return agaw;
1426 }
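/*
 * Worked examples: a guest width of 48 or 39 is already a multiple of 9
 * above the 12 page-offset bits, so it is returned unchanged; a width of
 * 36 gives r == (36 - 12) % 9 == 6 and is rounded up to 39, the next
 * width the page-table format can represent.
 */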
1427
1428 static int domain_init(struct dmar_domain *domain, int guest_width)
1429 {
1430         struct intel_iommu *iommu;
1431         int adjust_width, agaw;
1432         unsigned long sagaw;
1433
1434         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1435         spin_lock_init(&domain->mapping_lock);
1436         spin_lock_init(&domain->iommu_lock);
1437
1438         domain_reserve_special_ranges(domain);
1439
1440         /* calculate AGAW */
1441         iommu = domain_get_iommu(domain);
1442         if (guest_width > cap_mgaw(iommu->cap))
1443                 guest_width = cap_mgaw(iommu->cap);
1444         domain->gaw = guest_width;
1445         adjust_width = guestwidth_to_adjustwidth(guest_width);
1446         agaw = width_to_agaw(adjust_width);
1447         sagaw = cap_sagaw(iommu->cap);
1448         if (!test_bit(agaw, &sagaw)) {
1449                 /* hardware doesn't support it, choose a bigger one */
1450                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1451                 agaw = find_next_bit(&sagaw, 5, agaw);
1452                 if (agaw >= 5)
1453                         return -ENODEV;
1454         }
1455         domain->agaw = agaw;
1456         INIT_LIST_HEAD(&domain->devices);
1457
1458         if (ecap_coherent(iommu->ecap))
1459                 domain->iommu_coherency = 1;
1460         else
1461                 domain->iommu_coherency = 0;
1462
1463         if (ecap_sc_support(iommu->ecap))
1464                 domain->iommu_snooping = 1;
1465         else
1466                 domain->iommu_snooping = 0;
1467
1468         domain->iommu_count = 1;
1469
1470         /* always allocate the top pgd */
1471         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1472         if (!domain->pgd)
1473                 return -ENOMEM;
1474         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1475         return 0;
1476 }
1477
1478 static void domain_exit(struct dmar_domain *domain)
1479 {
1480         u64 end;
1481
1482         /* Domain 0 is reserved, so don't process it */
1483         if (!domain)
1484                 return;
1485
1486         domain_remove_dev_info(domain);
1487         /* destroy iovas */
1488         put_iova_domain(&domain->iovad);
1489         end = DOMAIN_MAX_ADDR(domain->gaw);
1490         end = end & (~PAGE_MASK);
1491
1492         /* clear ptes */
1493         dma_pte_clear_range(domain, 0, end);
1494
1495         /* free page tables */
1496         dma_pte_free_pagetable(domain, 0, end);
1497
1498         iommu_free_domain(domain);
1499         free_domain_mem(domain);
1500 }
1501
1502 static int domain_context_mapping_one(struct dmar_domain *domain,
1503                 u8 bus, u8 devfn)
1504 {
1505         struct context_entry *context;
1506         unsigned long flags;
1507         struct intel_iommu *iommu;
1508         struct dma_pte *pgd;
1509         unsigned long num;
1510         unsigned long ndomains;
1511         int id;
1512         int agaw;
1513
1514         pr_debug("Set context mapping for %02x:%02x.%d\n",
1515                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1516         BUG_ON(!domain->pgd);
1517
1518         iommu = device_to_iommu(bus, devfn);
1519         if (!iommu)
1520                 return -ENODEV;
1521
1522         context = device_to_context_entry(iommu, bus, devfn);
1523         if (!context)
1524                 return -ENOMEM;
1525         spin_lock_irqsave(&iommu->lock, flags);
1526         if (context_present(context)) {
1527                 spin_unlock_irqrestore(&iommu->lock, flags);
1528                 return 0;
1529         }
1530
1531         id = domain->id;
1532         pgd = domain->pgd;
1533
1534         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1535                 int found = 0;
1536
1537                 /* find an available domain id for this device in iommu */
1538                 ndomains = cap_ndoms(iommu->cap);
1539                 num = find_first_bit(iommu->domain_ids, ndomains);
1540                 for (; num < ndomains; ) {
1541                         if (iommu->domains[num] == domain) {
1542                                 id = num;
1543                                 found = 1;
1544                                 break;
1545                         }
1546                         num = find_next_bit(iommu->domain_ids,
1547                                             cap_ndoms(iommu->cap), num+1);
1548                 }
1549
1550                 if (found == 0) {
1551                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1552                         if (num >= ndomains) {
1553                                 spin_unlock_irqrestore(&iommu->lock, flags);
1554                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1555                                 return -EFAULT;
1556                         }
1557
1558                         set_bit(num, iommu->domain_ids);
1559                         iommu->domains[num] = domain;
1560                         id = num;
1561                 }
1562
1563                 /* Skip top levels of page tables for
1564                  * an iommu whose agaw is smaller than the default.
1565                  */
1566                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1567                         pgd = phys_to_virt(dma_pte_addr(pgd));
1568                         if (!dma_pte_present(pgd)) {
1569                                 spin_unlock_irqrestore(&iommu->lock, flags);
1570                                 return -ENOMEM;
1571                         }
1572                 }
1573         }
1574
1575         context_set_domain_id(context, id);
1576         context_set_address_width(context, iommu->agaw);
1577         context_set_address_root(context, virt_to_phys(pgd));
1578         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1579         context_set_fault_enable(context);
1580         context_set_present(context);
1581         domain_flush_cache(domain, context, sizeof(*context));
1582
1583         /* it's a non-present to present mapping */
1584         if (iommu->flush.flush_context(iommu, domain->id,
1585                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1586                 DMA_CCMD_DEVICE_INVL, 1))
1587                 iommu_flush_write_buffer(iommu);
1588         else
1589                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1590
1591         spin_unlock_irqrestore(&iommu->lock, flags);
1592
1593         spin_lock_irqsave(&domain->iommu_lock, flags);
1594         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1595                 domain->iommu_count++;
1596                 domain_update_iommu_cap(domain);
1597         }
1598         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1599         return 0;
1600 }
1601
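     /*
      * Set up context entries for @pdev itself and, when the device sits
      * behind a PCIe-to-PCI bridge, for every bridge on the path up to and
      * including that bridge, so DMA requests carrying any of those
      * source-ids are translated through @domain as well.
      */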
1602 static int
1603 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1604 {
1605         int ret;
1606         struct pci_dev *tmp, *parent;
1607
1608         ret = domain_context_mapping_one(domain, pdev->bus->number,
1609                 pdev->devfn);
1610         if (ret)
1611                 return ret;
1612
1613         /* dependent device mapping */
1614         tmp = pci_find_upstream_pcie_bridge(pdev);
1615         if (!tmp)
1616                 return 0;
1617         /* Secondary interface's bus number and devfn 0 */
1618         parent = pdev->bus->self;
1619         while (parent != tmp) {
1620                 ret = domain_context_mapping_one(domain, parent->bus->number,
1621                         parent->devfn);
1622                 if (ret)
1623                         return ret;
1624                 parent = parent->bus->self;
1625         }
1626         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1627                 return domain_context_mapping_one(domain,
1628                         tmp->subordinate->number, 0);
1629         else /* this is a legacy PCI bridge */
1630                 return domain_context_mapping_one(domain,
1631                         tmp->bus->number, tmp->devfn);
1632 }
1633
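     /*
      * Return non-zero when @pdev (and, if it sits behind a PCIe-to-PCI
      * bridge, every bridge on the path up to and including it) already
      * has a context entry; return zero as soon as one is found unmapped.
      */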
1634 static int domain_context_mapped(struct pci_dev *pdev)
1635 {
1636         int ret;
1637         struct pci_dev *tmp, *parent;
1638         struct intel_iommu *iommu;
1639
1640         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1641         if (!iommu)
1642                 return -ENODEV;
1643
1644         ret = device_context_mapped(iommu,
1645                 pdev->bus->number, pdev->devfn);
1646         if (!ret)
1647                 return ret;
1648         /* dependent device mapping */
1649         tmp = pci_find_upstream_pcie_bridge(pdev);
1650         if (!tmp)
1651                 return ret;
1652         /* Secondary interface's bus number and devfn 0 */
1653         parent = pdev->bus->self;
1654         while (parent != tmp) {
1655                 ret = device_context_mapped(iommu, parent->bus->number,
1656                         parent->devfn);
1657                 if (!ret)
1658                         return ret;
1659                 parent = parent->bus->self;
1660         }
1661         if (tmp->is_pcie)
1662                 return device_context_mapped(iommu,
1663                         tmp->subordinate->number, 0);
1664         else
1665                 return device_context_mapped(iommu,
1666                         tmp->bus->number, tmp->devfn);
1667 }
1668
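     /*
      * Map the physical range hpa .. hpa + size to the IO virtual address
      * @iova in @domain, one VT-d page at a time.  The caller must ensure
      * the iova range is unused: an already-set pte triggers a BUG.
      */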
1669 static int
1670 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1671                         u64 hpa, size_t size, int prot)
1672 {
1673         u64 start_pfn, end_pfn;
1674         struct dma_pte *pte;
1675         int index;
1676         int addr_width = agaw_to_width(domain->agaw);
1677
1678         hpa &= (((u64)1) << addr_width) - 1;
1679
1680         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1681                 return -EINVAL;
1682         iova &= PAGE_MASK;
1683         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1684         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1685         index = 0;
1686         while (start_pfn < end_pfn) {
1687                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1688                 if (!pte)
1689                         return -ENOMEM;
1690                 /* We don't need a lock here, nobody else
1691                  * touches the iova range
1692                  */
1693                 BUG_ON(dma_pte_addr(pte));
1694                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1695                 dma_set_pte_prot(pte, prot);
1696                 if (prot & DMA_PTE_SNP)
1697                         dma_set_pte_snp(pte);
1698                 domain_flush_cache(domain, pte, sizeof(*pte));
1699                 start_pfn++;
1700                 index++;
1701         }
1702         return 0;
1703 }
1704
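     /*
      * Clear the context entry for (bus, devfn) and invalidate the context
      * cache and IOTLB globally so the hardware drops any cached
      * translations for that device.
      */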
1705 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1706 {
1707         if (!iommu)
1708                 return;
1709
1710         clear_context_table(iommu, bus, devfn);
1711         iommu->flush.flush_context(iommu, 0, 0, 0,
1712                                            DMA_CCMD_GLOBAL_INVL, 0);
1713         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1714                                          DMA_TLB_GLOBAL_FLUSH, 0);
1715 }
1716
1717 static void domain_remove_dev_info(struct dmar_domain *domain)
1718 {
1719         struct device_domain_info *info;
1720         unsigned long flags;
1721         struct intel_iommu *iommu;
1722
1723         spin_lock_irqsave(&device_domain_lock, flags);
1724         while (!list_empty(&domain->devices)) {
1725                 info = list_entry(domain->devices.next,
1726                         struct device_domain_info, link);
1727                 list_del(&info->link);
1728                 list_del(&info->global);
1729                 if (info->dev)
1730                         info->dev->dev.archdata.iommu = NULL;
1731                 spin_unlock_irqrestore(&device_domain_lock, flags);
1732
1733                 iommu = device_to_iommu(info->bus, info->devfn);
1734                 iommu_detach_dev(iommu, info->bus, info->devfn);
1735                 free_devinfo_mem(info);
1736
1737                 spin_lock_irqsave(&device_domain_lock, flags);
1738         }
1739         spin_unlock_irqrestore(&device_domain_lock, flags);
1740 }
1741
1742 /*
1743  * find_domain
1744  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1745  */
1746 static struct dmar_domain *
1747 find_domain(struct pci_dev *pdev)
1748 {
1749         struct device_domain_info *info;
1750
1751         /* No lock here; assume no concurrent domain_exit() in the normal case */
1752         info = pdev->dev.archdata.iommu;
1753         if (info)
1754                 return info->domain;
1755         return NULL;
1756 }
1757
1758 /* domain is initialized */
1759 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1760 {
1761         struct dmar_domain *domain, *found = NULL;
1762         struct intel_iommu *iommu;
1763         struct dmar_drhd_unit *drhd;
1764         struct device_domain_info *info, *tmp;
1765         struct pci_dev *dev_tmp;
1766         unsigned long flags;
1767         int bus = 0, devfn = 0;
1768
1769         domain = find_domain(pdev);
1770         if (domain)
1771                 return domain;
1772
1773         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1774         if (dev_tmp) {
1775                 if (dev_tmp->is_pcie) {
1776                         bus = dev_tmp->subordinate->number;
1777                         devfn = 0;
1778                 } else {
1779                         bus = dev_tmp->bus->number;
1780                         devfn = dev_tmp->devfn;
1781                 }
1782                 spin_lock_irqsave(&device_domain_lock, flags);
1783                 list_for_each_entry(info, &device_domain_list, global) {
1784                         if (info->bus == bus && info->devfn == devfn) {
1785                                 found = info->domain;
1786                                 break;
1787                         }
1788                 }
1789                 spin_unlock_irqrestore(&device_domain_lock, flags);
1790                 /* pcie-pci bridge already has a domain, use it */
1791                 if (found) {
1792                         domain = found;
1793                         goto found_domain;
1794                 }
1795         }
1796
1797         /* Allocate new domain for the device */
1798         drhd = dmar_find_matched_drhd_unit(pdev);
1799         if (!drhd) {
1800                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1801                         pci_name(pdev));
1802                 return NULL;
1803         }
1804         iommu = drhd->iommu;
1805
1806         domain = iommu_alloc_domain(iommu);
1807         if (!domain)
1808                 goto error;
1809
1810         if (domain_init(domain, gaw)) {
1811                 domain_exit(domain);
1812                 goto error;
1813         }
1814
1815         /* register pcie-to-pci device */
1816         if (dev_tmp) {
1817                 info = alloc_devinfo_mem();
1818                 if (!info) {
1819                         domain_exit(domain);
1820                         goto error;
1821                 }
1822                 info->bus = bus;
1823                 info->devfn = devfn;
1824                 info->dev = NULL;
1825                 info->domain = domain;
1826                 /* This domain is shared by devices under p2p bridge */
1827                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1828
1829                 /* pcie-to-pci bridge already has a domain, use it */
1830                 found = NULL;
1831                 spin_lock_irqsave(&device_domain_lock, flags);
1832                 list_for_each_entry(tmp, &device_domain_list, global) {
1833                         if (tmp->bus == bus && tmp->devfn == devfn) {
1834                                 found = tmp->domain;
1835                                 break;
1836                         }
1837                 }
1838                 if (found) {
1839                         free_devinfo_mem(info);
1840                         domain_exit(domain);
1841                         domain = found;
1842                 } else {
1843                         list_add(&info->link, &domain->devices);
1844                         list_add(&info->global, &device_domain_list);
1845                 }
1846                 spin_unlock_irqrestore(&device_domain_lock, flags);
1847         }
1848
1849 found_domain:
1850         info = alloc_devinfo_mem();
1851         if (!info)
1852                 goto error;
1853         info->bus = pdev->bus->number;
1854         info->devfn = pdev->devfn;
1855         info->dev = pdev;
1856         info->domain = domain;
1857         spin_lock_irqsave(&device_domain_lock, flags);
1858         /* somebody else was faster and already set up the domain */
1859         found = find_domain(pdev);
1860         if (found != NULL) {
1861                 spin_unlock_irqrestore(&device_domain_lock, flags);
1862                 if (found != domain) {
1863                         domain_exit(domain);
1864                         domain = found;
1865                 }
1866                 free_devinfo_mem(info);
1867                 return domain;
1868         }
1869         list_add(&info->link, &domain->devices);
1870         list_add(&info->global, &device_domain_list);
1871         pdev->dev.archdata.iommu = info;
1872         spin_unlock_irqrestore(&device_domain_lock, flags);
1873         return domain;
1874 error:
1875         /* recheck it here, maybe others set it */
1876         return find_domain(pdev);
1877 }
1878
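     /*
      * Set up a 1:1 (identity) mapping of [start, end) for @pdev: reserve
      * the iova range so the DMA API never hands it out, clear any stale
      * ptes, map every page onto itself and program the context entry.
      */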
1879 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1880                                       unsigned long long start,
1881                                       unsigned long long end)
1882 {
1883         struct dmar_domain *domain;
1884         unsigned long size;
1885         unsigned long long base;
1886         int ret;
1887
1888         printk(KERN_INFO
1889                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1890                 pci_name(pdev), start, end);
1891         /* page table init */
1892         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1893         if (!domain)
1894                 return -ENOMEM;
1895
1896         /* The address might not be aligned */
1897         base = start & PAGE_MASK;
1898         size = end - base;
1899         size = PAGE_ALIGN(size);
1900         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1901                         IOVA_PFN(base + size) - 1)) {
1902                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1903                 ret = -ENOMEM;
1904                 goto error;
1905         }
1906
1907         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1908                 size, base, pci_name(pdev));
1909         /*
1910          * The RMRR range might overlap with a physical memory range,
1911          * so clear it first
1912          */
1913         dma_pte_clear_range(domain, base, base + size);
1914
1915         ret = domain_page_mapping(domain, base, base, size,
1916                 DMA_PTE_READ|DMA_PTE_WRITE);
1917         if (ret)
1918                 goto error;
1919
1920         /* context entry init */
1921         ret = domain_context_mapping(domain, pdev);
1922         if (!ret)
1923                 return 0;
1924 error:
1925         domain_exit(domain);
1926         return ret;
1927
1928 }
1929
1930 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1931         struct pci_dev *pdev)
1932 {
1933         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1934                 return 0;
1935         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1936                 rmrr->end_address + 1);
1937 }
1938
1939 #ifdef CONFIG_DMAR_GFX_WA
1940 struct iommu_prepare_data {
1941         struct pci_dev *pdev;
1942         int ret;
1943 };
1944
1945 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1946                                          unsigned long end_pfn, void *datax)
1947 {
1948         struct iommu_prepare_data *data;
1949
1950         data = (struct iommu_prepare_data *)datax;
1951
1952         data->ret = iommu_prepare_identity_map(data->pdev,
1953                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1954         return data->ret;
1955
1956 }
1957
1958 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1959 {
1960         int nid;
1961         struct iommu_prepare_data data;
1962
1963         data.pdev = pdev;
1964         data.ret = 0;
1965
1966         for_each_online_node(nid) {
1967                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1968                 if (data.ret)
1969                         return data.ret;
1970         }
1971         return data.ret;
1972 }
1973
1974 static void __init iommu_prepare_gfx_mapping(void)
1975 {
1976         struct pci_dev *pdev = NULL;
1977         int ret;
1978
1979         for_each_pci_dev(pdev) {
1980                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1981                                 !IS_GFX_DEVICE(pdev))
1982                         continue;
1983                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1984                         pci_name(pdev));
1985                 ret = iommu_prepare_with_active_regions(pdev);
1986                 if (ret)
1987                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1988         }
1989 }
1990 #else /* !CONFIG_DMAR_GFX_WA */
1991 static inline void iommu_prepare_gfx_mapping(void)
1992 {
1993         return;
1994 }
1995 #endif
1996
1997 #ifdef CONFIG_DMAR_FLOPPY_WA
1998 static inline void iommu_prepare_isa(void)
1999 {
2000         struct pci_dev *pdev;
2001         int ret;
2002
2003         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2004         if (!pdev)
2005                 return;
2006
2007         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
2008         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2009
2010         if (ret)
2011                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
2012                         "floppy might not work\n");
2013
2014 }
2015 #else
2016 static inline void iommu_prepare_isa(void)
2017 {
2018         return;
2019 }
2020 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2021
2022 static int __init init_dmars(void)
2023 {
2024         struct dmar_drhd_unit *drhd;
2025         struct dmar_rmrr_unit *rmrr;
2026         struct pci_dev *pdev;
2027         struct intel_iommu *iommu;
2028         int i, ret, unit = 0;
2029
2030         /*
2031          * for each drhd
2032          *    allocate root
2033          *    initialize and program root entry to not present
2034          * endfor
2035          */
2036         for_each_drhd_unit(drhd) {
2037                 g_num_of_iommus++;
2038                 /*
2039                  * lock not needed as this is only incremented in the
2040                  * single-threaded kernel __init code path; all other
2041                  * accesses are read-only
2042                  */
2043         }
2044
2045         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2046                         GFP_KERNEL);
2047         if (!g_iommus) {
2048                 printk(KERN_ERR "Allocating global iommu array failed\n");
2049                 ret = -ENOMEM;
2050                 goto error;
2051         }
2052
2053         deferred_flush = kzalloc(g_num_of_iommus *
2054                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2055         if (!deferred_flush) {
2056                 kfree(g_iommus);
2057                 ret = -ENOMEM;
2058                 goto error;
2059         }
2060
2061         for_each_drhd_unit(drhd) {
2062                 if (drhd->ignored)
2063                         continue;
2064
2065                 iommu = drhd->iommu;
2066                 g_iommus[iommu->seq_id] = iommu;
2067
2068                 ret = iommu_init_domains(iommu);
2069                 if (ret)
2070                         goto error;
2071
2072                 /*
2073                  * TBD:
2074                  * we could share the same root & context tables
2075                  * among all IOMMUs. Need to split it later.
2076                  */
2077                 ret = iommu_alloc_root_entry(iommu);
2078                 if (ret) {
2079                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2080                         goto error;
2081                 }
2082         }
2083
2084         for_each_drhd_unit(drhd) {
2085                 if (drhd->ignored)
2086                         continue;
2087
2088                 iommu = drhd->iommu;
2089                 if (dmar_enable_qi(iommu)) {
2090                         /*
2091                          * Queued Invalidate not enabled, use Register Based
2092                          * Invalidate
2093                          */
2094                         iommu->flush.flush_context = __iommu_flush_context;
2095                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2096                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2097                                "invalidation\n",
2098                                (unsigned long long)drhd->reg_base_addr);
2099                 } else {
2100                         iommu->flush.flush_context = qi_flush_context;
2101                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2102                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2103                                "invalidation\n",
2104                                (unsigned long long)drhd->reg_base_addr);
2105                 }
2106         }
2107
2108         /*
2109          * For each rmrr
2110          *   for each dev attached to rmrr
2111          *   do
2112          *     locate drhd for dev, alloc domain for dev
2113          *     allocate free domain
2114          *     allocate page table entries for rmrr
2115          *     if context not allocated for bus
2116          *           allocate and init context
2117          *           set present in root table for this bus
2118          *     init context with domain, translation etc
2119          *    endfor
2120          * endfor
2121          */
2122         for_each_rmrr_units(rmrr) {
2123                 for (i = 0; i < rmrr->devices_cnt; i++) {
2124                         pdev = rmrr->devices[i];
2125                         /* some BIOSes list non-existent devices in the DMAR table */
2126                         if (!pdev)
2127                                 continue;
2128                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2129                         if (ret)
2130                                 printk(KERN_ERR
2131                                  "IOMMU: mapping reserved region failed\n");
2132                 }
2133         }
2134
2135         iommu_prepare_gfx_mapping();
2136
2137         iommu_prepare_isa();
2138
2139         /*
2140          * for each drhd
2141          *   enable fault log
2142          *   global invalidate context cache
2143          *   global invalidate iotlb
2144          *   enable translation
2145          */
2146         for_each_drhd_unit(drhd) {
2147                 if (drhd->ignored)
2148                         continue;
2149                 iommu = drhd->iommu;
2150                 sprintf(iommu->name, "dmar%d", unit++);
2151
2152                 iommu_flush_write_buffer(iommu);
2153
2154                 ret = dmar_set_interrupt(iommu);
2155                 if (ret)
2156                         goto error;
2157
2158                 iommu_set_root_entry(iommu);
2159
2160                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2161                                            0);
2162                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2163                                          0);
2164                 iommu_disable_protect_mem_regions(iommu);
2165
2166                 ret = iommu_enable_translation(iommu);
2167                 if (ret)
2168                         goto error;
2169         }
2170
2171         return 0;
2172 error:
2173         for_each_drhd_unit(drhd) {
2174                 if (drhd->ignored)
2175                         continue;
2176                 iommu = drhd->iommu;
2177                 free_iommu(iommu);
2178         }
2179         kfree(g_iommus);
2180         return ret;
2181 }
2182
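     /*
      * Size of the request rounded up to whole pages, taking the sub-page
      * offset of host_addr into account.
      */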
2183 static inline u64 aligned_size(u64 host_addr, size_t size)
2184 {
2185         u64 addr;
2186         addr = (host_addr & (~PAGE_MASK)) + size;
2187         return PAGE_ALIGN(addr);
2188 }
2189
2190 struct iova *
2191 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2192 {
2193         struct iova *piova;
2194
2195         /* Make sure it's in range */
2196         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2197         if (!size || (IOVA_START_ADDR + size > end))
2198                 return NULL;
2199
2200         piova = alloc_iova(&domain->iovad,
2201                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2202         return piova;
2203 }
2204
2205 static struct iova *
2206 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2207                    size_t size, u64 dma_mask)
2208 {
2209         struct pci_dev *pdev = to_pci_dev(dev);
2210         struct iova *iova = NULL;
2211
2212         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2213                 iova = iommu_alloc_iova(domain, size, dma_mask);
2214         else {
2215                 /*
2216                  * First try to allocate an io virtual address below
2217                  * DMA_32BIT_MASK, and if that fails then try allocating
2218                  * from the higher range
2219                  */
2220                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2221                 if (!iova)
2222                         iova = iommu_alloc_iova(domain, size, dma_mask);
2223         }
2224
2225         if (!iova) {
2226                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2227                 return NULL;
2228         }
2229
2230         return iova;
2231 }
2232
2233 static struct dmar_domain *
2234 get_valid_domain_for_dev(struct pci_dev *pdev)
2235 {
2236         struct dmar_domain *domain;
2237         int ret;
2238
2239         domain = get_domain_for_dev(pdev,
2240                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2241         if (!domain) {
2242                 printk(KERN_ERR
2243                         "Allocating domain for %s failed\n", pci_name(pdev));
2244                 return NULL;
2245         }
2246
2247         /* make sure context mapping is ok */
2248         if (unlikely(!domain_context_mapped(pdev))) {
2249                 ret = domain_context_mapping(domain, pdev);
2250                 if (ret) {
2251                         printk(KERN_ERR
2252                                 "Domain context map for %s failed\n",
2253                                 pci_name(pdev));
2254                         return NULL;
2255                 }
2256         }
2257
2258         return domain;
2259 }
2260
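     /*
      * Core of the single-mapping path: find (or create) the device's
      * domain, allocate an iova large enough for the request, fill in the
      * page table entries and flush the IOTLB for the new mapping.
      * Returns the bus address for the driver, or 0 on failure.
      */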
2261 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2262                                      size_t size, int dir, u64 dma_mask)
2263 {
2264         struct pci_dev *pdev = to_pci_dev(hwdev);
2265         struct dmar_domain *domain;
2266         phys_addr_t start_paddr;
2267         struct iova *iova;
2268         int prot = 0;
2269         int ret;
2270         struct intel_iommu *iommu;
2271
2272         BUG_ON(dir == DMA_NONE);
2273         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2274                 return paddr;
2275
2276         domain = get_valid_domain_for_dev(pdev);
2277         if (!domain)
2278                 return 0;
2279
2280         iommu = domain_get_iommu(domain);
2281         size = aligned_size((u64)paddr, size);
2282
2283         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2284         if (!iova)
2285                 goto error;
2286
2287         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2288
2289         /*
2290          * Check if DMAR supports zero-length reads on write-only
2291          * mappings.
2292          */
2293         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2294                         !cap_zlr(iommu->cap))
2295                 prot |= DMA_PTE_READ;
2296         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2297                 prot |= DMA_PTE_WRITE;
2298         /*
2299          * The range paddr .. paddr + size might span a partial page, so
2300          * map the whole page.  Note: if two parts of one page are mapped
2301          * separately, we might have two guest_addrs mapping to the same
2302          * host paddr, but this is not a big problem
2303          */
2304         ret = domain_page_mapping(domain, start_paddr,
2305                 ((u64)paddr) & PAGE_MASK, size, prot);
2306         if (ret)
2307                 goto error;
2308
2309         /* it's a non-present to present mapping */
2310         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2311                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2312         if (ret)
2313                 iommu_flush_write_buffer(iommu);
2314
2315         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2316
2317 error:
2318         if (iova)
2319                 __free_iova(&domain->iovad, iova);
2320         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2321                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2322         return 0;
2323 }
2324
2325 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2326                             size_t size, int dir)
2327 {
2328         return __intel_map_single(hwdev, paddr, size, dir,
2329                                   to_pci_dev(hwdev)->dma_mask);
2330 }
2331
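     /*
      * Drain the deferred-unmap queue: one global IOTLB flush per iommu
      * with pending entries, then return the queued iovas to their
      * domains.  Runs with async_umap_flush_lock held, from the timer or
      * when the queue reaches HIGH_WATER_MARK.
      */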
2332 static void flush_unmaps(void)
2333 {
2334         int i, j;
2335
2336         timer_on = 0;
2337
2338         /* just flush them all */
2339         for (i = 0; i < g_num_of_iommus; i++) {
2340                 struct intel_iommu *iommu = g_iommus[i];
2341                 if (!iommu)
2342                         continue;
2343
2344                 if (deferred_flush[i].next) {
2345                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2346                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2347                         for (j = 0; j < deferred_flush[i].next; j++) {
2348                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2349                                                 deferred_flush[i].iova[j]);
2350                         }
2351                         deferred_flush[i].next = 0;
2352                 }
2353         }
2354
2355         list_size = 0;
2356 }
2357
2358 static void flush_unmaps_timeout(unsigned long data)
2359 {
2360         unsigned long flags;
2361
2362         spin_lock_irqsave(&async_umap_flush_lock, flags);
2363         flush_unmaps();
2364         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2365 }
2366
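     /*
      * Queue an iova for deferred freeing instead of flushing the IOTLB
      * synchronously; the batch is drained by unmap_timer roughly 10ms
      * later, or immediately once HIGH_WATER_MARK entries have piled up.
      */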
2367 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2368 {
2369         unsigned long flags;
2370         int next, iommu_id;
2371         struct intel_iommu *iommu;
2372
2373         spin_lock_irqsave(&async_umap_flush_lock, flags);
2374         if (list_size == HIGH_WATER_MARK)
2375                 flush_unmaps();
2376
2377         iommu = domain_get_iommu(dom);
2378         iommu_id = iommu->seq_id;
2379
2380         next = deferred_flush[iommu_id].next;
2381         deferred_flush[iommu_id].domain[next] = dom;
2382         deferred_flush[iommu_id].iova[next] = iova;
2383         deferred_flush[iommu_id].next++;
2384
2385         if (!timer_on) {
2386                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2387                 timer_on = 1;
2388         }
2389         list_size++;
2390         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2391 }
2392
2393 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2394                         int dir)
2395 {
2396         struct pci_dev *pdev = to_pci_dev(dev);
2397         struct dmar_domain *domain;
2398         unsigned long start_addr;
2399         struct iova *iova;
2400         struct intel_iommu *iommu;
2401
2402         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2403                 return;
2404         domain = find_domain(pdev);
2405         BUG_ON(!domain);
2406
2407         iommu = domain_get_iommu(domain);
2408
2409         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2410         if (!iova)
2411                 return;
2412
2413         start_addr = iova->pfn_lo << PAGE_SHIFT;
2414         size = aligned_size((u64)dev_addr, size);
2415
2416         pr_debug("Device %s unmapping: %zx@%llx\n",
2417                 pci_name(pdev), size, (unsigned long long)start_addr);
2418
2419         /*  clear the whole page */
2420         dma_pte_clear_range(domain, start_addr, start_addr + size);
2421         /* free page tables */
2422         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2423         if (intel_iommu_strict) {
2424                 if (iommu_flush_iotlb_psi(iommu,
2425                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2426                         iommu_flush_write_buffer(iommu);
2427                 /* free iova */
2428                 __free_iova(&domain->iovad, iova);
2429         } else {
2430                 add_unmap(domain, iova);
2431                 /*
2432                  * queue up the release of the unmap to save roughly 1/6th
2433                  * of the cpu time used up by the iotlb flush operation...
2434                  */
2435         }
2436 }
2437
2438 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2439                            dma_addr_t *dma_handle, gfp_t flags)
2440 {
2441         void *vaddr;
2442         int order;
2443
2444         size = PAGE_ALIGN(size);
2445         order = get_order(size);
2446         flags &= ~(GFP_DMA | GFP_DMA32);
2447
2448         vaddr = (void *)__get_free_pages(flags, order);
2449         if (!vaddr)
2450                 return NULL;
2451         memset(vaddr, 0, size);
2452
2453         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2454                                          DMA_BIDIRECTIONAL,
2455                                          hwdev->coherent_dma_mask);
2456         if (*dma_handle)
2457                 return vaddr;
2458         free_pages((unsigned long)vaddr, order);
2459         return NULL;
2460 }
2461
2462 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2463                          dma_addr_t dma_handle)
2464 {
2465         int order;
2466
2467         size = PAGE_ALIGN(size);
2468         order = get_order(size);
2469
2470         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2471         free_pages((unsigned long)vaddr, order);
2472 }
2473
2474 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2475                     int nelems, int dir)
2476 {
2477         int i;
2478         struct pci_dev *pdev = to_pci_dev(hwdev);
2479         struct dmar_domain *domain;
2480         unsigned long start_addr;
2481         struct iova *iova;
2482         size_t size = 0;
2483         phys_addr_t addr;
2484         struct scatterlist *sg;
2485         struct intel_iommu *iommu;
2486
2487         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2488                 return;
2489
2490         domain = find_domain(pdev);
2491         BUG_ON(!domain);
2492
2493         iommu = domain_get_iommu(domain);
2494
2495         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2496         if (!iova)
2497                 return;
2498         for_each_sg(sglist, sg, nelems, i) {
2499                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2500                 size += aligned_size((u64)addr, sg->length);
2501         }
2502
2503         start_addr = iova->pfn_lo << PAGE_SHIFT;
2504
2505         /*  clear the whole page */
2506         dma_pte_clear_range(domain, start_addr, start_addr + size);
2507         /* free page tables */
2508         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2509
2510         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2511                         size >> VTD_PAGE_SHIFT, 0))
2512                 iommu_flush_write_buffer(iommu);
2513
2514         /* free iova */
2515         __free_iova(&domain->iovad, iova);
2516 }
2517
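     /*
      * Scatterlist mapping for devices in the dummy (untranslated) domain:
      * the DMA address is simply the physical address of each segment.
      */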
2518 static int intel_nontranslate_map_sg(struct device *hddev,
2519         struct scatterlist *sglist, int nelems, int dir)
2520 {
2521         int i;
2522         struct scatterlist *sg;
2523
2524         for_each_sg(sglist, sg, nelems, i) {
2525                 BUG_ON(!sg_page(sg));
2526                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2527                 sg->dma_length = sg->length;
2528         }
2529         return nelems;
2530 }
2531
2532 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2533                  int dir)
2534 {
2535         phys_addr_t addr;
2536         int i;
2537         struct pci_dev *pdev = to_pci_dev(hwdev);
2538         struct dmar_domain *domain;
2539         size_t size = 0;
2540         int prot = 0;
2541         size_t offset = 0;
2542         struct iova *iova = NULL;
2543         int ret;
2544         struct scatterlist *sg;
2545         unsigned long start_addr;
2546         struct intel_iommu *iommu;
2547
2548         BUG_ON(dir == DMA_NONE);
2549         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2550                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2551
2552         domain = get_valid_domain_for_dev(pdev);
2553         if (!domain)
2554                 return 0;
2555
2556         iommu = domain_get_iommu(domain);
2557
2558         for_each_sg(sglist, sg, nelems, i) {
2559                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2560                 size += aligned_size((u64)addr, sg->length);
2561         }
2562
2563         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2564         if (!iova) {
2565                 sglist->dma_length = 0;
2566                 return 0;
2567         }
2568
2569         /*
2570          * Check if DMAR supports zero-length reads on write-only
2571          * mappings.
2572          */
2573         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2574                         !cap_zlr(iommu->cap))
2575                 prot |= DMA_PTE_READ;
2576         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2577                 prot |= DMA_PTE_WRITE;
2578
2579         start_addr = iova->pfn_lo << PAGE_SHIFT;
2580         offset = 0;
2581         for_each_sg(sglist, sg, nelems, i) {
2582                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2583                 size = aligned_size((u64)addr, sg->length);
2584                 ret = domain_page_mapping(domain, start_addr + offset,
2585                         ((u64)addr) & PAGE_MASK,
2586                         size, prot);
2587                 if (ret) {
2588                         /*  clear the page */
2589                         dma_pte_clear_range(domain, start_addr,
2590                                   start_addr + offset);
2591                         /* free page tables */
2592                         dma_pte_free_pagetable(domain, start_addr,
2593                                   start_addr + offset);
2594                         /* free iova */
2595                         __free_iova(&domain->iovad, iova);
2596                         return 0;
2597                 }
2598                 sg->dma_address = start_addr + offset +
2599                                 ((u64)addr & (~PAGE_MASK));
2600                 sg->dma_length = sg->length;
2601                 offset += size;
2602         }
2603
2604         /* it's a non-present to present mapping */
2605         if (iommu_flush_iotlb_psi(iommu, domain->id,
2606                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2607                 iommu_flush_write_buffer(iommu);
2608         return nelems;
2609 }
2610
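     /*
      * These operations are installed as the global dma_ops in
      * intel_iommu_init(), so ordinary DMA API calls made by drivers are
      * presumably routed through the IOMMU without any driver changes.
      */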
2611 static struct dma_mapping_ops intel_dma_ops = {
2612         .alloc_coherent = intel_alloc_coherent,
2613         .free_coherent = intel_free_coherent,
2614         .map_single = intel_map_single,
2615         .unmap_single = intel_unmap_single,
2616         .map_sg = intel_map_sg,
2617         .unmap_sg = intel_unmap_sg,
2618 };
2619
2620 static inline int iommu_domain_cache_init(void)
2621 {
2622         int ret = 0;
2623
2624         iommu_domain_cache = kmem_cache_create("iommu_domain",
2625                                          sizeof(struct dmar_domain),
2626                                          0,
2627                                          SLAB_HWCACHE_ALIGN,
2628                                          NULL);
2629
2630         if (!iommu_domain_cache) {
2631                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2632                 ret = -ENOMEM;
2633         }
2634
2635         return ret;
2636 }
2637
2638 static inline int iommu_devinfo_cache_init(void)
2639 {
2640         int ret = 0;
2641
2642         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2643                                          sizeof(struct device_domain_info),
2644                                          0,
2645                                          SLAB_HWCACHE_ALIGN,
2646                                          NULL);
2647         if (!iommu_devinfo_cache) {
2648                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2649                 ret = -ENOMEM;
2650         }
2651
2652         return ret;
2653 }
2654
2655 static inline int iommu_iova_cache_init(void)
2656 {
2657         int ret = 0;
2658
2659         iommu_iova_cache = kmem_cache_create("iommu_iova",
2660                                          sizeof(struct iova),
2661                                          0,
2662                                          SLAB_HWCACHE_ALIGN,
2663                                          NULL);
2664         if (!iommu_iova_cache) {
2665                 printk(KERN_ERR "Couldn't create iova cache\n");
2666                 ret = -ENOMEM;
2667         }
2668
2669         return ret;
2670 }
2671
2672 static int __init iommu_init_mempool(void)
2673 {
2674         int ret;
2675         ret = iommu_iova_cache_init();
2676         if (ret)
2677                 return ret;
2678
2679         ret = iommu_domain_cache_init();
2680         if (ret)
2681                 goto domain_error;
2682
2683         ret = iommu_devinfo_cache_init();
2684         if (!ret)
2685                 return ret;
2686
2687         kmem_cache_destroy(iommu_domain_cache);
2688 domain_error:
2689         kmem_cache_destroy(iommu_iova_cache);
2690
2691         return -ENOMEM;
2692 }
2693
2694 static void __init iommu_exit_mempool(void)
2695 {
2696         kmem_cache_destroy(iommu_devinfo_cache);
2697         kmem_cache_destroy(iommu_domain_cache);
2698         kmem_cache_destroy(iommu_iova_cache);
2699
2700 }
2701
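     /*
      * Mark DRHD units that cover no PCI devices as ignored and, when gfx
      * mapping is disabled, do the same for units that cover nothing but
      * graphics devices; those gfx devices are then placed in the dummy
      * pass-through domain.
      */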
2702 static void __init init_no_remapping_devices(void)
2703 {
2704         struct dmar_drhd_unit *drhd;
2705
2706         for_each_drhd_unit(drhd) {
2707                 if (!drhd->include_all) {
2708                         int i;
2709                         for (i = 0; i < drhd->devices_cnt; i++)
2710                                 if (drhd->devices[i] != NULL)
2711                                         break;
2712                         /* ignore DMAR unit if no pci devices exist */
2713                         if (i == drhd->devices_cnt)
2714                                 drhd->ignored = 1;
2715                 }
2716         }
2717
2718         if (dmar_map_gfx)
2719                 return;
2720
2721         for_each_drhd_unit(drhd) {
2722                 int i;
2723                 if (drhd->ignored || drhd->include_all)
2724                         continue;
2725
2726                 for (i = 0; i < drhd->devices_cnt; i++)
2727                         if (drhd->devices[i] &&
2728                                 !IS_GFX_DEVICE(drhd->devices[i]))
2729                                 break;
2730
2731                 if (i < drhd->devices_cnt)
2732                         continue;
2733
2734                 /* bypass IOMMU if it is just for gfx devices */
2735                 drhd->ignored = 1;
2736                 for (i = 0; i < drhd->devices_cnt; i++) {
2737                         if (!drhd->devices[i])
2738                                 continue;
2739                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2740                 }
2741         }
2742 }
2743
2744 int __init intel_iommu_init(void)
2745 {
2746         int ret = 0;
2747
2748         if (dmar_table_init())
2749                 return  -ENODEV;
2750
2751         if (dmar_dev_scope_init())
2752                 return  -ENODEV;
2753
2754         /*
2755          * Check the need for DMA-remapping initialization now.
2756          * Above initialization will also be used by Interrupt-remapping.
2757          */
2758         if (no_iommu || swiotlb || dmar_disabled)
2759                 return -ENODEV;
2760
2761         iommu_init_mempool();
2762         dmar_init_reserved_ranges();
2763
2764         init_no_remapping_devices();
2765
2766         ret = init_dmars();
2767         if (ret) {
2768                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2769                 put_iova_domain(&reserved_iova_list);
2770                 iommu_exit_mempool();
2771                 return ret;
2772         }
2773         printk(KERN_INFO
2774         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2775
2776         init_timer(&unmap_timer);
2777         force_iommu = 1;
2778         dma_ops = &intel_dma_ops;
2779
2780         register_iommu(&intel_iommu_ops);
2781
2782         return 0;
2783 }
2784
2785 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2786                                   struct pci_dev *pdev)
2787 {
2788         struct device_domain_info *info;
2789         unsigned long flags;
2790
2791         info = alloc_devinfo_mem();
2792         if (!info)
2793                 return -ENOMEM;
2794
2795         info->bus = pdev->bus->number;
2796         info->devfn = pdev->devfn;
2797         info->dev = pdev;
2798         info->domain = domain;
2799
2800         spin_lock_irqsave(&device_domain_lock, flags);
2801         list_add(&info->link, &domain->devices);
2802         list_add(&info->global, &device_domain_list);
2803         pdev->dev.archdata.iommu = info;
2804         spin_unlock_irqrestore(&device_domain_lock, flags);
2805
2806         return 0;
2807 }
2808
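     /*
      * Undo the bridge context entries set up by domain_context_mapping():
      * detach every bridge between @pdev and its upstream PCIe-to-PCI
      * bridge, and finally the bridge itself.
      */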
2809 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
2810                                            struct pci_dev *pdev)
2811 {
2812         struct pci_dev *tmp, *parent;
2813
2814         if (!iommu || !pdev)
2815                 return;
2816
2817         /* dependent device detach */
2818         tmp = pci_find_upstream_pcie_bridge(pdev);
2819         /* Secondary interface's bus number and devfn 0 */
2820         if (tmp) {
2821                 parent = pdev->bus->self;
2822                 while (parent != tmp) {
2823                         iommu_detach_dev(iommu, parent->bus->number,
2824                                 parent->devfn);
2825                         parent = parent->bus->self;
2826                 }
2827                 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
2828                         iommu_detach_dev(iommu,
2829                                 tmp->subordinate->number, 0);
2830                 else /* this is a legacy PCI bridge */
2831                         iommu_detach_dev(iommu,
2832                                 tmp->bus->number, tmp->devfn);
2833         }
2834 }
2835
2836 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2837                                           struct pci_dev *pdev)
2838 {
2839         struct device_domain_info *info;
2840         struct intel_iommu *iommu;
2841         unsigned long flags;
2842         int found = 0;
2843         struct list_head *entry, *tmp;
2844
2845         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2846         if (!iommu)
2847                 return;
2848
2849         spin_lock_irqsave(&device_domain_lock, flags);
2850         list_for_each_safe(entry, tmp, &domain->devices) {
2851                 info = list_entry(entry, struct device_domain_info, link);
2852                 if (info->bus == pdev->bus->number &&
2853                     info->devfn == pdev->devfn) {
2854                         list_del(&info->link);
2855                         list_del(&info->global);
2856                         if (info->dev)
2857                                 info->dev->dev.archdata.iommu = NULL;
2858                         spin_unlock_irqrestore(&device_domain_lock, flags);
2859
2860                         iommu_detach_dev(iommu, info->bus, info->devfn);
2861                         iommu_detach_dependent_devices(iommu, pdev);
2862                         free_devinfo_mem(info);
2863
2864                         spin_lock_irqsave(&device_domain_lock, flags);
2865
2866                         if (found)
2867                                 break;
2868                         else
2869                                 continue;
2870                 }
2871
2872                 /* if there are no other devices under the same iommu
2873                  * owned by this domain, clear this iommu in iommu_bmp,
2874                  * update the iommu count and coherency
2875                  */
2876                 if (device_to_iommu(info->bus, info->devfn) == iommu)
2877                         found = 1;
2878         }
2879
2880         if (found == 0) {
2881                 unsigned long tmp_flags;
2882                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2883                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2884                 domain->iommu_count--;
2885                 domain_update_iommu_cap(domain);
2886                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2887         }
2888
2889         spin_unlock_irqrestore(&device_domain_lock, flags);
2890 }
2891
2892 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2893 {
2894         struct device_domain_info *info;
2895         struct intel_iommu *iommu;
2896         unsigned long flags1, flags2;
2897
2898         spin_lock_irqsave(&device_domain_lock, flags1);
2899         while (!list_empty(&domain->devices)) {
2900                 info = list_entry(domain->devices.next,
2901                         struct device_domain_info, link);
2902                 list_del(&info->link);
2903                 list_del(&info->global);
2904                 if (info->dev)
2905                         info->dev->dev.archdata.iommu = NULL;
2906
2907                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2908
2909                 iommu = device_to_iommu(info->bus, info->devfn);
2910                 iommu_detach_dev(iommu, info->bus, info->devfn);
2911                 iommu_detach_dependent_devices(iommu, info->dev);
2912
2913                 /* clear this iommu in iommu_bmp, update iommu count
2914                  * and capabilities
2915                  */
2916                 spin_lock_irqsave(&domain->iommu_lock, flags2);
2917                 if (test_and_clear_bit(iommu->seq_id,
2918                                        &domain->iommu_bmp)) {
2919                         domain->iommu_count--;
2920                         domain_update_iommu_cap(domain);
2921                 }
2922                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2923
2924                 free_devinfo_mem(info);
2925                 spin_lock_irqsave(&device_domain_lock, flags1);
2926         }
2927         spin_unlock_irqrestore(&device_domain_lock, flags1);
2928 }
2929
2930 /* domain id for a virtual machine; it is never written to a context entry */
2931 static unsigned long vm_domid;
2932
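     /*
      * Smallest adjusted guest address width among all iommus this VM
      * domain currently spans; it bounds how high the domain can be
      * mapped.
      */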
2933 static int vm_domain_min_agaw(struct dmar_domain *domain)
2934 {
2935         int i;
2936         int min_agaw = domain->agaw;
2937
2938         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2939         for (; i < g_num_of_iommus; ) {
2940                 if (min_agaw > g_iommus[i]->agaw)
2941                         min_agaw = g_iommus[i]->agaw;
2942
2943                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2944         }
2945
2946         return min_agaw;
2947 }
2948
2949 static struct dmar_domain *iommu_alloc_vm_domain(void)
2950 {
2951         struct dmar_domain *domain;
2952
2953         domain = alloc_domain_mem();
2954         if (!domain)
2955                 return NULL;
2956
2957         domain->id = vm_domid++;
2958         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2959         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2960
2961         return domain;
2962 }
2963
2964 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2965 {
2966         int adjust_width;
2967
2968         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2969         spin_lock_init(&domain->mapping_lock);
2970         spin_lock_init(&domain->iommu_lock);
2971
2972         domain_reserve_special_ranges(domain);
2973
2974         /* calculate AGAW */
2975         domain->gaw = guest_width;
2976         adjust_width = guestwidth_to_adjustwidth(guest_width);
2977         domain->agaw = width_to_agaw(adjust_width);
2978
2979         INIT_LIST_HEAD(&domain->devices);
2980
2981         domain->iommu_count = 0;
2982         domain->iommu_coherency = 0;
2983         domain->max_addr = 0;
2984
2985         /* always allocate the top pgd */
2986         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2987         if (!domain->pgd)
2988                 return -ENOMEM;
2989         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2990         return 0;
2991 }
2992
2993 static void iommu_free_vm_domain(struct dmar_domain *domain)
2994 {
2995         unsigned long flags;
2996         struct dmar_drhd_unit *drhd;
2997         struct intel_iommu *iommu;
2998         unsigned long i;
2999         unsigned long ndomains;
3000
3001         for_each_drhd_unit(drhd) {
3002                 if (drhd->ignored)
3003                         continue;
3004                 iommu = drhd->iommu;
3005
3006                 ndomains = cap_ndoms(iommu->cap);
3007                 i = find_first_bit(iommu->domain_ids, ndomains);
3008                 for (; i < ndomains; ) {
3009                         if (iommu->domains[i] == domain) {
3010                                 spin_lock_irqsave(&iommu->lock, flags);
3011                                 clear_bit(i, iommu->domain_ids);
3012                                 iommu->domains[i] = NULL;
3013                                 spin_unlock_irqrestore(&iommu->lock, flags);
3014                                 break;
3015                         }
3016                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3017                 }
3018         }
3019 }
3020
3021 static void vm_domain_exit(struct dmar_domain *domain)
3022 {
3023         u64 end;
3024
3025         /* Domain 0 is reserved, so don't process it */
3026         if (!domain)
3027                 return;
3028
3029         vm_domain_remove_all_dev_info(domain);
3030         /* destroy iovas */
3031         put_iova_domain(&domain->iovad);
3032         end = DOMAIN_MAX_ADDR(domain->gaw);
3033         end = end & (~VTD_PAGE_MASK);
3034
3035         /* clear ptes */
3036         dma_pte_clear_range(domain, 0, end);
3037
3038         /* free page tables */
3039         dma_pte_free_pagetable(domain, 0, end);
3040
3041         iommu_free_vm_domain(domain);
3042         free_domain_mem(domain);
3043 }
3044
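     /*
      * The callbacks below implement the generic struct iommu_ops
      * interface registered via register_iommu() in intel_iommu_init(),
      * presumably for users that manage VM domains directly (e.g. device
      * assignment).
      */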
3045 static int intel_iommu_domain_init(struct iommu_domain *domain)
3046 {
3047         struct dmar_domain *dmar_domain;
3048
3049         dmar_domain = iommu_alloc_vm_domain();
3050         if (!dmar_domain) {
3051                 printk(KERN_ERR
3052                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3053                 return -ENOMEM;
3054         }
3055         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3056                 printk(KERN_ERR
3057                         "intel_iommu_domain_init() failed\n");
3058                 vm_domain_exit(dmar_domain);
3059                 return -ENOMEM;
3060         }
3061         domain->priv = dmar_domain;
3062
3063         return 0;
3064 }
3065
3066 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3067 {
3068         struct dmar_domain *dmar_domain = domain->priv;
3069
3070         domain->priv = NULL;
3071         vm_domain_exit(dmar_domain);
3072 }
3073
3074 static int intel_iommu_attach_device(struct iommu_domain *domain,
3075                                      struct device *dev)
3076 {
3077         struct dmar_domain *dmar_domain = domain->priv;
3078         struct pci_dev *pdev = to_pci_dev(dev);
3079         struct intel_iommu *iommu;
3080         int addr_width;
3081         u64 end;
3082         int ret;
3083
3084         /* normally pdev is not mapped */
3085         if (unlikely(domain_context_mapped(pdev))) {
3086                 struct dmar_domain *old_domain;
3087
3088                 old_domain = find_domain(pdev);
3089                 if (old_domain) {
3090                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3091                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3092                         else
3093                                 domain_remove_dev_info(old_domain);
3094                 }
3095         }
3096
3097         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3098         if (!iommu)
3099                 return -ENODEV;
3100
3101         /* check if this iommu agaw is sufficient for max mapped address */
3102         addr_width = agaw_to_width(iommu->agaw);
3103         end = DOMAIN_MAX_ADDR(addr_width);
3104         end = end & VTD_PAGE_MASK;
3105         if (end < dmar_domain->max_addr) {
3106                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3107                        "sufficient for the mapped address (%llx)\n",
3108                        __func__, iommu->agaw, dmar_domain->max_addr);
3109                 return -EFAULT;
3110         }
3111
3112         ret = domain_context_mapping(dmar_domain, pdev);
3113         if (ret)
3114                 return ret;
3115
3116         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3117         return ret;
3118 }
3119
3120 static void intel_iommu_detach_device(struct iommu_domain *domain,
3121                                       struct device *dev)
3122 {
3123         struct dmar_domain *dmar_domain = domain->priv;
3124         struct pci_dev *pdev = to_pci_dev(dev);
3125
3126         vm_domain_remove_one_dev_info(dmar_domain, pdev);
3127 }
3128
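     /*
      * Map [iova, iova + size) to hpa.  IOMMU_READ/WRITE/CACHE are translated
      * into VT-d PTE bits (snooping only if the domain's IOMMUs support it),
      * and max_addr is grown once the smallest agaw among the attached IOMMUs
      * is known to cover the new top of the mapping.
      */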
3129 static int intel_iommu_map_range(struct iommu_domain *domain,
3130                                  unsigned long iova, phys_addr_t hpa,
3131                                  size_t size, int iommu_prot)
3132 {
3133         struct dmar_domain *dmar_domain = domain->priv;
3134         u64 max_addr;
3135         int addr_width;
3136         int prot = 0;
3137         int ret;
3138
3139         if (iommu_prot & IOMMU_READ)
3140                 prot |= DMA_PTE_READ;
3141         if (iommu_prot & IOMMU_WRITE)
3142                 prot |= DMA_PTE_WRITE;
3143         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3144                 prot |= DMA_PTE_SNP;
3145
3146         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3147         if (dmar_domain->max_addr < max_addr) {
3148                 int min_agaw;
3149                 u64 end;
3150
3151                 /* check if minimum agaw is sufficient for mapped address */
3152                 min_agaw = vm_domain_min_agaw(dmar_domain);
3153                 addr_width = agaw_to_width(min_agaw);
3154                 end = DOMAIN_MAX_ADDR(addr_width);
3155                 end = end & VTD_PAGE_MASK;
3156                 if (end < max_addr) {
3157                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3158                                "sufficient for the mapped address (%llx)\n",
3159                                __func__, min_agaw, max_addr);
3160                         return -EFAULT;
3161                 }
3162                 dmar_domain->max_addr = max_addr;
3163         }
3164
3165         ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3166         return ret;
3167 }
3168
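     /*
      * Unmap a range by clearing its (page-aligned) PTEs, shrinking max_addr
      * if the removed range was the highest mapping in the domain.
      */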
3169 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3170                                     unsigned long iova, size_t size)
3171 {
3172         struct dmar_domain *dmar_domain = domain->priv;
3173         dma_addr_t base;
3174
3175         /* The address might not be aligned */
3176         base = iova & VTD_PAGE_MASK;
3177         size = VTD_PAGE_ALIGN(size);
3178         dma_pte_clear_range(dmar_domain, base, base + size);
3179
3180         if (dmar_domain->max_addr == base + size)
3181                 dmar_domain->max_addr = base;
3182 }
3183
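     /*
      * Return the host physical page address an IOVA is mapped to, or 0 if
      * nothing is mapped there.
      */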
3184 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3185                                             unsigned long iova)
3186 {
3187         struct dmar_domain *dmar_domain = domain->priv;
3188         struct dma_pte *pte;
3189         u64 phys = 0;
3190
3191         pte = addr_to_dma_pte(dmar_domain, iova);
3192         if (pte)
3193                 phys = dma_pte_addr(pte);
3194
3195         return phys;
3196 }
3197
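     /*
      * Report domain capabilities to the IOMMU API; only cache coherency
      * (snoop control) is meaningful for VT-d.
      */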
3198 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3199                                       unsigned long cap)
3200 {
3201         struct dmar_domain *dmar_domain = domain->priv;
3202
3203         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3204                 return dmar_domain->iommu_snooping;
3205
3206         return 0;
3207 }
3208
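     /*
      * Callback table registered with the generic IOMMU layer (via
      * register_iommu() in intel_iommu_init()).  A consumer such as KVM
      * device assignment drives these callbacks through the linux/iommu.h
      * wrappers.  The lines below are only an illustrative sketch of that
      * flow; "pdev", "iova" and "hpa" are placeholders, not identifiers
      * from this driver:
      *
      *     struct iommu_domain *dom;
      *     int r;
      *
      *     dom = iommu_domain_alloc();
      *     if (!dom)
      *             return -ENOMEM;
      *
      *     r = iommu_attach_device(dom, &pdev->dev);
      *     if (r)
      *             goto out_free;
      *     r = iommu_map_range(dom, iova, hpa, PAGE_SIZE,
      *                         IOMMU_READ | IOMMU_WRITE);
      *     if (r)
      *             goto out_detach;
      *
      *     ... the device can now DMA to "iova" ...
      *
      *     iommu_unmap_range(dom, iova, PAGE_SIZE);
      * out_detach:
      *     iommu_detach_device(dom, &pdev->dev);
      * out_free:
      *     iommu_domain_free(dom);
      */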
3209 static struct iommu_ops intel_iommu_ops = {
3210         .domain_init    = intel_iommu_domain_init,
3211         .domain_destroy = intel_iommu_domain_destroy,
3212         .attach_dev     = intel_iommu_attach_device,
3213         .detach_dev     = intel_iommu_detach_device,
3214         .map            = intel_iommu_map_range,
3215         .unmap          = intel_iommu_unmap_range,
3216         .iova_to_phys   = intel_iommu_iova_to_phys,
3217         .domain_has_cap = intel_iommu_domain_has_cap,
3218 };
3219
3220 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3221 {
3222         /*
3223          * Mobile 4 Series Chipset neglects to set RWBF capability,
3224          * but needs it:
3225          */
3226         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3227         rwbf_quirk = 1;
3228 }
3229
3230 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);