Merge git://git.infradead.org/iommu-2.6
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 6af6b62..59f17ac 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -47,6 +47,8 @@
 #define ROOT_SIZE              VTD_PAGE_SIZE
 #define CONTEXT_SIZE           VTD_PAGE_SIZE
 
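+/* Host bridges do no DMA of their own;
+   iommu_prepare_static_identity_mapping() skips them. */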
+#define IS_BRIDGE_HOST_DEVICE(pdev) \
+                           ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
@@ -116,6 +118,11 @@ static inline unsigned long align_to_level(unsigned long pfn, int level)
        return (pfn + level_size(level) - 1) & level_mask(level);
 }
 
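+/* Number of 4KiB VT-d pages covered by one PTE at the given level:
+   512^(lvl-1), i.e. 1 at level 1, 512 (2MiB) at level 2,
+   512^2 (1GiB) at level 3, and so on. */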
+static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
+{
+       return 1 << ((lvl - 1) * LEVEL_STRIDE);
+}
+
 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
    are never going to work. */
 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
@@ -142,6 +149,12 @@ static struct intel_iommu **g_iommus;
 static void __init check_tylersburg_isoch(void);
 static int rwbf_quirk;
 
+/*
+ * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
+ * (used when the kernel is launched with TXT).
+ */
+static int force_on = 0;
+
 /*
  * 0: Present
  * 1-11: Reserved
@@ -338,6 +351,9 @@ struct dmar_domain {
        int             iommu_coherency;/* indicate coherency of iommu access */
        int             iommu_snooping; /* indicate snooping control feature*/
        int             iommu_count;    /* reference count of iommu */
+       int             iommu_superpage;/* Level of superpages supported:
+                                          0 == 4KiB (no superpages), 1 == 2MiB,
+                                          2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
        spinlock_t      iommu_lock;     /* protect iommu set in domain */
        u64             max_addr;       /* maximum mapped address */
 };
@@ -387,6 +403,7 @@ int dmar_disabled = 1;
 static int dmar_map_gfx = 1;
 static int dmar_forcedac;
 static int intel_iommu_strict;
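+/* Use superpage (2MiB and larger) mappings when the hardware supports
+   them; boot with "intel_iommu=sp_off" to disable. */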
+static int intel_iommu_superpage = 1;
 
 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 static DEFINE_SPINLOCK(device_domain_lock);
@@ -417,6 +434,10 @@ static int __init intel_iommu_setup(char *str)
                        printk(KERN_INFO
                                "Intel-IOMMU: disable batched IOTLB flush\n");
                        intel_iommu_strict = 1;
+               } else if (!strncmp(str, "sp_off", 6)) {
+                       printk(KERN_INFO
+                               "Intel-IOMMU: disable superpage support\n");
+                       intel_iommu_superpage = 0;
                }
 
                str += strcspn(str, ",");
@@ -555,11 +576,32 @@ static void domain_update_iommu_snooping(struct dmar_domain *domain)
        }
 }
 
+static void domain_update_iommu_superpage(struct dmar_domain *domain)
+{
+       int i, mask = 0xf;
+
+       if (!intel_iommu_superpage) {
+               domain->iommu_superpage = 0;
+               return;
+       }
+
+       domain->iommu_superpage = 4; /* 1TiB */
+
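+       /* Narrow to the lowest superpage level that every IOMMU in the
+          domain supports. */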
+       for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
+               mask &= cap_super_page_val(g_iommus[i]->cap);
+               if (!mask) {
+                       break;
+               }
+       }
+       domain->iommu_superpage = fls(mask);
+}
+
 /* Some capabilities may be different across iommus */
 static void domain_update_iommu_cap(struct dmar_domain *domain)
 {
        domain_update_iommu_coherency(domain);
        domain_update_iommu_snooping(domain);
+       domain_update_iommu_superpage(domain);
 }
 
 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
@@ -689,23 +731,31 @@ out:
 }
 
 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
-                                     unsigned long pfn)
+                                     unsigned long pfn, int large_level)
 {
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
        struct dma_pte *parent, *pte = NULL;
        int level = agaw_to_level(domain->agaw);
-       int offset;
+       int offset, target_level;
 
        BUG_ON(!domain->pgd);
        BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
        parent = domain->pgd;
 
+       /* Walk down to the requested level; large_level == 0 means level 1
+          (4KiB), stopping early if an existing superpage PTE is found. */
+       if (!large_level)
+               target_level = 1;
+       else
+               target_level = large_level;
+
        while (level > 0) {
                void *tmp_page;
 
                offset = pfn_level_offset(pfn, level);
                pte = &parent[offset];
-               if (level == 1)
+               if (!large_level && (pte->val & DMA_PTE_LARGE_PAGE))
+                       break;
+               if (level == target_level)
                        break;
 
                if (!dma_pte_present(pte)) {
@@ -733,10 +783,11 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
        return pte;
 }
 
 /* return address's pte at specific level */
 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
                                         unsigned long pfn,
-                                        int level)
+                                        int level, int *large_page)
 {
        struct dma_pte *parent, *pte = NULL;
        int total = agaw_to_level(domain->agaw);
@@ -749,8 +800,16 @@ static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
                if (level == total)
                        return pte;
 
-               if (!dma_pte_present(pte))
+               if (!dma_pte_present(pte)) {
+                       *large_page = total;
                        break;
+               }
+
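+               /* An existing superpage PTE maps the whole range below this
+                  level; report its level via *large_page and return it. */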
+               if (pte->val & DMA_PTE_LARGE_PAGE) {
+                       *large_page = total;
+                       return pte;
+               }
+
                parent = phys_to_virt(dma_pte_addr(pte));
                total--;
        }
@@ -763,6 +822,7 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
                                unsigned long last_pfn)
 {
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+       unsigned int large_page = 1;
        struct dma_pte *first_pte, *pte;
 
        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
@@ -771,14 +831,15 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
 
        /* we don't need lock here; nobody else touches the iova range */
        do {
-               first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
+               large_page = 1;
+               first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
                if (!pte) {
-                       start_pfn = align_to_level(start_pfn + 1, 2);
+                       start_pfn = align_to_level(start_pfn + 1, large_page + 1);
                        continue;
                }
-               do { 
+               do {
                        dma_clear_pte(pte);
-                       start_pfn++;
+                       start_pfn += lvl_to_nr_pages(large_page);
                        pte++;
                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 
@@ -798,6 +859,7 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
        int total = agaw_to_level(domain->agaw);
        int level;
        unsigned long tmp;
+       int large_page = 2;
 
        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
@@ -813,7 +875,10 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
                        return;
 
                do {
-                       first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
+                       large_page = level;
+                       first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
+                       if (large_page > level)
+                               level = large_page + 1;
                        if (!pte) {
                                tmp = align_to_level(tmp + 1, level + 1);
                                continue;
@@ -1397,6 +1462,7 @@ static int domain_init(struct dmar_domain *domain, int guest_width)
        else
                domain->iommu_snooping = 0;
 
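+       /* cap_super_page_val() is a bitmask of supported superpage levels;
+          fls() picks the largest one this IOMMU supports. */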
+       domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
        domain->iommu_count = 1;
        domain->nid = iommu->node;
 
@@ -1417,6 +1483,10 @@ static void domain_exit(struct dmar_domain *domain)
        if (!domain)
                return;
 
+       /* Flush any lazy unmaps that may reference this domain */
+       if (!intel_iommu_strict)
+               flush_unmaps_timeout(0);
+
        domain_remove_dev_info(domain);
        /* destroy iovas */
        put_iova_domain(&domain->iovad);
@@ -1648,6 +1718,34 @@ static inline unsigned long aligned_nrpages(unsigned long host_addr,
        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
 }
 
+/* Return largest possible superpage level for a given mapping */
+static inline int hardware_largepage_caps(struct dmar_domain *domain,
+                                         unsigned long iov_pfn,
+                                         unsigned long phy_pfn,
+                                         unsigned long pages)
+{
+       int support, level = 1;
+       unsigned long pfnmerge;
+
+       support = domain->iommu_superpage;
+
+       /* To use a large page, the virtual *and* physical addresses
+          must be aligned to 2MiB/1GiB/etc. Lower bits set in either
+          of them will mean we have to use smaller pages. So just
+          merge them and check both at once. */
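+       /* For example, if both pfns are 2MiB-aligned (low 9 bits clear) and
+          pages >= 512, the loop below returns level 2: one 2MiB superpage. */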
+       pfnmerge = iov_pfn | phy_pfn;
+
+       while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
+               pages >>= VTD_STRIDE_SHIFT;
+               if (!pages)
+                       break;
+               pfnmerge >>= VTD_STRIDE_SHIFT;
+               level++;
+               support--;
+       }
+       return level;
+}
+
 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                            struct scatterlist *sg, unsigned long phys_pfn,
                            unsigned long nr_pages, int prot)
@@ -1656,6 +1754,8 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
        phys_addr_t uninitialized_var(pteval);
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
        unsigned long sg_res;
+       unsigned int largepage_lvl = 0;
+       unsigned long lvl_pages = 0;
 
        BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
 
@@ -1671,7 +1771,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
        }
 
-       while (nr_pages--) {
+       while (nr_pages > 0) {
                uint64_t tmp;
 
                if (!sg_res) {
@@ -1679,11 +1779,21 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
                        sg->dma_length = sg->length;
                        pteval = page_to_phys(sg_page(sg)) | prot;
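+                       /* Track phys_pfn so the superpage alignment check
+                          below sees this sg entry's physical address. */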
+                       phys_pfn = pteval >> VTD_PAGE_SHIFT;
                }
+
                if (!pte) {
-                       first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
+                       largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
+
+                       first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
                        if (!pte)
                                return -ENOMEM;
+                       /* It is a large page */
+                       if (largepage_lvl > 1)
+                               pteval |= DMA_PTE_LARGE_PAGE;
+                       else
+                               pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
+
                }
                /* We don't need lock here, nobody else
                 * touches the iova range
@@ -1699,16 +1809,38 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                        }
                        WARN_ON(1);
                }
+
+               lvl_pages = lvl_to_nr_pages(largepage_lvl);
+
+               BUG_ON(nr_pages < lvl_pages);
+               BUG_ON(sg_res < lvl_pages);
+
+               nr_pages -= lvl_pages;
+               iov_pfn += lvl_pages;
+               phys_pfn += lvl_pages;
+               pteval += lvl_pages * VTD_PAGE_SIZE;
+               sg_res -= lvl_pages;
+
+               /* If the next PTE would be the first in a new page, then we
+                  need to flush the cache on the entries we've just written.
+                  And then we'll need to recalculate 'pte', so clear it and
+                  let it get set again in the if (!pte) block above.
+
+                  If we're done (!nr_pages) we need to flush the cache too.
+
+                  Also if we've been setting superpages, we may need to
+                  recalculate 'pte' and switch back to smaller pages for the
+                  end of the mapping, if the trailing size is not enough to
+                  use another superpage (i.e. sg_res < lvl_pages). */
                pte++;
-               if (!nr_pages || first_pte_in_page(pte)) {
+               if (!nr_pages || first_pte_in_page(pte) ||
+                   (largepage_lvl > 1 && sg_res < lvl_pages)) {
                        domain_flush_cache(domain, first_pte,
                                           (void *)pte - (void *)first_pte);
                        pte = NULL;
                }
-               iov_pfn++;
-               pteval += VTD_PAGE_SIZE;
-               sg_res--;
-               if (!sg_res)
+
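+               /* Move to the next scatterlist entry once this one is
+                  exhausted, but only if there is more left to map. */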
+               if (!sg_res && nr_pages)
                        sg = sg_next(sg);
        }
        return 0;
@@ -2016,7 +2148,7 @@ static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
                return 0;
        return iommu_prepare_identity_map(pdev, rmrr->base_address,
-               rmrr->end_address + 1);
+               rmrr->end_address);
 }
 
 #ifdef CONFIG_DMAR_FLOPPY_WA
@@ -2030,7 +2162,7 @@ static inline void iommu_prepare_isa(void)
                return;
 
        printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
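+       /* the end address passed to iommu_prepare_identity_map() is now
+          inclusive, hence the -1 */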
-       ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
+       ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
 
        if (ret)
                printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
@@ -2106,10 +2238,10 @@ static int identity_mapping(struct pci_dev *pdev)
        if (likely(!iommu_identity_mapping))
                return 0;
 
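+       /* dev.archdata.iommu caches the device's domain info; an
+          identity-mapped device's info->domain is si_domain. */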
+       info = pdev->dev.archdata.iommu;
+       if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
+               return (info->domain == si_domain);
 
-       list_for_each_entry(info, &si_domain->devices, link)
-               if (info->dev == pdev)
-                       return 1;
        return 0;
 }
 
@@ -2187,8 +2319,19 @@ static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
         * Assume that they will -- if they turn out not to be, then we can 
         * take them out of the 1:1 domain later.
         */
-       if (!startup)
-               return pdev->dma_mask > DMA_BIT_MASK(32);
+       if (!startup) {
+               /*
+                * If the device's DMA mask (the smaller of dma_mask and
+                * coherent_dma_mask) cannot reach all of system memory,
+                * it is not a candidate for identity mapping.
+                */
+               u64 dma_mask = pdev->dma_mask;
+
+               if (pdev->dev.coherent_dma_mask &&
+                   pdev->dev.coherent_dma_mask < dma_mask)
+                       dma_mask = pdev->dev.coherent_dma_mask;
+
+               return dma_mask >= dma_get_required_mask(&pdev->dev);
+       }
 
        return 1;
 }
@@ -2203,6 +2346,9 @@ static int __init iommu_prepare_static_identity_mapping(int hw)
                return -EFAULT;
 
        for_each_pci_dev(pdev) {
+               /* Skip Host/PCI Bridge devices */
+               if (IS_BRIDGE_HOST_DEVICE(pdev))
+                       continue;
                if (iommu_should_identity_map(pdev, 1)) {
                        printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
                               hw ? "hardware" : "software", pci_name(pdev));
@@ -2218,7 +2364,7 @@ static int __init iommu_prepare_static_identity_mapping(int hw)
        return 0;
 }
 
-static int __init init_dmars(int force_on)
+static int __init init_dmars(void)
 {
        struct dmar_drhd_unit *drhd;
        struct dmar_rmrr_unit *rmrr;
@@ -2592,8 +2738,7 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
        iommu = domain_get_iommu(domain);
        size = aligned_nrpages(paddr, size);
 
-       iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
-                               pdev->dma_mask);
+       iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
        if (!iova)
                goto error;
 
@@ -3118,7 +3263,17 @@ static int init_iommu_hw(void)
                if (iommu->qi)
                        dmar_reenable_qi(iommu);
 
-       for_each_active_iommu(iommu, drhd) {
+       for_each_iommu(iommu, drhd) {
+               if (drhd->ignored) {
+                       /*
+                        * we always have to disable PMRs or DMA may fail on
+                        * this device
+                        */
+                       if (force_on)
+                               iommu_disable_protect_mem_regions(iommu);
+                       continue;
+               }
+
                iommu_flush_write_buffer(iommu);
 
                iommu_set_root_entry(iommu);
@@ -3127,7 +3282,8 @@ static int init_iommu_hw(void)
                                           DMA_CCMD_GLOBAL_INVL);
                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
                                         DMA_TLB_GLOBAL_FLUSH);
-               iommu_enable_translation(iommu);
+               if (iommu_enable_translation(iommu))
+                       return 1;
                iommu_disable_protect_mem_regions(iommu);
        }
 
@@ -3194,7 +3350,10 @@ static void iommu_resume(void)
        unsigned long flag;
 
        if (init_iommu_hw()) {
-               WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
+               if (force_on)
+                       panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
+               else
+                       WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
                return;
        }
 
@@ -3271,7 +3430,6 @@ static struct notifier_block device_nb = {
 int __init intel_iommu_init(void)
 {
        int ret = 0;
-       int force_on = 0;
 
        /* VT-d is required for a TXT/tboot launch, so enforce that */
        force_on = tboot_force_iommu();
@@ -3309,7 +3467,7 @@ int __init intel_iommu_init(void)
 
        init_no_remapping_devices();
 
-       ret = init_dmars(force_on);
+       ret = init_dmars();
        if (ret) {
                if (force_on)
                        panic("tboot: Failed to initialize DMARs\n");
@@ -3380,8 +3538,8 @@ static void domain_remove_one_dev_info(struct dmar_domain *domain,
        spin_lock_irqsave(&device_domain_lock, flags);
        list_for_each_safe(entry, tmp, &domain->devices) {
                info = list_entry(entry, struct device_domain_info, link);
-               /* No need to compare PCI domain; it has to be the same */
-               if (info->bus == pdev->bus->number &&
+               if (info->segment == pci_domain_nr(pdev->bus) &&
+                   info->bus == pdev->bus->number &&
                    info->devfn == pdev->devfn) {
                        list_del(&info->link);
                        list_del(&info->global);
@@ -3419,10 +3577,13 @@ static void domain_remove_one_dev_info(struct dmar_domain *domain,
                domain_update_iommu_cap(domain);
                spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
 
-               spin_lock_irqsave(&iommu->lock, tmp_flags);
-               clear_bit(domain->id, iommu->domain_ids);
-               iommu->domains[domain->id] = NULL;
-               spin_unlock_irqrestore(&iommu->lock, tmp_flags);
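+               /* VM and static identity domains are shared across IOMMUs;
+                  their domain ids stay allocated until the domain itself
+                  goes away, not until a single device is removed. */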
+               if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
+                   !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
+                       spin_lock_irqsave(&iommu->lock, tmp_flags);
+                       clear_bit(domain->id, iommu->domain_ids);
+                       iommu->domains[domain->id] = NULL;
+                       spin_unlock_irqrestore(&iommu->lock, tmp_flags);
+               }
        }
 
        spin_unlock_irqrestore(&device_domain_lock, flags);
@@ -3505,6 +3666,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width)
        domain->iommu_count = 0;
        domain->iommu_coherency = 0;
        domain->iommu_snooping = 0;
+       domain->iommu_superpage = 0;
        domain->max_addr = 0;
        domain->nid = -1;
 
@@ -3720,7 +3882,7 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
        struct dma_pte *pte;
        u64 phys = 0;
 
-       pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
+       pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
        if (pte)
                phys = dma_pte_addr(pte);