PCI/e1000e: Add and use pci_disable_link_state_locked()
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index e3e84ca..505c1c7 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -36,7 +36,7 @@
 #include <linux/iova.h>
 #include <linux/iommu.h>
 #include <linux/intel-iommu.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
 #include <linux/tboot.h>
 #include <linux/dmi.h>
 #include <asm/cacheflush.h>
 #define DMA_32BIT_PFN          IOVA_PFN(DMA_BIT_MASK(32))
 #define DMA_64BIT_PFN          IOVA_PFN(DMA_BIT_MASK(64))
 
+/* page table handling */
+#define LEVEL_STRIDE           (9)
+#define LEVEL_MASK             (((u64)1 << LEVEL_STRIDE) - 1)
+
+static inline int agaw_to_level(int agaw)
+{
+       return agaw + 2;
+}
+
+static inline int agaw_to_width(int agaw)
+{
+       return 30 + agaw * LEVEL_STRIDE;
+}
+
+static inline int width_to_agaw(int width)
+{
+       return (width - 30) / LEVEL_STRIDE;
+}
+
+static inline unsigned int level_to_offset_bits(int level)
+{
+       return (level - 1) * LEVEL_STRIDE;
+}
+
+static inline int pfn_level_offset(unsigned long pfn, int level)
+{
+       return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
+}
+
+static inline unsigned long level_mask(int level)
+{
+       return -1UL << level_to_offset_bits(level);
+}
+
+static inline unsigned long level_size(int level)
+{
+       return 1UL << level_to_offset_bits(level);
+}
+
+static inline unsigned long align_to_level(unsigned long pfn, int level)
+{
+       return (pfn + level_size(level) - 1) & level_mask(level);
+}
 
 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
    are never going to work. */
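
The helpers moved up here encode the VT-d page-table geometry: each level resolves LEVEL_STRIDE (9) bits of the DMA pfn, and an adjusted guest address width (agaw) of N means 30 + 9*N address bits handled by N + 2 table levels. A minimal user-space sketch of that arithmetic, with copies of the helpers and an arbitrary example pfn (nothing below is driver code):

        #include <stdio.h>

        #define LEVEL_STRIDE 9
        #define LEVEL_MASK   ((1ULL << LEVEL_STRIDE) - 1)

        static int agaw_to_level(int agaw)  { return agaw + 2; }
        static int agaw_to_width(int agaw)  { return 30 + agaw * LEVEL_STRIDE; }
        static unsigned int level_to_offset_bits(int level)
        {
                return (level - 1) * LEVEL_STRIDE;
        }
        static int pfn_level_offset(unsigned long pfn, int level)
        {
                return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
        }

        int main(void)
        {
                int agaw = 2;                     /* 48-bit width, 4 levels */
                unsigned long pfn = 0x12345678;   /* arbitrary DMA pfn */
                int level;

                printf("agaw %d: %d bits, %d levels\n",
                       agaw, agaw_to_width(agaw), agaw_to_level(agaw));
                for (level = agaw_to_level(agaw); level > 0; level--)
                        printf("  level %d index %d\n",
                               level, pfn_level_offset(pfn, level));
                return 0;
        }

This is the per-level index computation that pfn_to_dma_pte() performs, except that the driver also follows the physical pointer stored in each entry down to the next table.
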
@@ -236,7 +279,7 @@ static inline u64 dma_pte_addr(struct dma_pte *pte)
        return pte->val & VTD_PAGE_MASK;
 #else
        /* Must have a full atomic 64-bit read */
-       return  __cmpxchg64(pte, 0ULL, 0ULL) & VTD_PAGE_MASK;
+       return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 #endif
 }
 
@@ -305,7 +348,7 @@ struct device_domain_info {
        int segment;            /* PCI domain */
        u8 bus;                 /* PCI bus number */
        u8 devfn;               /* PCI devfn number */
-       struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
+       struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
        struct intel_iommu *iommu; /* IOMMU used by this device */
        struct dmar_domain *domain; /* pointer to domain */
 };
@@ -340,7 +383,7 @@ int dmar_disabled = 0;
 int dmar_disabled = 1;
 #endif /*CONFIG_DMAR_DEFAULT_ON*/
 
-static int __initdata dmar_map_gfx = 1;
+static int dmar_map_gfx = 1;
 static int dmar_forcedac;
 static int intel_iommu_strict;
 
@@ -387,33 +430,14 @@ static struct kmem_cache *iommu_domain_cache;
 static struct kmem_cache *iommu_devinfo_cache;
 static struct kmem_cache *iommu_iova_cache;
 
-static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
-{
-       unsigned int flags;
-       void *vaddr;
-
-       /* trying to avoid low memory issues */
-       flags = current->flags & PF_MEMALLOC;
-       current->flags |= PF_MEMALLOC;
-       vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
-       current->flags &= (~PF_MEMALLOC | flags);
-       return vaddr;
-}
-
-
 static inline void *alloc_pgtable_page(int node)
 {
-       unsigned int flags;
        struct page *page;
        void *vaddr = NULL;
 
-       /* trying to avoid low memory issues */
-       flags = current->flags & PF_MEMALLOC;
-       current->flags |= PF_MEMALLOC;
        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
        if (page)
                vaddr = page_address(page);
-       current->flags &= (~PF_MEMALLOC | flags);
        return vaddr;
 }
 
@@ -424,7 +448,7 @@ static inline void free_pgtable_page(void *vaddr)
 
 static inline void *alloc_domain_mem(void)
 {
-       return iommu_kmem_cache_alloc(iommu_domain_cache);
+       return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 }
 
 static void free_domain_mem(void *vaddr)
@@ -434,7 +458,7 @@ static void free_domain_mem(void *vaddr)
 
 static inline void * alloc_devinfo_mem(void)
 {
-       return iommu_kmem_cache_alloc(iommu_devinfo_cache);
+       return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 }
 
 static inline void free_devinfo_mem(void *vaddr)
@@ -444,7 +468,7 @@ static inline void free_devinfo_mem(void *vaddr)
 
 struct iova *alloc_iova_mem(void)
 {
-       return iommu_kmem_cache_alloc(iommu_iova_cache);
+       return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
 }
 
 void free_iova_mem(struct iova *iova)
@@ -453,8 +477,6 @@ void free_iova_mem(struct iova *iova)
 }
 
 
-static inline int width_to_agaw(int width);
-
 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 {
        unsigned long sagaw;
@@ -510,13 +532,11 @@ static void domain_update_iommu_coherency(struct dmar_domain *domain)
 
        domain->iommu_coherency = 1;
 
-       i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
-       for (; i < g_num_of_iommus; ) {
+       for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
                if (!ecap_coherent(g_iommus[i]->ecap)) {
                        domain->iommu_coherency = 0;
                        break;
                }
-               i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
        }
 }
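
This conversion (and the identical ones in later hunks) is purely mechanical: for_each_set_bit(), as defined in <linux/bitops.h>, expands to the same find_first_bit()/find_next_bit() search the old code spelled out, so the loop bodies are unchanged. For reference, the macro is essentially:

        #define for_each_set_bit(bit, addr, size)                       \
                for ((bit) = find_first_bit((addr), (size));            \
                     (bit) < (size);                                    \
                     (bit) = find_next_bit((addr), (size), (bit) + 1))
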
 
@@ -526,13 +546,11 @@ static void domain_update_iommu_snooping(struct dmar_domain *domain)
 
        domain->iommu_snooping = 1;
 
-       i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
-       for (; i < g_num_of_iommus; ) {
+       for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
                if (!ecap_sc_support(g_iommus[i]->ecap)) {
                        domain->iommu_snooping = 0;
                        break;
                }
-               i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
        }
 }
 
@@ -669,51 +687,6 @@ out:
        spin_unlock_irqrestore(&iommu->lock, flags);
 }
 
-/* page table handling */
-#define LEVEL_STRIDE           (9)
-#define LEVEL_MASK             (((u64)1 << LEVEL_STRIDE) - 1)
-
-static inline int agaw_to_level(int agaw)
-{
-       return agaw + 2;
-}
-
-static inline int agaw_to_width(int agaw)
-{
-       return 30 + agaw * LEVEL_STRIDE;
-
-}
-
-static inline int width_to_agaw(int width)
-{
-       return (width - 30) / LEVEL_STRIDE;
-}
-
-static inline unsigned int level_to_offset_bits(int level)
-{
-       return (level - 1) * LEVEL_STRIDE;
-}
-
-static inline int pfn_level_offset(unsigned long pfn, int level)
-{
-       return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
-}
-
-static inline unsigned long level_mask(int level)
-{
-       return -1UL << level_to_offset_bits(level);
-}
-
-static inline unsigned long level_size(int level)
-{
-       return 1UL << level_to_offset_bits(level);
-}
-
-static inline unsigned long align_to_level(unsigned long pfn, int level)
-{
-       return (pfn + level_size(level) - 1) & level_mask(level);
-}
-
 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
                                      unsigned long pfn)
 {
@@ -1087,7 +1060,7 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
 }
 
 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
-                                 unsigned long pfn, unsigned int pages)
+                                 unsigned long pfn, unsigned int pages, int map)
 {
        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
@@ -1108,10 +1081,10 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
                                                DMA_TLB_PSI_FLUSH);
 
        /*
-        * In caching mode, domain ID 0 is reserved for non-present to present
-        * mapping flush. Device IOTLB doesn't need to be flushed in this case.
+        * In caching mode, changes of pages from non-present to present require
+        * flush. However, device IOTLB doesn't need to be flushed in this case.
         */
-       if (!cap_caching_mode(iommu->cap) || did)
+       if (!cap_caching_mode(iommu->cap) || !map)
                iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
 }
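
Two things are worth noting in iommu_flush_iotlb_psi(). First, the page count is rounded up to a power of two and expressed as a mask (the address-mask field of the PSI descriptor), so a flush may cover more pages than requested but never fewer. A quick sketch of that rounding, mimicking ilog2(__roundup_pow_of_two(pages)) in user space:

        #include <stdio.h>

        /* illustrative only: same result as ilog2(__roundup_pow_of_two(pages)) */
        static unsigned int psi_mask(unsigned int pages)
        {
                unsigned int mask = 0;

                while ((1u << mask) < pages)
                        mask++;
                return mask;
        }

        int main(void)
        {
                unsigned int n;

                for (n = 1; n <= 9; n++)
                        printf("pages=%u -> mask=%u (covers %u pages)\n",
                               n, psi_mask(n), 1u << psi_mask(n));
                return 0;
        }

Second, the new 'map' argument replaces the old 'did != 0' heuristic: a flush for a not-present-to-present transition (map = 1) cannot have stale device-IOTLB entries to kill, while a flush for an unmap (map = 0) must also invalidate the device IOTLB.
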
 
@@ -1173,7 +1146,8 @@ static int iommu_init_domains(struct intel_iommu *iommu)
        unsigned long nlongs;
 
        ndomains = cap_ndoms(iommu->cap);
-       pr_debug("Number of Domains supportd <%ld>\n", ndomains);
+       pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
+                       ndomains);
        nlongs = BITS_TO_LONGS(ndomains);
 
        spin_lock_init(&iommu->lock);
@@ -1213,8 +1187,7 @@ void free_dmar_iommu(struct intel_iommu *iommu)
        unsigned long flags;
 
        if ((iommu->domains) && (iommu->domain_ids)) {
-               i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
-               for (; i < cap_ndoms(iommu->cap); ) {
+               for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
                        domain = iommu->domains[i];
                        clear_bit(i, iommu->domain_ids);
 
@@ -1226,9 +1199,6 @@ void free_dmar_iommu(struct intel_iommu *iommu)
                                        domain_exit(domain);
                        }
                        spin_unlock_irqrestore(&domain->iommu_lock, flags);
-
-                       i = find_next_bit(iommu->domain_ids,
-                               cap_ndoms(iommu->cap), i+1);
                }
        }
 
@@ -1236,7 +1206,7 @@ void free_dmar_iommu(struct intel_iommu *iommu)
                iommu_disable_translation(iommu);
 
        if (iommu->irq) {
-               set_irq_data(iommu->irq, NULL);
+               irq_set_handler_data(iommu->irq, NULL);
                /* This will mask the irq */
                free_irq(iommu->irq, iommu);
                destroy_irq(iommu->irq);
@@ -1311,14 +1281,11 @@ static void iommu_detach_domain(struct dmar_domain *domain,
 
        spin_lock_irqsave(&iommu->lock, flags);
        ndomains = cap_ndoms(iommu->cap);
-       num = find_first_bit(iommu->domain_ids, ndomains);
-       for (; num < ndomains; ) {
+       for_each_set_bit(num, iommu->domain_ids, ndomains) {
                if (iommu->domains[num] == domain) {
                        found = 1;
                        break;
                }
-               num = find_next_bit(iommu->domain_ids,
-                                   cap_ndoms(iommu->cap), num+1);
        }
 
        if (found) {
@@ -1504,15 +1471,12 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
 
                /* find an available domain id for this device in iommu */
                ndomains = cap_ndoms(iommu->cap);
-               num = find_first_bit(iommu->domain_ids, ndomains);
-               for (; num < ndomains; ) {
+               for_each_set_bit(num, iommu->domain_ids, ndomains) {
                        if (iommu->domains[num] == domain) {
                                id = num;
                                found = 1;
                                break;
                        }
-                       num = find_next_bit(iommu->domain_ids,
-                                           cap_ndoms(iommu->cap), num+1);
                }
 
                if (found == 0) {
@@ -1530,12 +1494,15 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
 
                /* Skip top levels of page tables for
                 * iommu which has less agaw than default.
+                * Unnecessary for PT mode.
                 */
-               for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
-                       pgd = phys_to_virt(dma_pte_addr(pgd));
-                       if (!dma_pte_present(pgd)) {
-                               spin_unlock_irqrestore(&iommu->lock, flags);
-                               return -ENOMEM;
+               if (translation != CONTEXT_TT_PASS_THROUGH) {
+                       for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
+                               pgd = phys_to_virt(dma_pte_addr(pgd));
+                               if (!dma_pte_present(pgd)) {
+                                       spin_unlock_irqrestore(&iommu->lock, flags);
+                                       return -ENOMEM;
+                               }
                        }
                }
        }
@@ -1574,7 +1541,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
                                           (((u16)bus) << 8) | devfn,
                                           DMA_CCMD_MASK_NOBIT,
                                           DMA_CCMD_DEVICE_INVL);
-               iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH);
+               iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
        } else {
                iommu_flush_write_buffer(iommu);
        }
@@ -1620,7 +1587,7 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
                        return ret;
                parent = parent->bus->self;
        }
-       if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
+       if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
                return domain_context_mapping_one(domain,
                                        pci_domain_nr(tmp->subordinate),
                                        tmp->subordinate->number, 0,
@@ -1660,7 +1627,7 @@ static int domain_context_mapped(struct pci_dev *pdev)
                        return ret;
                parent = parent->bus->self;
        }
-       if (tmp->is_pcie)
+       if (pci_is_pcie(tmp))
                return device_context_mapped(iommu, tmp->subordinate->number,
                                             0);
        else
@@ -1830,7 +1797,7 @@ static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
 
        dev_tmp = pci_find_upstream_pcie_bridge(pdev);
        if (dev_tmp) {
-               if (dev_tmp->is_pcie) {
+               if (pci_is_pcie(dev_tmp)) {
                        bus = dev_tmp->subordinate->number;
                        devfn = 0;
                } else {
@@ -1903,14 +1870,15 @@ static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
                        }
                }
                if (found) {
+                       spin_unlock_irqrestore(&device_domain_lock, flags);
                        free_devinfo_mem(info);
                        domain_exit(domain);
                        domain = found;
                } else {
                        list_add(&info->link, &domain->devices);
                        list_add(&info->global, &device_domain_list);
+                       spin_unlock_irqrestore(&device_domain_lock, flags);
                }
-               spin_unlock_irqrestore(&device_domain_lock, flags);
        }
 
 found_domain:
@@ -2201,7 +2169,7 @@ static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
         * the 1:1 domain, just in _case_ one of their siblings turns out
         * not to be able to map all of memory.
         */
-       if (!pdev->is_pcie) {
+       if (!pci_is_pcie(pdev)) {
                if (!pci_is_root_bus(pdev->bus))
                        return 0;
                if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
@@ -2297,7 +2265,7 @@ int __init init_dmars(void)
                /*
                 * TBD:
                 * we could share the same root & context tables
-                * amoung all IOMMU's. Need to Split it later.
+                * among all IOMMU's. Need to Split it later.
                 */
                ret = iommu_alloc_root_entry(iommu);
                if (ret) {
@@ -2349,14 +2317,16 @@ int __init init_dmars(void)
                         */
                        iommu->flush.flush_context = __iommu_flush_context;
                        iommu->flush.flush_iotlb = __iommu_flush_iotlb;
-                       printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
+                       printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
                               "invalidation\n",
+                               iommu->seq_id,
                               (unsigned long long)drhd->reg_base_addr);
                } else {
                        iommu->flush.flush_context = qi_flush_context;
                        iommu->flush.flush_iotlb = qi_flush_iotlb;
-                       printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
+                       printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
                               "invalidation\n",
+                               iommu->seq_id,
                               (unsigned long long)drhd->reg_base_addr);
                }
        }
@@ -2637,7 +2607,7 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
 
        /* it's a non-present to present mapping. Only flush if caching mode */
        if (cap_caching_mode(iommu->cap))
-               iommu_flush_iotlb_psi(iommu, 0, mm_to_dma_pfn(iova->pfn_lo), size);
+               iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
        else
                iommu_flush_write_buffer(iommu);
 
@@ -2677,15 +2647,24 @@ static void flush_unmaps(void)
                if (!deferred_flush[i].next)
                        continue;
 
-               iommu->flush.flush_iotlb(iommu, 0, 0, 0,
+               /* In caching mode, global flushes turn emulation expensive */
+               if (!cap_caching_mode(iommu->cap))
+                       iommu->flush.flush_iotlb(iommu, 0, 0, 0,
                                         DMA_TLB_GLOBAL_FLUSH);
                for (j = 0; j < deferred_flush[i].next; j++) {
                        unsigned long mask;
                        struct iova *iova = deferred_flush[i].iova[j];
-
-                       mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
-                       iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
-                                       (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
+                       struct dmar_domain *domain = deferred_flush[i].domain[j];
+
+                       /* On real hardware multiple invalidations are expensive */
+                       if (cap_caching_mode(iommu->cap))
+                               iommu_flush_iotlb_psi(iommu, domain->id,
+                               iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
+                       else {
+                               mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
+                               iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
+                                               (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
+                       }
                        __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
                }
                deferred_flush[i].next = 0;
@@ -2766,7 +2745,7 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
 
        if (intel_iommu_strict) {
                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
-                                     last_pfn - start_pfn + 1);
+                                     last_pfn - start_pfn + 1, 0);
                /* free iova */
                __free_iova(&domain->iovad, iova);
        } else {
@@ -2856,7 +2835,7 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
 
        if (intel_iommu_strict) {
                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
-                                     last_pfn - start_pfn + 1);
+                                     last_pfn - start_pfn + 1, 0);
                /* free iova */
                __free_iova(&domain->iovad, iova);
        } else {
@@ -2890,7 +2869,6 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne
        struct dmar_domain *domain;
        size_t size = 0;
        int prot = 0;
-       size_t offset_pfn = 0;
        struct iova *iova = NULL;
        int ret;
        struct scatterlist *sg;
@@ -2944,7 +2922,7 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne
 
        /* it's a non-present to present mapping. Only flush if caching mode */
        if (cap_caching_mode(iommu->cap))
-               iommu_flush_iotlb_psi(iommu, 0, start_vpfn, offset_pfn);
+               iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
        else
                iommu_flush_write_buffer(iommu);
 
@@ -3048,6 +3026,34 @@ static void __init iommu_exit_mempool(void)
 
 }
 
+static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
+{
+       struct dmar_drhd_unit *drhd;
+       u32 vtbar;
+       int rc;
+
+       /* We know that this device on this chipset has its own IOMMU.
+        * If we find it under a different IOMMU, then the BIOS is lying
+        * to us. Hope that the IOMMU for this device is actually
+        * disabled, and it needs no translation...
+        */
+       rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
+       if (rc) {
+               /* "can't" happen */
+               dev_info(&pdev->dev, "failed to run vt-d quirk\n");
+               return;
+       }
+       vtbar &= 0xffff0000;
+
+       /* we know that this IOMMU should be at offset 0xa000 from vtbar */
+       drhd = dmar_find_matched_drhd_unit(pdev);
+       if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
+                           TAINT_FIRMWARE_WORKAROUND,
+                           "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
+               pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
+}
+DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
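
The quirk reads the QuickData (ioatdma) device's VT_BAR from device 00.0 on the same bus, config offset 0xb0, and checks that the DRHD unit the DMAR tables assign to this device really sits 0xa000 above that base. A made-up example of the expected relationship (addresses are illustrative, not from any real board):

        vtbar (raw)          = 0xfed90001
        vtbar &= 0xffff0000  -> 0xfed90000
        expected DRHD base   = vtbar + 0xa000 = 0xfed9a000

If dmar_find_matched_drhd_unit() returns a unit at any other address, the BIOS mis-assigned the device, and it is marked with DUMMY_DEVICE_DOMAIN_INFO so the IOMMU code leaves it untranslated.
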
+
 static void __init init_no_remapping_devices(void)
 {
        struct dmar_drhd_unit *drhd;
@@ -3129,7 +3135,7 @@ static void iommu_flush_all(void)
        }
 }
 
-static int iommu_suspend(struct sys_device *dev, pm_message_t state)
+static int iommu_suspend(void)
 {
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu = NULL;
@@ -3169,7 +3175,7 @@ nomem:
        return -ENOMEM;
 }
 
-static int iommu_resume(struct sys_device *dev)
+static void iommu_resume(void)
 {
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu = NULL;
@@ -3177,7 +3183,7 @@ static int iommu_resume(struct sys_device *dev)
 
        if (init_iommu_hw()) {
                WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
-               return -EIO;
+               return;
        }
 
        for_each_active_iommu(iommu, drhd) {
@@ -3198,40 +3204,20 @@ static int iommu_resume(struct sys_device *dev)
 
        for_each_active_iommu(iommu, drhd)
                kfree(iommu->iommu_state);
-
-       return 0;
 }
 
-static struct sysdev_class iommu_sysclass = {
-       .name           = "iommu",
+static struct syscore_ops iommu_syscore_ops = {
        .resume         = iommu_resume,
        .suspend        = iommu_suspend,
 };
 
-static struct sys_device device_iommu = {
-       .cls    = &iommu_sysclass,
-};
-
-static int __init init_iommu_sysfs(void)
+static void __init init_iommu_pm_ops(void)
 {
-       int error;
-
-       error = sysdev_class_register(&iommu_sysclass);
-       if (error)
-               return error;
-
-       error = sysdev_register(&device_iommu);
-       if (error)
-               sysdev_class_unregister(&iommu_sysclass);
-
-       return error;
+       register_syscore_ops(&iommu_syscore_ops);
 }
 
 #else
-static int __init init_iommu_sysfs(void)
-{
-       return 0;
-}
+static inline void init_iommu_pm_ops(void) {}
 #endif /* CONFIG_PM */
 
 /*
@@ -3314,7 +3300,7 @@ int __init intel_iommu_init(void)
 #endif
        dma_ops = &intel_dma_ops;
 
-       init_iommu_sysfs();
+       init_iommu_pm_ops();
 
        register_iommu(&intel_iommu_ops);
 
@@ -3341,7 +3327,7 @@ static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
                                         parent->devfn);
                        parent = parent->bus->self;
                }
-               if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
+               if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
                        iommu_detach_dev(iommu,
                                tmp->subordinate->number, 0);
                else /* this is a legacy PCI bridge */
@@ -3452,22 +3438,6 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
 /* domain id for virtual machine, it won't be set in context */
 static unsigned long vm_domid;
 
-static int vm_domain_min_agaw(struct dmar_domain *domain)
-{
-       int i;
-       int min_agaw = domain->agaw;
-
-       i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
-       for (; i < g_num_of_iommus; ) {
-               if (min_agaw > g_iommus[i]->agaw)
-                       min_agaw = g_iommus[i]->agaw;
-
-               i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
-       }
-
-       return min_agaw;
-}
-
 static struct dmar_domain *iommu_alloc_vm_domain(void)
 {
        struct dmar_domain *domain;
@@ -3528,8 +3498,7 @@ static void iommu_free_vm_domain(struct dmar_domain *domain)
                iommu = drhd->iommu;
 
                ndomains = cap_ndoms(iommu->cap);
-               i = find_first_bit(iommu->domain_ids, ndomains);
-               for (; i < ndomains; ) {
+               for_each_set_bit(i, iommu->domain_ids, ndomains) {
                        if (iommu->domains[i] == domain) {
                                spin_lock_irqsave(&iommu->lock, flags);
                                clear_bit(i, iommu->domain_ids);
@@ -3537,7 +3506,6 @@ static void iommu_free_vm_domain(struct dmar_domain *domain)
                                spin_unlock_irqrestore(&iommu->lock, flags);
                                break;
                        }
-                       i = find_next_bit(iommu->domain_ids, ndomains, i+1);
                }
        }
 }
@@ -3598,7 +3566,6 @@ static int intel_iommu_attach_device(struct iommu_domain *domain,
        struct pci_dev *pdev = to_pci_dev(dev);
        struct intel_iommu *iommu;
        int addr_width;
-       u64 end;
 
        /* normally pdev is not mapped */
        if (unlikely(domain_context_mapped(pdev))) {
@@ -3621,14 +3588,31 @@ static int intel_iommu_attach_device(struct iommu_domain *domain,
 
        /* check if this iommu agaw is sufficient for max mapped address */
        addr_width = agaw_to_width(iommu->agaw);
-       end = DOMAIN_MAX_ADDR(addr_width);
-       end = end & VTD_PAGE_MASK;
-       if (end < dmar_domain->max_addr) {
-               printk(KERN_ERR "%s: iommu agaw (%d) is not "
+       if (addr_width > cap_mgaw(iommu->cap))
+               addr_width = cap_mgaw(iommu->cap);
+
+       if (dmar_domain->max_addr > (1LL << addr_width)) {
+               printk(KERN_ERR "%s: iommu width (%d) is not "
                       "sufficient for the mapped address (%llx)\n",
-                      __func__, iommu->agaw, dmar_domain->max_addr);
+                      __func__, addr_width, dmar_domain->max_addr);
                return -EFAULT;
        }
+       dmar_domain->gaw = addr_width;
+
+       /*
+        * Knock out extra levels of page tables if necessary
+        */
+       while (iommu->agaw < dmar_domain->agaw) {
+               struct dma_pte *pte;
+
+               pte = dmar_domain->pgd;
+               if (dma_pte_present(pte)) {
+                       free_pgtable_page(dmar_domain->pgd);
+                       dmar_domain->pgd = (struct dma_pte *)
+                               phys_to_virt(dma_pte_addr(pte));
+               }
+               dmar_domain->agaw--;
+       }
 
        return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
 }
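
When a virtual-machine domain built with a wider address width is attached to a less capable IOMMU, the new loop simply discards the unused top page-table levels: the narrower width already caps max_addr (checked just above), so only slot 0 of each surplus top level can be populated, and that entry's target becomes the new pgd while domain->agaw drops by one per iteration. Hypothetical numbers, using the agaw helpers near the top of the file:

        domain->agaw = 3 : agaw_to_width(3) = 57 bits, agaw_to_level(3) = 5 levels
        iommu->agaw  = 2 : agaw_to_width(2) = 48 bits, agaw_to_level(2) = 4 levels

        one pass through the loop frees the 5-level root and promotes the
        table referenced by its slot 0 to be the new 4-level dmar_domain->pgd
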
@@ -3642,14 +3626,14 @@ static void intel_iommu_detach_device(struct iommu_domain *domain,
        domain_remove_one_dev_info(dmar_domain, pdev);
 }
 
-static int intel_iommu_map_range(struct iommu_domain *domain,
-                                unsigned long iova, phys_addr_t hpa,
-                                size_t size, int iommu_prot)
+static int intel_iommu_map(struct iommu_domain *domain,
+                          unsigned long iova, phys_addr_t hpa,
+                          int gfp_order, int iommu_prot)
 {
        struct dmar_domain *dmar_domain = domain->priv;
        u64 max_addr;
-       int addr_width;
        int prot = 0;
+       size_t size;
        int ret;
 
        if (iommu_prot & IOMMU_READ)
@@ -3659,20 +3643,17 @@ static int intel_iommu_map_range(struct iommu_domain *domain,
        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
                prot |= DMA_PTE_SNP;
 
+       size     = PAGE_SIZE << gfp_order;
        max_addr = iova + size;
        if (dmar_domain->max_addr < max_addr) {
-               int min_agaw;
                u64 end;
 
                /* check if minimum agaw is sufficient for mapped address */
-               min_agaw = vm_domain_min_agaw(dmar_domain);
-               addr_width = agaw_to_width(min_agaw);
-               end = DOMAIN_MAX_ADDR(addr_width);
-               end = end & VTD_PAGE_MASK;
+               end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
                if (end < max_addr) {
-                       printk(KERN_ERR "%s: iommu agaw (%d) is not "
+                       printk(KERN_ERR "%s: iommu width (%d) is not "
                               "sufficient for the mapped address (%llx)\n",
-                              __func__, min_agaw, max_addr);
+                              __func__, dmar_domain->gaw, max_addr);
                        return -EFAULT;
                }
                dmar_domain->max_addr = max_addr;
@@ -3685,19 +3666,19 @@ static int intel_iommu_map_range(struct iommu_domain *domain,
        return ret;
 }
 
-static void intel_iommu_unmap_range(struct iommu_domain *domain,
-                                   unsigned long iova, size_t size)
+static int intel_iommu_unmap(struct iommu_domain *domain,
+                            unsigned long iova, int gfp_order)
 {
        struct dmar_domain *dmar_domain = domain->priv;
-
-       if (!size)
-               return;
+       size_t size = PAGE_SIZE << gfp_order;
 
        dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
                            (iova + size - 1) >> VTD_PAGE_SHIFT);
 
        if (dmar_domain->max_addr == iova + size)
                dmar_domain->max_addr = iova;
+
+       return gfp_order;
 }
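
The ->map/->unmap callbacks now take a page order rather than a byte count, matching the generic IOMMU API of this period, where iommu_map()/iommu_unmap() pass a gfp_order and unmap returns the order actually released. A hedged sketch of a caller; 'domain' and 'phys' are assumed to exist already, and the flags come from <linux/iommu.h>:

        /* map 16 KiB (order 2 with 4 KiB pages) at IOVA 0x100000 */
        int order = get_order(16 * 1024);
        int ret;

        ret = iommu_map(domain, 0x100000, phys, order, IOMMU_READ | IOMMU_WRITE);
        if (!ret)
                order = iommu_unmap(domain, 0x100000, order);
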
 
 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
@@ -3721,6 +3702,8 @@ static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
 
        if (cap == IOMMU_CAP_CACHE_COHERENCY)
                return dmar_domain->iommu_snooping;
+       if (cap == IOMMU_CAP_INTR_REMAP)
+               return intr_remapping_enabled;
 
        return 0;
 }
@@ -3730,8 +3713,8 @@ static struct iommu_ops intel_iommu_ops = {
        .domain_destroy = intel_iommu_domain_destroy,
        .attach_dev     = intel_iommu_attach_device,
        .detach_dev     = intel_iommu_detach_device,
-       .map            = intel_iommu_map_range,
-       .unmap          = intel_iommu_unmap_range,
+       .map            = intel_iommu_map,
+       .unmap          = intel_iommu_unmap,
        .iova_to_phys   = intel_iommu_iova_to_phys,
        .domain_has_cap = intel_iommu_domain_has_cap,
 };
@@ -3744,10 +3727,43 @@ static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
         */
        printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
        rwbf_quirk = 1;
+
+       /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
+       if (dev->revision == 0x07) {
+               printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
+               dmar_map_gfx = 0;
+       }
 }
 
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
 
+#define GGC 0x52
+#define GGC_MEMORY_SIZE_MASK   (0xf << 8)
+#define GGC_MEMORY_SIZE_NONE   (0x0 << 8)
+#define GGC_MEMORY_SIZE_1M     (0x1 << 8)
+#define GGC_MEMORY_SIZE_2M     (0x3 << 8)
+#define GGC_MEMORY_VT_ENABLED  (0x8 << 8)
+#define GGC_MEMORY_SIZE_2M_VT  (0x9 << 8)
+#define GGC_MEMORY_SIZE_3M_VT  (0xa << 8)
+#define GGC_MEMORY_SIZE_4M_VT  (0xb << 8)
+
+static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
+{
+       unsigned short ggc;
+
+       if (pci_read_config_word(dev, GGC, &ggc))
+               return;
+
+       if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
+               printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
+               dmar_map_gfx = 0;
+       }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
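
GGC is the graphics-control word in the processor host bridge's config space (offset 0x52, per the defines above). The quirk only cares about the VT-enable bit: if the BIOS never set it, there is no VT-d-capable shadow GTT for translated graphics DMA, so the IOMMU is left disabled for graphics. A small sketch of the bit test, with a made-up register value:

        unsigned short ggc = 0x0300;   /* size field = GGC_MEMORY_SIZE_2M, VT bit clear */

        if (!(ggc & GGC_MEMORY_VT_ENABLED))    /* 0x8 << 8, i.e. bit 11 */
                dmar_map_gfx = 0;              /* leave graphics untranslated */
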
+
 /* On Tylersburg chipsets, some BIOSes have been known to enable the
    ISOCH DMAR unit for the Azalia sound device, but not give it any
    TLB entries, which causes it to deadlock. Check for that.  We do