iommu/amd: Check for the right TLP prefix bit
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 7c3a95e..08f3eec 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -17,6 +17,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
+#include <linux/ratelimit.h>
 #include <linux/pci.h>
 #include <linux/pci-ats.h>
 #include <linux/bitmap.h>
 #include <linux/iommu-helper.h>
 #include <linux/iommu.h>
 #include <linux/delay.h>
+#include <linux/amd-iommu.h>
+#include <linux/notifier.h>
+#include <linux/export.h>
+#include <asm/msidef.h>
 #include <asm/proto.h>
 #include <asm/iommu.h>
 #include <asm/gart.h>
 #include <asm/dma.h>
-#include <asm/amd_iommu_proto.h>
-#include <asm/amd_iommu_types.h>
-#include <asm/amd_iommu.h>
+
+#include "amd_iommu_proto.h"
+#include "amd_iommu_types.h"
 
 #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
 
 #define LOOP_TIMEOUT   100000
 
+/*
+ * This bitmap is used to advertise the page sizes our hardware supports
+ * to the IOMMU core, which will then use this information to split
+ * physically contiguous memory regions it is mapping into page sizes
+ * that we support.
+ *
+ * Traditionally the IOMMU core just handed us the mappings directly,
+ * after making sure the size was a power-of-two multiple of 4KiB and
+ * that the mapping had natural alignment.
+ *
+ * To retain this behavior, we currently advertise that we support
+ * all page sizes that are a power-of-two multiple of 4KiB.
+ *
+ * If at some point we'd like to utilize the IOMMU core's new behavior,
+ * we could change this to advertise the real page sizes we support.
+ */
+#define AMD_IOMMU_PGSIZES      (~0xFFFUL)
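As an aside, here is a minimal sketch of how a pgsize bitmap like this is consumed when the core splits a mapping. It is illustrative only: the helper and its name are not part of this patch, and only the bitmap semantics described in the comment above are taken from it (len is assumed to be non-zero).

static unsigned long example_pick_pgsize(unsigned long pgsize_bitmap,
					 unsigned long iova, size_t len)
{
	unsigned long size;

	/* Largest power-of-two chunk that still fits into the remaining length */
	size = 1UL << __fls(len);

	/* ...and that does not exceed the natural alignment of the IOVA */
	if (iova)
		size = min(size, iova & -iova);

	/* Fall back until we hit a page size the hardware advertises */
	while (size && !(pgsize_bitmap & size))
		size >>= 1;

	return size;	/* 0 means nothing advertised fits */
}

With ~0xFFFUL every power of two of at least 4KiB is advertised, so such a walk stops as soon as the candidate size reaches 4KiB, preserving the traditional behavior described in the comment.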
+
 static DEFINE_RWLOCK(amd_iommu_devtable_lock);
 
 /* A list of preallocated protection domains */
 static LIST_HEAD(iommu_pd_list);
 static DEFINE_SPINLOCK(iommu_pd_list_lock);
 
+/* List of all available dev_data structures */
+static LIST_HEAD(dev_data_list);
+static DEFINE_SPINLOCK(dev_data_list_lock);
+
 /*
  * Domain for untranslated devices - only allocated
  * if iommu=pt passed on kernel cmd line.
@@ -53,6 +80,9 @@ static struct protection_domain *pt_domain;
 
 static struct iommu_ops amd_iommu_ops;
 
+static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
+int amd_iommu_max_glx_val = -1;
+
 /*
  * general struct to manage commands send to an IOMMU
  */
@@ -61,6 +91,7 @@ struct iommu_cmd {
 };
 
 static void update_domain(struct protection_domain *domain);
+static int __init alloc_passthrough_domain(void);
 
 /****************************************************************************
  *
@@ -68,6 +99,67 @@ static void update_domain(struct protection_domain *domain);
  *
  ****************************************************************************/
 
+static struct iommu_dev_data *alloc_dev_data(u16 devid)
+{
+       struct iommu_dev_data *dev_data;
+       unsigned long flags;
+
+       dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
+       if (!dev_data)
+               return NULL;
+
+       dev_data->devid = devid;
+       atomic_set(&dev_data->bind, 0);
+
+       spin_lock_irqsave(&dev_data_list_lock, flags);
+       list_add_tail(&dev_data->dev_data_list, &dev_data_list);
+       spin_unlock_irqrestore(&dev_data_list_lock, flags);
+
+       return dev_data;
+}
+
+static void free_dev_data(struct iommu_dev_data *dev_data)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&dev_data_list_lock, flags);
+       list_del(&dev_data->dev_data_list);
+       spin_unlock_irqrestore(&dev_data_list_lock, flags);
+
+       kfree(dev_data);
+}
+
+static struct iommu_dev_data *search_dev_data(u16 devid)
+{
+       struct iommu_dev_data *dev_data;
+       unsigned long flags;
+
+       spin_lock_irqsave(&dev_data_list_lock, flags);
+       list_for_each_entry(dev_data, &dev_data_list, dev_data_list) {
+               if (dev_data->devid == devid)
+                       goto out_unlock;
+       }
+
+       dev_data = NULL;
+
+out_unlock:
+       spin_unlock_irqrestore(&dev_data_list_lock, flags);
+
+       return dev_data;
+}
+
+static struct iommu_dev_data *find_dev_data(u16 devid)
+{
+       struct iommu_dev_data *dev_data;
+
+       dev_data = search_dev_data(devid);
+
+       if (dev_data == NULL)
+               dev_data = alloc_dev_data(devid);
+
+       return dev_data;
+}
+
 static inline u16 get_device_id(struct device *dev)
 {
        struct pci_dev *pdev = to_pci_dev(dev);
@@ -80,6 +172,33 @@ static struct iommu_dev_data *get_dev_data(struct device *dev)
        return dev->archdata.iommu;
 }
 
+static bool pci_iommuv2_capable(struct pci_dev *pdev)
+{
+       static const int caps[] = {
+               PCI_EXT_CAP_ID_ATS,
+               PCI_EXT_CAP_ID_PRI,
+               PCI_EXT_CAP_ID_PASID,
+       };
+       int i, pos;
+
+       for (i = 0; i < 3; ++i) {
+               pos = pci_find_ext_capability(pdev, caps[i]);
+               if (pos == 0)
+                       return false;
+       }
+
+       return true;
+}
+
+static bool pdev_pri_erratum(struct pci_dev *pdev, u32 erratum)
+{
+       struct iommu_dev_data *dev_data;
+
+       dev_data = get_dev_data(&pdev->dev);
+
+       return dev_data->errata & (1 << erratum) ? true : false;
+}
+
 /*
  * In this function the list of preallocated protection domains is traversed to
  * find the domain for a specific device
@@ -137,33 +256,39 @@ static bool check_device(struct device *dev)
 
 static int iommu_init_device(struct device *dev)
 {
+       struct pci_dev *pdev = to_pci_dev(dev);
        struct iommu_dev_data *dev_data;
-       struct pci_dev *pdev;
-       u16 devid, alias;
+       u16 alias;
 
        if (dev->archdata.iommu)
                return 0;
 
-       dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
+       dev_data = find_dev_data(get_device_id(dev));
        if (!dev_data)
                return -ENOMEM;
 
-       dev_data->dev = dev;
+       alias = amd_iommu_alias_table[dev_data->devid];
+       if (alias != dev_data->devid) {
+               struct iommu_dev_data *alias_data;
 
-       devid = get_device_id(dev);
-       alias = amd_iommu_alias_table[devid];
-       pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
-       if (pdev)
-               dev_data->alias = &pdev->dev;
-       else {
-               kfree(dev_data);
-               return -ENOTSUPP;
+               alias_data = find_dev_data(alias);
+               if (alias_data == NULL) {
+                       pr_err("AMD-Vi: Warning: Unhandled device %s\n",
+                                       dev_name(dev));
+                       free_dev_data(dev_data);
+                       return -ENOTSUPP;
+               }
+               dev_data->alias_data = alias_data;
        }
 
-       atomic_set(&dev_data->bind, 0);
+       if (pci_iommuv2_capable(pdev)) {
+               struct amd_iommu *iommu;
 
-       dev->archdata.iommu = dev_data;
+               iommu              = amd_iommu_rlookup_table[dev_data->devid];
+               dev_data->iommu_v2 = iommu->is_iommu_v2;
+       }
 
+       dev->archdata.iommu = dev_data;
 
        return 0;
 }
@@ -184,11 +309,16 @@ static void iommu_ignore_device(struct device *dev)
 
 static void iommu_uninit_device(struct device *dev)
 {
-       kfree(dev->archdata.iommu);
+       /*
+        * Nothing to do here - we keep dev_data around for unplugged devices
+        * and reuse it when the device is re-plugged - not doing so would
+        * introduce a ton of races.
+        */
 }
 
 void __init amd_iommu_uninit_devices(void)
 {
+       struct iommu_dev_data *dev_data, *n;
        struct pci_dev *pdev = NULL;
 
        for_each_pci_dev(pdev) {
@@ -198,6 +328,10 @@ void __init amd_iommu_uninit_devices(void)
 
                iommu_uninit_device(&pdev->dev);
        }
+
+       /* Free all of our dev_data structures */
+       list_for_each_entry_safe(dev_data, n, &dev_data_list, dev_data_list)
+               free_dev_data(dev_data);
 }
 
 int __init amd_iommu_init_devices(void)
@@ -243,6 +377,11 @@ DECLARE_STATS_COUNTER(domain_flush_single);
 DECLARE_STATS_COUNTER(domain_flush_all);
 DECLARE_STATS_COUNTER(alloced_io_mem);
 DECLARE_STATS_COUNTER(total_map_requests);
+DECLARE_STATS_COUNTER(complete_ppr);
+DECLARE_STATS_COUNTER(invalidate_iotlb);
+DECLARE_STATS_COUNTER(invalidate_iotlb_all);
+DECLARE_STATS_COUNTER(pri_requests);
+
 
 static struct dentry *stats_dir;
 static struct dentry *de_fflush;
@@ -277,6 +416,10 @@ static void amd_iommu_stats_init(void)
        amd_iommu_stats_add(&domain_flush_all);
        amd_iommu_stats_add(&alloced_io_mem);
        amd_iommu_stats_add(&total_map_requests);
+       amd_iommu_stats_add(&complete_ppr);
+       amd_iommu_stats_add(&invalidate_iotlb);
+       amd_iommu_stats_add(&invalidate_iotlb_all);
+       amd_iommu_stats_add(&pri_requests);
 }
 
 #endif
@@ -291,8 +434,8 @@ static void dump_dte_entry(u16 devid)
 {
        int i;
 
-       for (i = 0; i < 8; ++i)
-               pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
+       for (i = 0; i < 4; ++i)
+               pr_err("AMD-Vi: DTE[%d]: %016llx\n", i,
                        amd_iommu_dev_table[devid].data[i]);
 }
 
@@ -387,12 +530,84 @@ static void iommu_poll_events(struct amd_iommu *iommu)
        spin_unlock_irqrestore(&iommu->lock, flags);
 }
 
+static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u32 head)
+{
+       struct amd_iommu_fault fault;
+       volatile u64 *raw;
+       int i;
+
+       INC_STATS_COUNTER(pri_requests);
+
+       raw = (u64 *)(iommu->ppr_log + head);
+
+       /*
+        * Hardware bug: Interrupt may arrive before the entry is written to
+        * memory. If this happens we need to wait for the entry to arrive.
+        */
+       for (i = 0; i < LOOP_TIMEOUT; ++i) {
+               if (PPR_REQ_TYPE(raw[0]) != 0)
+                       break;
+               udelay(1);
+       }
+
+       if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) {
+               pr_err_ratelimited("AMD-Vi: Unknown PPR request received\n");
+               return;
+       }
+
+       fault.address   = raw[1];
+       fault.pasid     = PPR_PASID(raw[0]);
+       fault.device_id = PPR_DEVID(raw[0]);
+       fault.tag       = PPR_TAG(raw[0]);
+       fault.flags     = PPR_FLAGS(raw[0]);
+
+       /*
+        * To detect the hardware bug we need to clear the entry
+        * back to zero.
+        */
+       raw[0] = raw[1] = 0;
+
+       atomic_notifier_call_chain(&ppr_notifier, 0, &fault);
+}
+
+static void iommu_poll_ppr_log(struct amd_iommu *iommu)
+{
+       unsigned long flags;
+       u32 head, tail;
+
+       if (iommu->ppr_log == NULL)
+               return;
+
+       spin_lock_irqsave(&iommu->lock, flags);
+
+       head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+       tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
+
+       while (head != tail) {
+
+               /* Handle PPR entry */
+               iommu_handle_ppr_entry(iommu, head);
+
+               /* Update and refresh ring-buffer state */
+               head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
+               writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+               tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
+       }
+
+       /* enable ppr interrupts again */
+       writel(MMIO_STATUS_PPR_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET);
+
+       spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
 irqreturn_t amd_iommu_int_thread(int irq, void *data)
 {
        struct amd_iommu *iommu;
 
-       for_each_iommu(iommu)
+       for_each_iommu(iommu) {
                iommu_poll_events(iommu);
+               iommu_poll_ppr_log(iommu);
+       }
 
        return IRQ_HANDLED;
 }
@@ -521,6 +736,60 @@ static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
                cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
 }
 
+static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, int pasid,
+                                 u64 address, bool size)
+{
+       memset(cmd, 0, sizeof(*cmd));
+
+       address &= ~(0xfffULL);
+
+       cmd->data[0]  = pasid & PASID_MASK;
+       cmd->data[1]  = domid;
+       cmd->data[2]  = lower_32_bits(address);
+       cmd->data[3]  = upper_32_bits(address);
+       cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+       cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
+       if (size)
+               cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+       CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
+}
+
+static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, int pasid,
+                                 int qdep, u64 address, bool size)
+{
+       memset(cmd, 0, sizeof(*cmd));
+
+       address &= ~(0xfffULL);
+
+       cmd->data[0]  = devid;
+       cmd->data[0] |= (pasid & 0xff) << 16;
+       cmd->data[0] |= (qdep  & 0xff) << 24;
+       cmd->data[1]  = devid;
+       cmd->data[1] |= ((pasid >> 8) & 0xfff) << 16;
+       cmd->data[2]  = lower_32_bits(address);
+       cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
+       cmd->data[3]  = upper_32_bits(address);
+       if (size)
+               cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+       CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
+}
+
+static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, int pasid,
+                              int status, int tag, bool gn)
+{
+       memset(cmd, 0, sizeof(*cmd));
+
+       cmd->data[0]  = devid;
+       if (gn) {
+               cmd->data[1]  = pasid & PASID_MASK;
+               cmd->data[2]  = CMD_INV_IOMMU_PAGES_GN_MASK;
+       }
+       cmd->data[3]  = tag & 0x1ff;
+       cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;
+
+       CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
+}
+
 static void build_inv_all(struct iommu_cmd *cmd)
 {
        memset(cmd, 0, sizeof(*cmd));
@@ -531,7 +800,9 @@ static void build_inv_all(struct iommu_cmd *cmd)
  * Writes the command to the IOMMUs command buffer and informs the
  * hardware about the new command.
  */
-static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
+static int iommu_queue_command_sync(struct amd_iommu *iommu,
+                                   struct iommu_cmd *cmd,
+                                   bool sync)
 {
        u32 left, tail, head, next_tail;
        unsigned long flags;
@@ -565,13 +836,18 @@ again:
        copy_cmd_to_buffer(iommu, cmd, tail);
 
        /* We need to sync now to make sure all commands are processed */
-       iommu->need_sync = true;
+       iommu->need_sync = sync;
 
        spin_unlock_irqrestore(&iommu->lock, flags);
 
        return 0;
 }
 
+static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
+{
+       return iommu_queue_command_sync(iommu, cmd, true);
+}
+
 /*
  * This function queues a completion wait command into the command
  * buffer of an IOMMU
@@ -587,7 +863,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
 
        build_completion_wait(&cmd, (u64)&sem);
 
-       ret = iommu_queue_command(iommu, &cmd);
+       ret = iommu_queue_command_sync(iommu, &cmd, false);
        if (ret)
                return ret;
 
@@ -654,19 +930,17 @@ void iommu_flush_all_caches(struct amd_iommu *iommu)
 /*
  * Command send function for flushing on-device TLB
  */
-static int device_flush_iotlb(struct device *dev, u64 address, size_t size)
+static int device_flush_iotlb(struct iommu_dev_data *dev_data,
+                             u64 address, size_t size)
 {
-       struct pci_dev *pdev = to_pci_dev(dev);
        struct amd_iommu *iommu;
        struct iommu_cmd cmd;
-       u16 devid;
        int qdep;
 
-       qdep  = pci_ats_queue_depth(pdev);
-       devid = get_device_id(dev);
-       iommu = amd_iommu_rlookup_table[devid];
+       qdep     = dev_data->ats.qdep;
+       iommu    = amd_iommu_rlookup_table[dev_data->devid];
 
-       build_inv_iotlb_pages(&cmd, devid, qdep, address, size);
+       build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size);
 
        return iommu_queue_command(iommu, &cmd);
 }
@@ -674,23 +948,19 @@ static int device_flush_iotlb(struct device *dev, u64 address, size_t size)
 /*
  * Command send function for invalidating a device table entry
  */
-static int device_flush_dte(struct device *dev)
+static int device_flush_dte(struct iommu_dev_data *dev_data)
 {
        struct amd_iommu *iommu;
-       struct pci_dev *pdev;
-       u16 devid;
        int ret;
 
-       pdev  = to_pci_dev(dev);
-       devid = get_device_id(dev);
-       iommu = amd_iommu_rlookup_table[devid];
+       iommu = amd_iommu_rlookup_table[dev_data->devid];
 
-       ret = iommu_flush_dte(iommu, devid);
+       ret = iommu_flush_dte(iommu, dev_data->devid);
        if (ret)
                return ret;
 
-       if (pci_ats_enabled(pdev))
-               ret = device_flush_iotlb(dev, 0, ~0UL);
+       if (dev_data->ats.enabled)
+               ret = device_flush_iotlb(dev_data, 0, ~0UL);
 
        return ret;
 }
@@ -721,12 +991,11 @@ static void __domain_flush_pages(struct protection_domain *domain,
        }
 
        list_for_each_entry(dev_data, &domain->dev_list, list) {
-               struct pci_dev *pdev = to_pci_dev(dev_data->dev);
 
-               if (!pci_ats_enabled(pdev))
+               if (!dev_data->ats.enabled)
                        continue;
 
-               ret |= device_flush_iotlb(dev_data->dev, address, size);
+               ret |= device_flush_iotlb(dev_data, address, size);
        }
 
        WARN_ON(ret);
@@ -773,14 +1042,9 @@ static void domain_flush_complete(struct protection_domain *domain)
 static void domain_flush_devices(struct protection_domain *domain)
 {
        struct iommu_dev_data *dev_data;
-       unsigned long flags;
-
-       spin_lock_irqsave(&domain->lock, flags);
 
        list_for_each_entry(dev_data, &domain->dev_list, list)
-               device_flush_dte(dev_data->dev);
-
-       spin_unlock_irqrestore(&domain->lock, flags);
+               device_flush_dte(dev_data);
 }
 
 /****************************************************************************
@@ -1136,7 +1400,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
 {
        int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
        struct amd_iommu *iommu;
-       unsigned long i;
+       unsigned long i, old_size;
 
 #ifdef CONFIG_IOMMU_STRESS
        populate = false;
@@ -1172,8 +1436,21 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
                }
        }
 
+       old_size                = dma_dom->aperture_size;
        dma_dom->aperture_size += APERTURE_RANGE_SIZE;
 
+       /* Reserve address range used for MSI messages */
+       if (old_size < MSI_ADDR_BASE_LO &&
+           dma_dom->aperture_size > MSI_ADDR_BASE_LO) {
+               unsigned long spage;
+               int pages;
+
+               pages = iommu_num_pages(MSI_ADDR_BASE_LO, 0x10000, PAGE_SIZE);
+               spage = MSI_ADDR_BASE_LO >> PAGE_SHIFT;
+
+               dma_ops_reserve_addresses(dma_dom, spage, pages);
+       }
+
        /* Initialize the exclusion range if necessary */
        for_each_iommu(iommu) {
                if (iommu->exclusion_start &&
@@ -1201,7 +1478,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
                if (!pte || !IOMMU_PTE_PRESENT(*pte))
                        continue;
 
-               dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
+               dma_ops_reserve_addresses(dma_dom, i >> PAGE_SHIFT, 1);
        }
 
        update_domain(&dma_dom->domain);
@@ -1414,6 +1691,48 @@ static void free_pagetable(struct protection_domain *domain)
        domain->pt_root = NULL;
 }
 
+static void free_gcr3_tbl_level1(u64 *tbl)
+{
+       u64 *ptr;
+       int i;
+
+       for (i = 0; i < 512; ++i) {
+               if (!(tbl[i] & GCR3_VALID))
+                       continue;
+
+               ptr = __va(tbl[i] & PAGE_MASK);
+
+               free_page((unsigned long)ptr);
+       }
+}
+
+static void free_gcr3_tbl_level2(u64 *tbl)
+{
+       u64 *ptr;
+       int i;
+
+       for (i = 0; i < 512; ++i) {
+               if (!(tbl[i] & GCR3_VALID))
+                       continue;
+
+               ptr = __va(tbl[i] & PAGE_MASK);
+
+               free_gcr3_tbl_level1(ptr);
+       }
+}
+
+static void free_gcr3_table(struct protection_domain *domain)
+{
+       if (domain->glx == 2)
+               free_gcr3_tbl_level2(domain->gcr3_tbl);
+       else if (domain->glx == 1)
+               free_gcr3_tbl_level1(domain->gcr3_tbl);
+       else if (domain->glx != 0)
+               BUG();
+
+       free_page((unsigned long)domain->gcr3_tbl);
+}
+
 /*
  * Free a domain, only used if something went wrong in the
  * allocation path and we need to free an already allocated page table
@@ -1500,20 +1819,52 @@ static bool dma_ops_domain(struct protection_domain *domain)
 
 static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
 {
-       u64 pte_root = virt_to_phys(domain->pt_root);
-       u32 flags = 0;
+       u64 pte_root = 0;
+       u64 flags = 0;
+
+       if (domain->mode != PAGE_MODE_NONE)
+               pte_root = virt_to_phys(domain->pt_root);
 
        pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
                    << DEV_ENTRY_MODE_SHIFT;
        pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
 
+       flags = amd_iommu_dev_table[devid].data[1];
+
        if (ats)
                flags |= DTE_FLAG_IOTLB;
 
-       amd_iommu_dev_table[devid].data[3] |= flags;
-       amd_iommu_dev_table[devid].data[2]  = domain->id;
-       amd_iommu_dev_table[devid].data[1]  = upper_32_bits(pte_root);
-       amd_iommu_dev_table[devid].data[0]  = lower_32_bits(pte_root);
+       if (domain->flags & PD_IOMMUV2_MASK) {
+               u64 gcr3 = __pa(domain->gcr3_tbl);
+               u64 glx  = domain->glx;
+               u64 tmp;
+
+               pte_root |= DTE_FLAG_GV;
+               pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
+
+               /* First mask out possible old values for GCR3 table */
+               tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
+               flags    &= ~tmp;
+
+               tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
+               flags    &= ~tmp;
+
+               /* Encode GCR3 table into DTE */
+               tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
+               pte_root |= tmp;
+
+               tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
+               flags    |= tmp;
+
+               tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
+               flags    |= tmp;
+       }
+
+       flags &= ~(0xffffUL);
+       flags |= domain->id;
+
+       amd_iommu_dev_table[devid].data[1]  = flags;
+       amd_iommu_dev_table[devid].data[0]  = pte_root;
 }
 
 static void clear_dte_entry(u16 devid)
@@ -1521,49 +1872,37 @@ static void clear_dte_entry(u16 devid)
        /* remove entry from the device table seen by the hardware */
        amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
        amd_iommu_dev_table[devid].data[1] = 0;
-       amd_iommu_dev_table[devid].data[2] = 0;
 
        amd_iommu_apply_erratum_63(devid);
 }
 
-static void do_attach(struct device *dev, struct protection_domain *domain)
+static void do_attach(struct iommu_dev_data *dev_data,
+                     struct protection_domain *domain)
 {
-       struct iommu_dev_data *dev_data;
        struct amd_iommu *iommu;
-       struct pci_dev *pdev;
-       bool ats = false;
-       u16 devid;
+       bool ats;
 
-       devid    = get_device_id(dev);
-       iommu    = amd_iommu_rlookup_table[devid];
-       dev_data = get_dev_data(dev);
-       pdev     = to_pci_dev(dev);
-
-       if (amd_iommu_iotlb_sup)
-               ats = pci_ats_enabled(pdev);
+       iommu = amd_iommu_rlookup_table[dev_data->devid];
+       ats   = dev_data->ats.enabled;
 
        /* Update data structures */
        dev_data->domain = domain;
        list_add(&dev_data->list, &domain->dev_list);
-       set_dte_entry(devid, domain, ats);
+       set_dte_entry(dev_data->devid, domain, ats);
 
        /* Do reference counting */
        domain->dev_iommu[iommu->index] += 1;
        domain->dev_cnt                 += 1;
 
        /* Flush the DTE entry */
-       device_flush_dte(dev);
+       device_flush_dte(dev_data);
 }
 
-static void do_detach(struct device *dev)
+static void do_detach(struct iommu_dev_data *dev_data)
 {
-       struct iommu_dev_data *dev_data;
        struct amd_iommu *iommu;
-       u16 devid;
 
-       devid    = get_device_id(dev);
-       iommu    = amd_iommu_rlookup_table[devid];
-       dev_data = get_dev_data(dev);
+       iommu = amd_iommu_rlookup_table[dev_data->devid];
 
        /* decrease reference counters */
        dev_data->domain->dev_iommu[iommu->index] -= 1;
@@ -1572,52 +1911,46 @@ static void do_detach(struct device *dev)
        /* Update data structures */
        dev_data->domain = NULL;
        list_del(&dev_data->list);
-       clear_dte_entry(devid);
+       clear_dte_entry(dev_data->devid);
 
        /* Flush the DTE entry */
-       device_flush_dte(dev);
+       device_flush_dte(dev_data);
 }
 
 /*
  * If a device is not yet associated with a domain, this function
  * assigns it to the domain and makes it visible to the hardware
  */
-static int __attach_device(struct device *dev,
+static int __attach_device(struct iommu_dev_data *dev_data,
                           struct protection_domain *domain)
 {
-       struct iommu_dev_data *dev_data, *alias_data;
        int ret;
 
-       dev_data   = get_dev_data(dev);
-       alias_data = get_dev_data(dev_data->alias);
-
-       if (!alias_data)
-               return -EINVAL;
-
        /* lock domain */
        spin_lock(&domain->lock);
 
-       /* Some sanity checks */
-       ret = -EBUSY;
-       if (alias_data->domain != NULL &&
-           alias_data->domain != domain)
-               goto out_unlock;
+       if (dev_data->alias_data != NULL) {
+               struct iommu_dev_data *alias_data = dev_data->alias_data;
 
-       if (dev_data->domain != NULL &&
-           dev_data->domain != domain)
-               goto out_unlock;
+               /* Some sanity checks */
+               ret = -EBUSY;
+               if (alias_data->domain != NULL &&
+                               alias_data->domain != domain)
+                       goto out_unlock;
 
-       /* Do real assignment */
-       if (dev_data->alias != dev) {
-               alias_data = get_dev_data(dev_data->alias);
+               if (dev_data->domain != NULL &&
+                               dev_data->domain != domain)
+                       goto out_unlock;
+
+               /* Do real assignment */
                if (alias_data->domain == NULL)
-                       do_attach(dev_data->alias, domain);
+                       do_attach(alias_data, domain);
 
                atomic_inc(&alias_data->bind);
        }
 
        if (dev_data->domain == NULL)
-               do_attach(dev, domain);
+               do_attach(dev_data, domain);
 
        atomic_inc(&dev_data->bind);
 
@@ -1631,6 +1964,93 @@ out_unlock:
        return ret;
 }
 
+
+static void pdev_iommuv2_disable(struct pci_dev *pdev)
+{
+       pci_disable_ats(pdev);
+       pci_disable_pri(pdev);
+       pci_disable_pasid(pdev);
+}
+
+/* FIXME: Change generic reset-function to do the same */
+static int pri_reset_while_enabled(struct pci_dev *pdev)
+{
+       u16 control;
+       int pos;
+
+       pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
+       if (!pos)
+               return -EINVAL;
+
+       pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control);
+       control |= PCI_PRI_CTRL_RESET;
+       pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control);
+
+       return 0;
+}
+
+static int pdev_iommuv2_enable(struct pci_dev *pdev)
+{
+       bool reset_enable;
+       int reqs, ret;
+
+       /* FIXME: Hardcode number of outstanding requests for now */
+       reqs = 32;
+       if (pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE))
+               reqs = 1;
+       reset_enable = pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_ENABLE_RESET);
+
+       /* Only allow access to user-accessible pages */
+       ret = pci_enable_pasid(pdev, 0);
+       if (ret)
+               goto out_err;
+
+       /* First reset the PRI state of the device */
+       ret = pci_reset_pri(pdev);
+       if (ret)
+               goto out_err;
+
+       /* Enable PRI */
+       ret = pci_enable_pri(pdev, reqs);
+       if (ret)
+               goto out_err;
+
+       if (reset_enable) {
+               ret = pri_reset_while_enabled(pdev);
+               if (ret)
+                       goto out_err;
+       }
+
+       ret = pci_enable_ats(pdev, PAGE_SHIFT);
+       if (ret)
+               goto out_err;
+
+       return 0;
+
+out_err:
+       pci_disable_pri(pdev);
+       pci_disable_pasid(pdev);
+
+       return ret;
+}
+
+/* FIXME: Move this to PCI code */
+#define PCI_PRI_TLP_OFF                (1 << 15)
+
+static bool pci_pri_tlp_required(struct pci_dev *pdev)
+{
+       u16 status;
+       int pos;
+
+       pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
+       if (!pos)
+               return false;
+
+       pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status);
+
+       return (status & PCI_PRI_TLP_OFF) ? true : false;
+}
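For reference (a hedged note based on the PCIe spec rather than on this patch): bit 15 of the PRI status register is the "PRG Response PASID Required" flag, i.e. the device expects the PASID TLP prefix on PRG Response messages, which is the bit the commit title refers to. attach_device() below caches the result in dev_data->pri_tlp, and amd_iommu_complete_ppr() later hands it to build_complete_ppr() as the GN flag.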
+
 /*
  * If a device is not yet associated with a domain, this function
  * assigns it to the domain and makes it visible to the hardware
@@ -1639,14 +2059,30 @@ static int attach_device(struct device *dev,
                         struct protection_domain *domain)
 {
        struct pci_dev *pdev = to_pci_dev(dev);
+       struct iommu_dev_data *dev_data;
        unsigned long flags;
        int ret;
 
-       if (amd_iommu_iotlb_sup)
-               pci_enable_ats(pdev, PAGE_SHIFT);
+       dev_data = get_dev_data(dev);
+
+       if (domain->flags & PD_IOMMUV2_MASK) {
+               if (!dev_data->iommu_v2 || !dev_data->passthrough)
+                       return -EINVAL;
+
+               if (pdev_iommuv2_enable(pdev) != 0)
+                       return -EINVAL;
+
+               dev_data->ats.enabled = true;
+               dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
+               dev_data->pri_tlp     = pci_pri_tlp_required(pdev);
+       } else if (amd_iommu_iotlb_sup &&
+                  pci_enable_ats(pdev, PAGE_SHIFT) == 0) {
+               dev_data->ats.enabled = true;
+               dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
+       }
 
        write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-       ret = __attach_device(dev, domain);
+       ret = __attach_device(dev_data, domain);
        write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
 
        /*
@@ -1662,10 +2098,8 @@ static int attach_device(struct device *dev,
 /*
  * Removes a device from a protection domain (unlocked)
  */
-static void __detach_device(struct device *dev)
+static void __detach_device(struct iommu_dev_data *dev_data)
 {
-       struct iommu_dev_data *dev_data = get_dev_data(dev);
-       struct iommu_dev_data *alias_data;
        struct protection_domain *domain;
        unsigned long flags;
 
@@ -1675,14 +2109,15 @@ static void __detach_device(struct device *dev)
 
        spin_lock_irqsave(&domain->lock, flags);
 
-       if (dev_data->alias != dev) {
-               alias_data = get_dev_data(dev_data->alias);
+       if (dev_data->alias_data != NULL) {
+               struct iommu_dev_data *alias_data = dev_data->alias_data;
+
                if (atomic_dec_and_test(&alias_data->bind))
-                       do_detach(dev_data->alias);
+                       do_detach(alias_data);
        }
 
        if (atomic_dec_and_test(&dev_data->bind))
-               do_detach(dev);
+               do_detach(dev_data);
 
        spin_unlock_irqrestore(&domain->lock, flags);
 
@@ -1691,9 +2126,9 @@ static void __detach_device(struct device *dev)
         * passthrough domain if it is detached from any other domain.
         * Make sure we can deassign from the pt_domain itself.
         */
-       if (iommu_pass_through &&
+       if (dev_data->passthrough &&
            (dev_data->domain == NULL && domain != pt_domain))
-               __attach_device(dev, pt_domain);
+               __attach_device(dev_data, pt_domain);
 }
 
 /*
@@ -1701,16 +2136,24 @@ static void __detach_device(struct device *dev)
  */
 static void detach_device(struct device *dev)
 {
-       struct pci_dev *pdev = to_pci_dev(dev);
+       struct protection_domain *domain;
+       struct iommu_dev_data *dev_data;
        unsigned long flags;
 
+       dev_data = get_dev_data(dev);
+       domain   = dev_data->domain;
+
        /* lock device table */
        write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-       __detach_device(dev);
+       __detach_device(dev_data);
        write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
 
-       if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev))
-               pci_disable_ats(pdev);
+       if (domain->flags & PD_IOMMUV2_MASK)
+               pdev_iommuv2_disable(to_pci_dev(dev));
+       else if (dev_data->ats.enabled)
+               pci_disable_ats(to_pci_dev(dev));
+
+       dev_data->ats.enabled = false;
 }
 
 /*
@@ -1719,26 +2162,25 @@ static void detach_device(struct device *dev)
  */
 static struct protection_domain *domain_for_device(struct device *dev)
 {
-       struct protection_domain *dom;
-       struct iommu_dev_data *dev_data, *alias_data;
+       struct iommu_dev_data *dev_data;
+       struct protection_domain *dom = NULL;
        unsigned long flags;
-       u16 devid;
 
-       devid      = get_device_id(dev);
        dev_data   = get_dev_data(dev);
-       alias_data = get_dev_data(dev_data->alias);
-       if (!alias_data)
-               return NULL;
 
-       read_lock_irqsave(&amd_iommu_devtable_lock, flags);
-       dom = dev_data->domain;
-       if (dom == NULL &&
-           alias_data->domain != NULL) {
-               __attach_device(dev, alias_data->domain);
-               dom = alias_data->domain;
-       }
+       if (dev_data->domain)
+               return dev_data->domain;
 
-       read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+       if (dev_data->alias_data != NULL) {
+               struct iommu_dev_data *alias_data = dev_data->alias_data;
+
+               read_lock_irqsave(&amd_iommu_devtable_lock, flags);
+               if (alias_data->domain != NULL) {
+                       __attach_device(dev_data, alias_data->domain);
+                       dom = alias_data->domain;
+               }
+               read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+       }
 
        return dom;
 }
@@ -1746,18 +2188,20 @@ static struct protection_domain *domain_for_device(struct device *dev)
 static int device_change_notifier(struct notifier_block *nb,
                                  unsigned long action, void *data)
 {
-       struct device *dev = data;
-       u16 devid;
-       struct protection_domain *domain;
        struct dma_ops_domain *dma_domain;
+       struct protection_domain *domain;
+       struct iommu_dev_data *dev_data;
+       struct device *dev = data;
        struct amd_iommu *iommu;
        unsigned long flags;
+       u16 devid;
 
        if (!check_device(dev))
                return 0;
 
-       devid  = get_device_id(dev);
-       iommu  = amd_iommu_rlookup_table[devid];
+       devid    = get_device_id(dev);
+       iommu    = amd_iommu_rlookup_table[devid];
+       dev_data = get_dev_data(dev);
 
        switch (action) {
        case BUS_NOTIFY_UNBOUND_DRIVER:
@@ -1766,7 +2210,7 @@ static int device_change_notifier(struct notifier_block *nb,
 
                if (!domain)
                        goto out;
-               if (iommu_pass_through)
+               if (dev_data->passthrough)
                        break;
                detach_device(dev);
                break;
@@ -1798,7 +2242,6 @@ static int device_change_notifier(struct notifier_block *nb,
                goto out;
        }
 
-       device_flush_dte(dev);
        iommu_completion_wait(iommu);
 
 out:
@@ -1858,11 +2301,8 @@ static void update_device_table(struct protection_domain *domain)
 {
        struct iommu_dev_data *dev_data;
 
-       list_for_each_entry(dev_data, &domain->dev_list, list) {
-               struct pci_dev *pdev = to_pci_dev(dev_data->dev);
-               u16 devid = get_device_id(dev_data->dev);
-               set_dte_entry(devid, domain, pci_ats_enabled(pdev));
-       }
+       list_for_each_entry(dev_data, &domain->dev_list, list)
+               set_dte_entry(dev_data->devid, domain, dev_data->ats.enabled);
 }
 
 static void update_domain(struct protection_domain *domain)
@@ -2267,7 +2707,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
  * The exported alloc_coherent function for dma_ops.
  */
 static void *alloc_coherent(struct device *dev, size_t size,
-                           dma_addr_t *dma_addr, gfp_t flag)
+                           dma_addr_t *dma_addr, gfp_t flag,
+                           struct dma_attrs *attrs)
 {
        unsigned long flags;
        void *virt_addr;
@@ -2325,7 +2766,8 @@ out_free:
  * The exported free_coherent function for dma_ops.
  */
 static void free_coherent(struct device *dev, size_t size,
-                         void *virt_addr, dma_addr_t dma_addr)
+                         void *virt_addr, dma_addr_t dma_addr,
+                         struct dma_attrs *attrs)
 {
        unsigned long flags;
        struct protection_domain *domain;
@@ -2364,10 +2806,11 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask)
  * we don't need to preallocate the protection domains anymore.
  * For now we have to.
  */
-static void prealloc_protection_domains(void)
+static void __init prealloc_protection_domains(void)
 {
-       struct pci_dev *dev = NULL;
+       struct iommu_dev_data *dev_data;
        struct dma_ops_domain *dma_dom;
+       struct pci_dev *dev = NULL;
        u16 devid;
 
        for_each_pci_dev(dev) {
@@ -2376,6 +2819,16 @@ static void prealloc_protection_domains(void)
                if (!check_device(&dev->dev))
                        continue;
 
+               dev_data = get_dev_data(&dev->dev);
+               if (!amd_iommu_force_isolation && dev_data->iommu_v2) {
+                       /* Make sure passthrough domain is allocated */
+                       alloc_passthrough_domain();
+                       dev_data->passthrough = true;
+                       attach_device(&dev->dev, pt_domain);
+                       pr_info("AMD-Vi: Using passthrough domain for device %s\n",
+                               dev_name(&dev->dev));
+               }
+
                /* Is there already any domain for it? */
                if (domain_for_device(&dev->dev))
                        continue;
@@ -2395,8 +2848,8 @@ static void prealloc_protection_domains(void)
 }
 
 static struct dma_map_ops amd_iommu_dma_ops = {
-       .alloc_coherent = alloc_coherent,
-       .free_coherent = free_coherent,
+       .alloc = alloc_coherent,
+       .free = free_coherent,
        .map_page = map_page,
        .unmap_page = unmap_page,
        .map_sg = map_sg,
@@ -2406,16 +2859,25 @@ static struct dma_map_ops amd_iommu_dma_ops = {
 
 static unsigned device_dma_ops_init(void)
 {
+       struct iommu_dev_data *dev_data;
        struct pci_dev *pdev = NULL;
        unsigned unhandled = 0;
 
        for_each_pci_dev(pdev) {
                if (!check_device(&pdev->dev)) {
+
+                       iommu_ignore_device(&pdev->dev);
+
                        unhandled += 1;
                        continue;
                }
 
-               pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops;
+               dev_data = get_dev_data(&pdev->dev);
+
+               if (!dev_data->passthrough)
+                       pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops;
+               else
+                       pdev->dev.archdata.dma_ops = &nommu_dma_ops;
        }
 
        return unhandled;
@@ -2427,7 +2889,7 @@ static unsigned device_dma_ops_init(void)
 
 void __init amd_iommu_init_api(void)
 {
-       register_iommu(&amd_iommu_ops);
+       bus_set_iommu(&pci_bus_type, &amd_iommu_ops);
 }
 
 int __init amd_iommu_init_dma_ops(void)
@@ -2497,9 +2959,7 @@ static void cleanup_domain(struct protection_domain *domain)
        write_lock_irqsave(&amd_iommu_devtable_lock, flags);
 
        list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
-               struct device *dev = dev_data->dev;
-
-               __detach_device(dev);
+               __detach_device(dev_data);
                atomic_set(&dev_data->bind, 0);
        }
 
@@ -2544,6 +3004,20 @@ out_err:
        return NULL;
 }
 
+static int __init alloc_passthrough_domain(void)
+{
+       if (pt_domain != NULL)
+               return 0;
+
+       /* allocate passthrough domain */
+       pt_domain = protection_domain_alloc();
+       if (!pt_domain)
+               return -ENOMEM;
+
+       pt_domain->mode = PAGE_MODE_NONE;
+
+       return 0;
+}
 static int amd_iommu_domain_init(struct iommu_domain *dom)
 {
        struct protection_domain *domain;
@@ -2557,6 +3031,8 @@ static int amd_iommu_domain_init(struct iommu_domain *dom)
        if (!domain->pt_root)
                goto out_free;
 
+       domain->iommu_domain = dom;
+
        dom->priv = domain;
 
        return 0;
@@ -2579,7 +3055,11 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
 
        BUG_ON(domain->dev_cnt != 0);
 
-       free_pagetable(domain);
+       if (domain->mode != PAGE_MODE_NONE)
+               free_pagetable(domain);
+
+       if (domain->flags & PD_IOMMUV2_MASK)
+               free_gcr3_table(domain);
 
        protection_domain_free(domain);
 
@@ -2605,7 +3085,6 @@ static void amd_iommu_detach_device(struct iommu_domain *dom,
        if (!iommu)
                return;
 
-       device_flush_dte(dev);
        iommu_completion_wait(iommu);
 }
 
@@ -2616,16 +3095,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
        struct iommu_dev_data *dev_data;
        struct amd_iommu *iommu;
        int ret;
-       u16 devid;
 
        if (!check_device(dev))
                return -EINVAL;
 
        dev_data = dev->archdata.iommu;
 
-       devid = get_device_id(dev);
-
-       iommu = amd_iommu_rlookup_table[devid];
+       iommu = amd_iommu_rlookup_table[dev_data->devid];
        if (!iommu)
                return -EINVAL;
 
@@ -2640,13 +3116,15 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
 }
 
 static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
-                        phys_addr_t paddr, int gfp_order, int iommu_prot)
+                        phys_addr_t paddr, size_t page_size, int iommu_prot)
 {
-       unsigned long page_size = 0x1000UL << gfp_order;
        struct protection_domain *domain = dom->priv;
        int prot = 0;
        int ret;
 
+       if (domain->mode == PAGE_MODE_NONE)
+               return -EINVAL;
+
        if (iommu_prot & IOMMU_READ)
                prot |= IOMMU_PROT_IR;
        if (iommu_prot & IOMMU_WRITE)
@@ -2659,13 +3137,14 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
        return ret;
 }
 
-static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
-                          int gfp_order)
+static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
+                          size_t page_size)
 {
        struct protection_domain *domain = dom->priv;
-       unsigned long page_size, unmap_size;
+       size_t unmap_size;
 
-       page_size  = 0x1000UL << gfp_order;
+       if (domain->mode == PAGE_MODE_NONE)
+               return -EINVAL;
 
        mutex_lock(&domain->api_lock);
        unmap_size = iommu_unmap_page(domain, iova, page_size);
@@ -2673,7 +3152,7 @@ static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
 
        domain_flush_tlb_pde(domain);
 
-       return get_order(unmap_size);
+       return unmap_size;
 }
 
 static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
@@ -2684,6 +3163,9 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
        phys_addr_t paddr;
        u64 *pte, __pte;
 
+       if (domain->mode == PAGE_MODE_NONE)
+               return iova;
+
        pte = fetch_pte(domain, iova);
 
        if (!pte || !IOMMU_PTE_PRESENT(*pte))
@@ -2711,6 +3193,26 @@ static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
        return 0;
 }
 
+static int amd_iommu_device_group(struct device *dev, unsigned int *groupid)
+{
+       struct iommu_dev_data *dev_data = dev->archdata.iommu;
+       struct pci_dev *pdev = to_pci_dev(dev);
+       u16 devid;
+
+       if (!dev_data)
+               return -ENODEV;
+
+       if (pdev->is_virtfn || !iommu_group_mf)
+               devid = dev_data->devid;
+       else
+               devid = calc_devid(pdev->bus->number,
+                                  PCI_DEVFN(PCI_SLOT(pdev->devfn), 0));
+
+       *groupid = amd_iommu_alias_table[devid];
+
+       return 0;
+}
+
 static struct iommu_ops amd_iommu_ops = {
        .domain_init = amd_iommu_domain_init,
        .domain_destroy = amd_iommu_domain_destroy,
@@ -2720,6 +3222,8 @@ static struct iommu_ops amd_iommu_ops = {
        .unmap = amd_iommu_unmap,
        .iova_to_phys = amd_iommu_iova_to_phys,
        .domain_has_cap = amd_iommu_domain_has_cap,
+       .device_group = amd_iommu_device_group,
+       .pgsize_bitmap  = AMD_IOMMU_PGSIZES,
 };
 
 /*****************************************************************************
@@ -2734,21 +3238,23 @@ static struct iommu_ops amd_iommu_ops = {
 
 int __init amd_iommu_init_passthrough(void)
 {
-       struct amd_iommu *iommu;
+       struct iommu_dev_data *dev_data;
        struct pci_dev *dev = NULL;
+       struct amd_iommu *iommu;
        u16 devid;
+       int ret;
 
-       /* allocate passthrough domain */
-       pt_domain = protection_domain_alloc();
-       if (!pt_domain)
-               return -ENOMEM;
-
-       pt_domain->mode |= PAGE_MODE_NONE;
+       ret = alloc_passthrough_domain();
+       if (ret)
+               return ret;
 
        for_each_pci_dev(dev) {
                if (!check_device(&dev->dev))
                        continue;
 
+               dev_data = get_dev_data(&dev->dev);
+               dev_data->passthrough = true;
+
                devid = get_device_id(&dev->dev);
 
                iommu = amd_iommu_rlookup_table[devid];
@@ -2758,7 +3264,375 @@ int __init amd_iommu_init_passthrough(void)
                attach_device(&dev->dev, pt_domain);
        }
 
+       amd_iommu_stats_init();
+
        pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
 
        return 0;
 }
+
+/* IOMMUv2 specific functions */
+int amd_iommu_register_ppr_notifier(struct notifier_block *nb)
+{
+       return atomic_notifier_chain_register(&ppr_notifier, nb);
+}
+EXPORT_SYMBOL(amd_iommu_register_ppr_notifier);
+
+int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb)
+{
+       return atomic_notifier_chain_unregister(&ppr_notifier, nb);
+}
+EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier);
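A hedged sketch of a consumer of this notifier chain, modelled on how an IOMMUv2/PASID driver might use it. Everything except the exported amd_iommu_* calls, the generic notifier API, and the fault fields filled in by iommu_handle_ppr_entry() is made up for illustration; the response code define is an assumption (0 is taken to be the PRI "Success" response).

#define EXAMPLE_PPR_RESP_SUCCESS	0	/* assumed PRI "Success" response code */

static int example_ppr_notifier(struct notifier_block *nb,
				unsigned long e, void *data)
{
	struct amd_iommu_fault *fault = data;
	struct pci_dev *pdev;

	pdev = pci_get_bus_and_slot(PCI_BUS(fault->device_id),
				    fault->device_id & 0xff);
	if (!pdev)
		return NOTIFY_DONE;

	/*
	 * A real handler would defer the actual fault handling (resolving
	 * the page for fault->pasid at fault->address) to process context,
	 * since this notifier runs from the atomic chain invoked by
	 * iommu_handle_ppr_entry().  Here we only show how the outcome is
	 * reported back to the device.
	 */
	amd_iommu_complete_ppr(pdev, fault->pasid,
			       EXAMPLE_PPR_RESP_SUCCESS, fault->tag);

	pci_dev_put(pdev);

	return NOTIFY_OK;
}

static struct notifier_block example_ppr_nb = {
	.notifier_call	= example_ppr_notifier,
};

/* ... amd_iommu_register_ppr_notifier(&example_ppr_nb); ... */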
+
+void amd_iommu_domain_direct_map(struct iommu_domain *dom)
+{
+       struct protection_domain *domain = dom->priv;
+       unsigned long flags;
+
+       spin_lock_irqsave(&domain->lock, flags);
+
+       /* Update data structure */
+       domain->mode    = PAGE_MODE_NONE;
+       domain->updated = true;
+
+       /* Make changes visible to IOMMUs */
+       update_domain(domain);
+
+       /* Page-table is not visible to IOMMU anymore, so free it */
+       free_pagetable(domain);
+
+       spin_unlock_irqrestore(&domain->lock, flags);
+}
+EXPORT_SYMBOL(amd_iommu_domain_direct_map);
+
+int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids)
+{
+       struct protection_domain *domain = dom->priv;
+       unsigned long flags;
+       int levels, ret;
+
+       if (pasids <= 0 || pasids > (PASID_MASK + 1))
+               return -EINVAL;
+
+       /* Number of GCR3 table levels required */
+       for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9)
+               levels += 1;
+
+       if (levels > amd_iommu_max_glx_val)
+               return -EINVAL;
+
+       spin_lock_irqsave(&domain->lock, flags);
+
+       /*
+        * To spare us sanity checks on whether the devices already in
+        * the domain support IOMMUv2, just require that the domain has
+        * no devices attached when it is switched into IOMMUv2 mode.
+        */
+       ret = -EBUSY;
+       if (domain->dev_cnt > 0 || domain->flags & PD_IOMMUV2_MASK)
+               goto out;
+
+       ret = -ENOMEM;
+       domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC);
+       if (domain->gcr3_tbl == NULL)
+               goto out;
+
+       domain->glx      = levels;
+       domain->flags   |= PD_IOMMUV2_MASK;
+       domain->updated  = true;
+
+       update_domain(domain);
+
+       ret = 0;
+
+out:
+       spin_unlock_irqrestore(&domain->lock, flags);
+
+       return ret;
+}
+EXPORT_SYMBOL(amd_iommu_domain_enable_v2);
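A quick worked example of the level computation above: each GCR3 table level resolves 9 PASID bits, so pasids <= 512 gives levels = 0 (a single-level table), pasids = 65536 gives levels = 1 (the loop body runs once before the remaining bits fit into 9), and the 20-bit PASID maximum gives levels = 2, matching the glx values that free_gcr3_table() knows how to free.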
+
+static int __flush_pasid(struct protection_domain *domain, int pasid,
+                        u64 address, bool size)
+{
+       struct iommu_dev_data *dev_data;
+       struct iommu_cmd cmd;
+       int i, ret;
+
+       if (!(domain->flags & PD_IOMMUV2_MASK))
+               return -EINVAL;
+
+       build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size);
+
+       /*
+        * IOMMU TLB needs to be flushed before Device TLB to
+        * prevent device TLB refill from IOMMU TLB
+        */
+       for (i = 0; i < amd_iommus_present; ++i) {
+               if (domain->dev_iommu[i] == 0)
+                       continue;
+
+               ret = iommu_queue_command(amd_iommus[i], &cmd);
+               if (ret != 0)
+                       goto out;
+       }
+
+       /* Wait until IOMMU TLB flushes are complete */
+       domain_flush_complete(domain);
+
+       /* Now flush device TLBs */
+       list_for_each_entry(dev_data, &domain->dev_list, list) {
+               struct amd_iommu *iommu;
+               int qdep;
+
+               BUG_ON(!dev_data->ats.enabled);
+
+               qdep  = dev_data->ats.qdep;
+               iommu = amd_iommu_rlookup_table[dev_data->devid];
+
+               build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid,
+                                     qdep, address, size);
+
+               ret = iommu_queue_command(iommu, &cmd);
+               if (ret != 0)
+                       goto out;
+       }
+
+       /* Wait until all device TLBs are flushed */
+       domain_flush_complete(domain);
+
+       ret = 0;
+
+out:
+
+       return ret;
+}
+
+static int __amd_iommu_flush_page(struct protection_domain *domain, int pasid,
+                                 u64 address)
+{
+       INC_STATS_COUNTER(invalidate_iotlb);
+
+       return __flush_pasid(domain, pasid, address, false);
+}
+
+int amd_iommu_flush_page(struct iommu_domain *dom, int pasid,
+                        u64 address)
+{
+       struct protection_domain *domain = dom->priv;
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&domain->lock, flags);
+       ret = __amd_iommu_flush_page(domain, pasid, address);
+       spin_unlock_irqrestore(&domain->lock, flags);
+
+       return ret;
+}
+EXPORT_SYMBOL(amd_iommu_flush_page);
+
+static int __amd_iommu_flush_tlb(struct protection_domain *domain, int pasid)
+{
+       INC_STATS_COUNTER(invalidate_iotlb_all);
+
+       return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
+                            true);
+}
+
+int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid)
+{
+       struct protection_domain *domain = dom->priv;
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&domain->lock, flags);
+       ret = __amd_iommu_flush_tlb(domain, pasid);
+       spin_unlock_irqrestore(&domain->lock, flags);
+
+       return ret;
+}
+EXPORT_SYMBOL(amd_iommu_flush_tlb);
+
+static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc)
+{
+       int index;
+       u64 *pte;
+
+       while (true) {
+
+               index = (pasid >> (9 * level)) & 0x1ff;
+               pte   = &root[index];
+
+               if (level == 0)
+                       break;
+
+               if (!(*pte & GCR3_VALID)) {
+                       if (!alloc)
+                               return NULL;
+
+                       root = (void *)get_zeroed_page(GFP_ATOMIC);
+                       if (root == NULL)
+                               return NULL;
+
+                       *pte = __pa(root) | GCR3_VALID;
+               }
+
+               root = __va(*pte & PAGE_MASK);
+
+               level -= 1;
+       }
+
+       return pte;
+}
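Worked example of the walk above: with a two-level table (glx = 1) and pasid 0x12345, level 1 uses index (0x12345 >> 9) & 0x1ff = 0x91 and level 0 uses index 0x12345 & 0x1ff = 0x145, so each level consumes 9 bits of the PASID, much like a host page-table walk.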
+
+static int __set_gcr3(struct protection_domain *domain, int pasid,
+                     unsigned long cr3)
+{
+       u64 *pte;
+
+       if (domain->mode != PAGE_MODE_NONE)
+               return -EINVAL;
+
+       pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true);
+       if (pte == NULL)
+               return -ENOMEM;
+
+       *pte = (cr3 & PAGE_MASK) | GCR3_VALID;
+
+       return __amd_iommu_flush_tlb(domain, pasid);
+}
+
+static int __clear_gcr3(struct protection_domain *domain, int pasid)
+{
+       u64 *pte;
+
+       if (domain->mode != PAGE_MODE_NONE)
+               return -EINVAL;
+
+       pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false);
+       if (pte == NULL)
+               return 0;
+
+       *pte = 0;
+
+       return __amd_iommu_flush_tlb(domain, pasid);
+}
+
+int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid,
+                             unsigned long cr3)
+{
+       struct protection_domain *domain = dom->priv;
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&domain->lock, flags);
+       ret = __set_gcr3(domain, pasid, cr3);
+       spin_unlock_irqrestore(&domain->lock, flags);
+
+       return ret;
+}
+EXPORT_SYMBOL(amd_iommu_domain_set_gcr3);
+
+int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid)
+{
+       struct protection_domain *domain = dom->priv;
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&domain->lock, flags);
+       ret = __clear_gcr3(domain, pasid);
+       spin_unlock_irqrestore(&domain->lock, flags);
+
+       return ret;
+}
+EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3);
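A hedged sketch of how a caller (for instance the IOMMUv2/PASID layer) would be expected to combine the exports above. The function itself and its error handling are illustrative; only the generic IOMMU API and the amd_iommu_* exports from this file are real, and the device is assumed to be one the AMD driver marked IOMMUv2-capable and left in the passthrough domain (see attach_device() above).

static struct iommu_domain *example_setup_v2_domain(struct pci_dev *pdev,
						    int pasid_limit,
						    int pasid,
						    unsigned long cr3)
{
	struct iommu_domain *dom;
	int ret;

	dom = iommu_domain_alloc(&pci_bus_type);
	if (!dom)
		return NULL;

	/* Drop the host page-table; translation happens per-PASID via GCR3 */
	amd_iommu_domain_direct_map(dom);

	/* Must be done while the domain still has no devices attached */
	ret = amd_iommu_domain_enable_v2(dom, pasid_limit);
	if (ret)
		goto out_free;

	ret = iommu_attach_device(dom, &pdev->dev);
	if (ret)
		goto out_free;

	/* Point one PASID at a (process) page-table root */
	ret = amd_iommu_domain_set_gcr3(dom, pasid, cr3);
	if (ret)
		goto out_detach;

	return dom;

out_detach:
	iommu_detach_device(dom, &pdev->dev);
out_free:
	iommu_domain_free(dom);
	return NULL;
}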
+
+int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid,
+                          int status, int tag)
+{
+       struct iommu_dev_data *dev_data;
+       struct amd_iommu *iommu;
+       struct iommu_cmd cmd;
+
+       INC_STATS_COUNTER(complete_ppr);
+
+       dev_data = get_dev_data(&pdev->dev);
+       iommu    = amd_iommu_rlookup_table[dev_data->devid];
+
+       build_complete_ppr(&cmd, dev_data->devid, pasid, status,
+                          tag, dev_data->pri_tlp);
+
+       return iommu_queue_command(iommu, &cmd);
+}
+EXPORT_SYMBOL(amd_iommu_complete_ppr);
+
+struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev)
+{
+       struct protection_domain *domain;
+
+       domain = get_domain(&pdev->dev);
+       if (IS_ERR(domain))
+               return NULL;
+
+       /* Only return IOMMUv2 domains */
+       if (!(domain->flags & PD_IOMMUV2_MASK))
+               return NULL;
+
+       return domain->iommu_domain;
+}
+EXPORT_SYMBOL(amd_iommu_get_v2_domain);
+
+void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum)
+{
+       struct iommu_dev_data *dev_data;
+
+       if (!amd_iommu_v2_supported())
+               return;
+
+       dev_data = get_dev_data(&pdev->dev);
+       dev_data->errata |= (1 << erratum);
+}
+EXPORT_SYMBOL(amd_iommu_enable_device_erratum);
+
+int amd_iommu_device_info(struct pci_dev *pdev,
+                          struct amd_iommu_device_info *info)
+{
+       int max_pasids;
+       int pos;
+
+       if (pdev == NULL || info == NULL)
+               return -EINVAL;
+
+       if (!amd_iommu_v2_supported())
+               return -EINVAL;
+
+       memset(info, 0, sizeof(*info));
+
+       pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS);
+       if (pos)
+               info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;
+
+       pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
+       if (pos)
+               info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;
+
+       pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID);
+       if (pos) {
+               int features;
+
+               max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1));
+               max_pasids = min(max_pasids, (1 << 20));
+
+               info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
+               info->max_pasids = min(pci_max_pasids(pdev), max_pasids);
+
+               features = pci_pasid_features(pdev);
+               if (features & PCI_PASID_CAP_EXEC)
+                       info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
+               if (features & PCI_PASID_CAP_PRIV)
+                       info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(amd_iommu_device_info);
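Finally, a small usage sketch for amd_iommu_device_info(). The caller is hypothetical; the flags and fields are the ones filled in by the function above.

static bool example_device_can_do_pasid(struct pci_dev *pdev, int *max_pasids)
{
	struct amd_iommu_device_info info;
	const unsigned required = AMD_IOMMU_DEVICE_FLAG_ATS_SUP |
				  AMD_IOMMU_DEVICE_FLAG_PRI_SUP |
				  AMD_IOMMU_DEVICE_FLAG_PASID_SUP;

	if (amd_iommu_device_info(pdev, &info))
		return false;

	/* ATS, PRI and PASID must all be supported for IOMMUv2 use */
	if ((info.flags & required) != required)
		return false;

	*max_pasids = info.max_pasids;
	return true;
}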