x86: Handle HW IOMMU initialization failure gracefully
[linux-2.6.git] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
45
#define ROOT_SIZE               VTD_PAGE_SIZE
#define CONTEXT_SIZE            VTD_PAGE_SIZE

/* Device predicates. The argument is parenthesized at every use so that
   any pointer-valued expression may be passed. */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START      (0xfee00000)
#define IOAPIC_RANGE_END        (0xfeefffff)
#define IOVA_START_ADDR         (0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define MAX_AGAW_WIDTH 64

/* Largest PFN / address representable with a guest address width of
   'gaw' bits. 'gaw' is parenthesized so compound expressions expand
   with the intended grouping. */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
#define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
#define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
73
74
75 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
76    are never going to work. */
77 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
78 {
79         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
80 }
81
82 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
83 {
84         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
85 }
/* VT-d PFN of a struct page: page -> MM pfn -> VT-d pfn. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
        unsigned long mm_pfn = page_to_pfn(pg);

        return mm_to_dma_pfn(mm_pfn);
}
/* VT-d PFN of the page backing kernel virtual address p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
        struct page *pg = virt_to_page(p);

        return page_to_dma_pfn(pg);
}
94
/* global iommu list, set NULL for ignored DMAR units; indexed by the
   iommu id found in a domain's iommu_bmp (see domain_get_iommu) */
static struct intel_iommu **g_iommus;

/* forward declaration */
static void __init check_tylersburg_isoch(void);
/* when non-zero, iommu_flush_write_buffer() flushes even if the unit's
   capability register does not report cap_rwbf() */
static int rwbf_quirk;
100
/*
 * Root table entry layout (128 bits):
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
        u64     val;    /* low 64 bits: present bit + context table pointer */
        u64     rsvd1;  /* high 64 bits: reserved */
};
/* number of root entries that fit in one VT-d page */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
112 static inline bool root_present(struct root_entry *root)
113 {
114         return (root->val & 1);
115 }
116 static inline void set_root_present(struct root_entry *root)
117 {
118         root->val |= 1;
119 }
120 static inline void set_root_value(struct root_entry *root, unsigned long value)
121 {
122         root->val |= value & VTD_PAGE_MASK;
123 }
124
125 static inline struct context_entry *
126 get_context_addr_from_root(struct root_entry *root)
127 {
128         return (struct context_entry *)
129                 (root_present(root)?phys_to_virt(
130                 root->val & VTD_PAGE_MASK) :
131                 NULL);
132 }
133
/*
 * Context entry layout.
 * low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 3-6: aval
 * 8-23: domain id
 */
struct context_entry {
        u64 lo; /* present, fault disable, translation type, address root */
        u64 hi; /* address width, aval, domain id */
};
149
150 static inline bool context_present(struct context_entry *context)
151 {
152         return (context->lo & 1);
153 }
154 static inline void context_set_present(struct context_entry *context)
155 {
156         context->lo |= 1;
157 }
158
159 static inline void context_set_fault_enable(struct context_entry *context)
160 {
161         context->lo &= (((u64)-1) << 2) | 1;
162 }
163
164 static inline void context_set_translation_type(struct context_entry *context,
165                                                 unsigned long value)
166 {
167         context->lo &= (((u64)-1) << 4) | 3;
168         context->lo |= (value & 3) << 2;
169 }
170
171 static inline void context_set_address_root(struct context_entry *context,
172                                             unsigned long value)
173 {
174         context->lo |= value & VTD_PAGE_MASK;
175 }
176
177 static inline void context_set_address_width(struct context_entry *context,
178                                              unsigned long value)
179 {
180         context->hi |= value & 7;
181 }
182
183 static inline void context_set_domain_id(struct context_entry *context,
184                                          unsigned long value)
185 {
186         context->hi |= (value & ((1 << 16) - 1)) << 8;
187 }
188
189 static inline void context_clear_entry(struct context_entry *context)
190 {
191         context->lo = 0;
192         context->hi = 0;
193 }
194
/*
 * DMA page table entry layout:
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
struct dma_pte {
        u64 val;        /* raw 64-bit entry, laid out as described above */
};
207
208 static inline void dma_clear_pte(struct dma_pte *pte)
209 {
210         pte->val = 0;
211 }
212
213 static inline void dma_set_pte_readable(struct dma_pte *pte)
214 {
215         pte->val |= DMA_PTE_READ;
216 }
217
218 static inline void dma_set_pte_writable(struct dma_pte *pte)
219 {
220         pte->val |= DMA_PTE_WRITE;
221 }
222
223 static inline void dma_set_pte_snp(struct dma_pte *pte)
224 {
225         pte->val |= DMA_PTE_SNP;
226 }
227
228 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
229 {
230         pte->val = (pte->val & ~3) | (prot & 3);
231 }
232
/* Return the page-aligned address bits of the entry (val & VTD_PAGE_MASK). */
static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
        return pte->val & VTD_PAGE_MASK;
#else
        /* Must have a full atomic 64-bit read; a compare-exchange of 0
           with 0 never changes the stored value but returns it whole. */
        return  __cmpxchg64(pte, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}
242
243 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
244 {
245         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
246 }
247
248 static inline bool dma_pte_present(struct dma_pte *pte)
249 {
250         return (pte->val & 3) != 0;
251 }
252
253 static inline int first_pte_in_page(struct dma_pte *pte)
254 {
255         return !((unsigned long)pte & ~VTD_PAGE_MASK);
256 }
257
/*
 * This domain is a static identity-mapping domain.
 *      1. This domain creates a static 1:1 mapping to all usable memory.
 *      2. It maps to each iommu if successful.
 *      3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/* devices under the same p2p bridge are owned in one domain */
#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)

/* domain represents a virtual machine; more than one device
 * across iommus may be owned in one domain, e.g. kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
277
/* Per-domain state: page table root, address widths and the set of iommus
   the domain is attached to. */
struct dmar_domain {
        int     id;                     /* domain id */
        unsigned long iommu_bmp;        /* bitmap of iommus this domain uses;
                                           bit n refers to g_iommus[n] */

        struct list_head devices;       /* all devices' list */
        struct iova_domain iovad;       /* iova's that belong to this domain */

        struct dma_pte  *pgd;           /* virtual address of the top-level page table */
        int             gaw;            /* max guest address width */

        /* adjusted guest address width, 0 is level 2 30-bit */
        int             agaw;

        int             flags;          /* DOMAIN_FLAG_* bits: type of domain */

        int             iommu_coherency;/* indicate coherency of iommu access */
        int             iommu_snooping; /* indicate snooping control feature*/
        int             iommu_count;    /* reference count of iommu */
        spinlock_t      iommu_lock;     /* protect iommu set in domain */
        u64             max_addr;       /* maximum mapped address */
};
299
/* PCI domain-device relationship: one record per device attached to a
   dmar_domain. */
struct device_domain_info {
        struct list_head link;  /* link to domain siblings */
        struct list_head global; /* link to global list */
        int segment;            /* PCI domain */
        u8 bus;                 /* PCI bus number */
        u8 devfn;               /* PCI devfn number */
        struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
        struct intel_iommu *iommu; /* IOMMU used by this device */
        struct dmar_domain *domain; /* pointer to domain */
};
311
static void flush_unmaps_timeout(unsigned long data);

/* timer whose callback is flush_unmaps_timeout() */
DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);

/* capacity of each deferred_flush_tables array */
#define HIGH_WATER_MARK 250
struct deferred_flush_tables {
        int next;       /* presumably the next free slot index -- TODO confirm against users */
        struct iova *iova[HIGH_WATER_MARK];
        struct dmar_domain *domain[HIGH_WATER_MARK];
};

static struct deferred_flush_tables *deferred_flush;

/* number of iommu slots: used as the bit count when scanning a domain's
   iommu_bmp and as the upper bound for indices into g_iommus */
static int g_num_of_iommus;

static DEFINE_SPINLOCK(async_umap_flush_lock);
static LIST_HEAD(unmaps_to_do);

static int timer_on;
static long list_size;
333
static void domain_remove_dev_info(struct dmar_domain *domain);

/* Default depends on CONFIG_DMAR_DEFAULT_ON; "intel_iommu=on|off" on the
   kernel command line overrides it (see intel_iommu_setup). */
#ifdef CONFIG_DMAR_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /*CONFIG_DMAR_DEFAULT_ON*/

/* cleared by "intel_iommu=igfx_off" */
static int __initdata dmar_map_gfx = 1;
/* set by "intel_iommu=forcedac" */
static int dmar_forcedac;
/* set by "intel_iommu=strict" */
static int intel_iommu_strict;

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

static struct iommu_ops intel_iommu_ops;
351
352 static int __init intel_iommu_setup(char *str)
353 {
354         if (!str)
355                 return -EINVAL;
356         while (*str) {
357                 if (!strncmp(str, "on", 2)) {
358                         dmar_disabled = 0;
359                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
360                 } else if (!strncmp(str, "off", 3)) {
361                         dmar_disabled = 1;
362                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
363                 } else if (!strncmp(str, "igfx_off", 8)) {
364                         dmar_map_gfx = 0;
365                         printk(KERN_INFO
366                                 "Intel-IOMMU: disable GFX device mapping\n");
367                 } else if (!strncmp(str, "forcedac", 8)) {
368                         printk(KERN_INFO
369                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
370                         dmar_forcedac = 1;
371                 } else if (!strncmp(str, "strict", 6)) {
372                         printk(KERN_INFO
373                                 "Intel-IOMMU: disable batched IOTLB flush\n");
374                         intel_iommu_strict = 1;
375                 }
376
377                 str += strcspn(str, ",");
378                 while (*str == ',')
379                         str++;
380         }
381         return 0;
382 }
383 __setup("intel_iommu=", intel_iommu_setup);
384
/* slab caches backing alloc_domain_mem(), alloc_devinfo_mem() and
   alloc_iova_mem() respectively */
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;
388
389 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
390 {
391         unsigned int flags;
392         void *vaddr;
393
394         /* trying to avoid low memory issues */
395         flags = current->flags & PF_MEMALLOC;
396         current->flags |= PF_MEMALLOC;
397         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
398         current->flags &= (~PF_MEMALLOC | flags);
399         return vaddr;
400 }
401
402
403 static inline void *alloc_pgtable_page(void)
404 {
405         unsigned int flags;
406         void *vaddr;
407
408         /* trying to avoid low memory issues */
409         flags = current->flags & PF_MEMALLOC;
410         current->flags |= PF_MEMALLOC;
411         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
412         current->flags &= (~PF_MEMALLOC | flags);
413         return vaddr;
414 }
415
/* Release a page obtained from alloc_pgtable_page(). */
static inline void free_pgtable_page(void *vaddr)
{
        unsigned long page_addr = (unsigned long)vaddr;

        free_page(page_addr);
}
420
421 static inline void *alloc_domain_mem(void)
422 {
423         return iommu_kmem_cache_alloc(iommu_domain_cache);
424 }
425
426 static void free_domain_mem(void *vaddr)
427 {
428         kmem_cache_free(iommu_domain_cache, vaddr);
429 }
430
431 static inline void * alloc_devinfo_mem(void)
432 {
433         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
434 }
435
436 static inline void free_devinfo_mem(void *vaddr)
437 {
438         kmem_cache_free(iommu_devinfo_cache, vaddr);
439 }
440
441 struct iova *alloc_iova_mem(void)
442 {
443         return iommu_kmem_cache_alloc(iommu_iova_cache);
444 }
445
446 void free_iova_mem(struct iova *iova)
447 {
448         kmem_cache_free(iommu_iova_cache, iova);
449 }
450
451
452 static inline int width_to_agaw(int width);
453
454 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
455 {
456         unsigned long sagaw;
457         int agaw = -1;
458
459         sagaw = cap_sagaw(iommu->cap);
460         for (agaw = width_to_agaw(max_gaw);
461              agaw >= 0; agaw--) {
462                 if (test_bit(agaw, &sagaw))
463                         break;
464         }
465
466         return agaw;
467 }
468
469 /*
470  * Calculate max SAGAW for each iommu.
471  */
472 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
473 {
474         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
475 }
476
477 /*
478  * calculate agaw for each iommu.
479  * "SAGAW" may be different across iommus, use a default agaw, and
480  * get a supported less agaw for iommus that don't support the default agaw.
481  */
482 int iommu_calculate_agaw(struct intel_iommu *iommu)
483 {
484         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
485 }
486
487 /* This functionin only returns single iommu in a domain */
488 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
489 {
490         int iommu_id;
491
492         /* si_domain and vm domain should not get here. */
493         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
494         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
495
496         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
497         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
498                 return NULL;
499
500         return g_iommus[iommu_id];
501 }
502
503 static void domain_update_iommu_coherency(struct dmar_domain *domain)
504 {
505         int i;
506
507         domain->iommu_coherency = 1;
508
509         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
510         for (; i < g_num_of_iommus; ) {
511                 if (!ecap_coherent(g_iommus[i]->ecap)) {
512                         domain->iommu_coherency = 0;
513                         break;
514                 }
515                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
516         }
517 }
518
519 static void domain_update_iommu_snooping(struct dmar_domain *domain)
520 {
521         int i;
522
523         domain->iommu_snooping = 1;
524
525         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
526         for (; i < g_num_of_iommus; ) {
527                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
528                         domain->iommu_snooping = 0;
529                         break;
530                 }
531                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
532         }
533 }
534
/*
 * Capabilities may differ between iommus; refresh the domain's cached
 * coherency flag first, then its snooping flag.
 */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
        domain_update_iommu_coherency(domain);

        domain_update_iommu_snooping(domain);
}
541
542 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
543 {
544         struct dmar_drhd_unit *drhd = NULL;
545         int i;
546
547         for_each_drhd_unit(drhd) {
548                 if (drhd->ignored)
549                         continue;
550                 if (segment != drhd->segment)
551                         continue;
552
553                 for (i = 0; i < drhd->devices_cnt; i++) {
554                         if (drhd->devices[i] &&
555                             drhd->devices[i]->bus->number == bus &&
556                             drhd->devices[i]->devfn == devfn)
557                                 return drhd->iommu;
558                         if (drhd->devices[i] &&
559                             drhd->devices[i]->subordinate &&
560                             drhd->devices[i]->subordinate->number <= bus &&
561                             drhd->devices[i]->subordinate->subordinate >= bus)
562                                 return drhd->iommu;
563                 }
564
565                 if (drhd->include_all)
566                         return drhd->iommu;
567         }
568
569         return NULL;
570 }
571
572 static void domain_flush_cache(struct dmar_domain *domain,
573                                void *addr, int size)
574 {
575         if (!domain->iommu_coherency)
576                 clflush_cache_range(addr, size);
577 }
578
579 /* Gets context entry for a given bus and devfn */
580 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
581                 u8 bus, u8 devfn)
582 {
583         struct root_entry *root;
584         struct context_entry *context;
585         unsigned long phy_addr;
586         unsigned long flags;
587
588         spin_lock_irqsave(&iommu->lock, flags);
589         root = &iommu->root_entry[bus];
590         context = get_context_addr_from_root(root);
591         if (!context) {
592                 context = (struct context_entry *)alloc_pgtable_page();
593                 if (!context) {
594                         spin_unlock_irqrestore(&iommu->lock, flags);
595                         return NULL;
596                 }
597                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
598                 phy_addr = virt_to_phys((void *)context);
599                 set_root_value(root, phy_addr);
600                 set_root_present(root);
601                 __iommu_flush_cache(iommu, root, sizeof(*root));
602         }
603         spin_unlock_irqrestore(&iommu->lock, flags);
604         return &context[devfn];
605 }
606
607 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
608 {
609         struct root_entry *root;
610         struct context_entry *context;
611         int ret;
612         unsigned long flags;
613
614         spin_lock_irqsave(&iommu->lock, flags);
615         root = &iommu->root_entry[bus];
616         context = get_context_addr_from_root(root);
617         if (!context) {
618                 ret = 0;
619                 goto out;
620         }
621         ret = context_present(&context[devfn]);
622 out:
623         spin_unlock_irqrestore(&iommu->lock, flags);
624         return ret;
625 }
626
627 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
628 {
629         struct root_entry *root;
630         struct context_entry *context;
631         unsigned long flags;
632
633         spin_lock_irqsave(&iommu->lock, flags);
634         root = &iommu->root_entry[bus];
635         context = get_context_addr_from_root(root);
636         if (context) {
637                 context_clear_entry(&context[devfn]);
638                 __iommu_flush_cache(iommu, &context[devfn], \
639                         sizeof(*context));
640         }
641         spin_unlock_irqrestore(&iommu->lock, flags);
642 }
643
644 static void free_context_table(struct intel_iommu *iommu)
645 {
646         struct root_entry *root;
647         int i;
648         unsigned long flags;
649         struct context_entry *context;
650
651         spin_lock_irqsave(&iommu->lock, flags);
652         if (!iommu->root_entry) {
653                 goto out;
654         }
655         for (i = 0; i < ROOT_ENTRY_NR; i++) {
656                 root = &iommu->root_entry[i];
657                 context = get_context_addr_from_root(root);
658                 if (context)
659                         free_pgtable_page(context);
660         }
661         free_pgtable_page(iommu->root_entry);
662         iommu->root_entry = NULL;
663 out:
664         spin_unlock_irqrestore(&iommu->lock, flags);
665 }
666
/* page table handling */
/* number of PFN bits consumed by each page-table level */
#define LEVEL_STRIDE            (9)
/* mask selecting one level's index out of a shifted PFN */
#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
670
/* Number of page-table levels for a given agaw (agaw 0 -> 2 levels). */
static inline int agaw_to_level(int agaw)
{
        const int base_levels = 2;

        return base_levels + agaw;
}
675
676 static inline int agaw_to_width(int agaw)
677 {
678         return 30 + agaw * LEVEL_STRIDE;
679
680 }
681
682 static inline int width_to_agaw(int width)
683 {
684         return (width - 30) / LEVEL_STRIDE;
685 }
686
687 static inline unsigned int level_to_offset_bits(int level)
688 {
689         return (level - 1) * LEVEL_STRIDE;
690 }
691
692 static inline int pfn_level_offset(unsigned long pfn, int level)
693 {
694         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
695 }
696
/* Mask that clears the PFN bits below 'level'. */
static inline unsigned long level_mask(int level)
{
        return ~0UL << level_to_offset_bits(level);
}
701
/* Number of PFNs covered by a single entry at 'level'. */
static inline unsigned long level_size(int level)
{
        unsigned int shift = level_to_offset_bits(level);

        return 1UL << shift;
}
706
/* Round 'pfn' up to the next multiple of level_size(level). */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
        unsigned long bumped = pfn + level_size(level) - 1;

        return bumped & level_mask(level);
}
711
/*
 * Walk the domain's page table from the top level down to level 1 and
 * return the level-1 PTE for 'pfn', allocating any missing intermediate
 * tables on the way. Returns NULL if a table page cannot be allocated.
 *
 * Missing tables are installed with cmpxchg64() rather than under a lock,
 * so a concurrent walker may win the race; the loser frees its page and
 * continues with the winner's table.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
                                      unsigned long pfn)
{
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
        struct dma_pte *parent, *pte = NULL;
        int level = agaw_to_level(domain->agaw);
        int offset;

        BUG_ON(!domain->pgd);
        /* pfn must fit within the domain's address width */
        BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
        parent = domain->pgd;

        while (level > 0) {
                void *tmp_page;

                offset = pfn_level_offset(pfn, level);
                pte = &parent[offset];
                /* level 1 holds the leaf entry we are looking for */
                if (level == 1)
                        break;

                if (!dma_pte_present(pte)) {
                        uint64_t pteval;

                        tmp_page = alloc_pgtable_page();

                        if (!tmp_page)
                                return NULL;

                        /* flush the zeroed table before it becomes reachable */
                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
                        if (cmpxchg64(&pte->val, 0ULL, pteval)) {
                                /* Someone else set it while we were thinking; use theirs. */
                                free_pgtable_page(tmp_page);
                        } else {
                                /* NOTE(review): return value is discarded here;
                                   confirm whether this call is still needed */
                                dma_pte_addr(pte);
                                domain_flush_cache(domain, pte, sizeof(*pte));
                        }
                }
                /* descend into the next-level table */
                parent = phys_to_virt(dma_pte_addr(pte));
                level--;
        }

        return pte;
}
756
757 /* return address's pte at specific level */
758 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
759                                          unsigned long pfn,
760                                          int level)
761 {
762         struct dma_pte *parent, *pte = NULL;
763         int total = agaw_to_level(domain->agaw);
764         int offset;
765
766         parent = domain->pgd;
767         while (level <= total) {
768                 offset = pfn_level_offset(pfn, total);
769                 pte = &parent[offset];
770                 if (level == total)
771                         return pte;
772
773                 if (!dma_pte_present(pte))
774                         break;
775                 parent = phys_to_virt(dma_pte_addr(pte));
776                 total--;
777         }
778         return NULL;
779 }
780
/*
 * Clear the last-level PTEs for [start_pfn, last_pfn]; a TLB flush should
 * follow. Intermediate page-table pages are left in place.
 */
static void dma_pte_clear_range(struct dmar_domain *domain,
                                unsigned long start_pfn,
                                unsigned long last_pfn)
{
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
        struct dma_pte *first_pte, *pte;

        /* both ends must fit the domain's address width and be ordered */
        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
        BUG_ON(start_pfn > last_pfn);

        /* we don't need lock here; nobody else touches the iova range */
        do {
                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
                if (!pte) {
                        /* no leaf table here: skip to the next level-2 boundary */
                        start_pfn = align_to_level(start_pfn + 1, 2);
                        continue;
                }
                /* clear entries until the range or this table page ends */
                do {
                        dma_clear_pte(pte);
                        start_pfn++;
                        pte++;
                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));

                /* one flush for the whole run of cleared entries */
                domain_flush_cache(domain, first_pte,
                                   (void *)pte - (void *)first_pte);

        /* the start_pfn test stops the loop if start_pfn wrapped to 0 */
        } while (start_pfn && start_pfn <= last_pfn);
}
811
/*
 * Free page table pages for [start_pfn, last_pfn]; last-level PTEs should
 * already be cleared. Works upwards from level 2: at each level only
 * entries whose whole span lies inside the range have their table freed.
 * The pgd itself is freed only when the range covers the entire domain.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
                                   unsigned long start_pfn,
                                   unsigned long last_pfn)
{
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
        struct dma_pte *first_pte, *pte;
        int total = agaw_to_level(domain->agaw);
        int level;
        unsigned long tmp;

        /* both ends must fit the domain's address width and be ordered */
        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
        BUG_ON(start_pfn > last_pfn);

        /* We don't need lock here; nobody else touches the iova range */
        level = 2;
        while (level <= total) {
                /* first PFN in the range aligned to this level's entry size */
                tmp = align_to_level(start_pfn, level);

                /* If we can't even clear one PTE at this level, we're done */
                if (tmp + level_size(level) - 1 > last_pfn)
                        return;

                do {
                        first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
                        if (!pte) {
                                /* nothing mapped here: skip to the next
                                   boundary of the level above */
                                tmp = align_to_level(tmp + 1, level + 1);
                                continue;
                        }
                        do {
                                if (dma_pte_present(pte)) {
                                        /* free the lower-level table this entry points to */
                                        free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
                                        dma_clear_pte(pte);
                                }
                                pte++;
                                tmp += level_size(level);
                        } while (!first_pte_in_page(pte) &&
                                 tmp + level_size(level) - 1 <= last_pfn);

                        /* one flush for the whole run of cleared entries */
                        domain_flush_cache(domain, first_pte,
                                           (void *)pte - (void *)first_pte);

                /* the tmp test stops the loop if tmp wrapped to 0 */
                } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
                level++;
        }
        /* free pgd */
        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
                free_pgtable_page(domain->pgd);
                domain->pgd = NULL;
        }
}
864
865 /* iommu handling */
866 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
867 {
868         struct root_entry *root;
869         unsigned long flags;
870
871         root = (struct root_entry *)alloc_pgtable_page();
872         if (!root)
873                 return -ENOMEM;
874
875         __iommu_flush_cache(iommu, root, ROOT_SIZE);
876
877         spin_lock_irqsave(&iommu->lock, flags);
878         iommu->root_entry = root;
879         spin_unlock_irqrestore(&iommu->lock, flags);
880
881         return 0;
882 }
883
884 static void iommu_set_root_entry(struct intel_iommu *iommu)
885 {
886         void *addr;
887         u32 sts;
888         unsigned long flag;
889
890         addr = iommu->root_entry;
891
892         spin_lock_irqsave(&iommu->register_lock, flag);
893         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
894
895         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
896
897         /* Make sure hardware complete it */
898         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
899                       readl, (sts & DMA_GSTS_RTPS), sts);
900
901         spin_unlock_irqrestore(&iommu->register_lock, flag);
902 }
903
904 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
905 {
906         u32 val;
907         unsigned long flag;
908
909         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
910                 return;
911
912         spin_lock_irqsave(&iommu->register_lock, flag);
913         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
914
915         /* Make sure hardware complete it */
916         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
917                       readl, (!(val & DMA_GSTS_WBFS)), val);
918
919         spin_unlock_irqrestore(&iommu->register_lock, flag);
920 }
921
/*
 * Invalidate the context cache and wait for completion.
 *
 * 'type' selects the granularity: DMA_CCMD_GLOBAL_INVL,
 * DMA_CCMD_DOMAIN_INVL (uses 'did') or DMA_CCMD_DEVICE_INVL (uses 'did',
 * 'source_id' and 'function_mask'). Any other value hits BUG().
 * The function returns nothing.
 */
static void __iommu_flush_context(struct intel_iommu *iommu,
                                  u16 did, u16 source_id, u8 function_mask,
                                  u64 type)
{
        u64 val = 0;
        unsigned long flag;

        /* build the command word for the requested granularity */
        switch (type) {
        case DMA_CCMD_GLOBAL_INVL:
                val = DMA_CCMD_GLOBAL_INVL;
                break;
        case DMA_CCMD_DOMAIN_INVL:
                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
                break;
        case DMA_CCMD_DEVICE_INVL:
                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
                break;
        default:
                BUG();
        }
        /* ICC is set in the command and polled below until it reads back clear */
        val |= DMA_CCMD_ICC;

        spin_lock_irqsave(&iommu->register_lock, flag);
        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

        /* Make sure hardware complete it; 'val' is reused for the readback */
        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
                dmar_readq, (!(val & DMA_CCMD_ICC)), val);

        spin_unlock_irqrestore(&iommu->register_lock, flag);
}
955
956 /* return value determine if we need a write buffer flush */
957 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
958                                 u64 addr, unsigned int size_order, u64 type)
959 {
960         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
961         u64 val = 0, val_iva = 0;
962         unsigned long flag;
963
964         switch (type) {
965         case DMA_TLB_GLOBAL_FLUSH:
966                 /* global flush doesn't need set IVA_REG */
967                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
968                 break;
969         case DMA_TLB_DSI_FLUSH:
970                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
971                 break;
972         case DMA_TLB_PSI_FLUSH:
973                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
974                 /* Note: always flush non-leaf currently */
975                 val_iva = size_order | addr;
976                 break;
977         default:
978                 BUG();
979         }
980         /* Note: set drain read/write */
981 #if 0
982         /*
983          * This is probably to be super secure.. Looks like we can
984          * ignore it without any impact.
985          */
986         if (cap_read_drain(iommu->cap))
987                 val |= DMA_TLB_READ_DRAIN;
988 #endif
989         if (cap_write_drain(iommu->cap))
990                 val |= DMA_TLB_WRITE_DRAIN;
991
992         spin_lock_irqsave(&iommu->register_lock, flag);
993         /* Note: Only uses first TLB reg currently */
994         if (val_iva)
995                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
996         dmar_writeq(iommu->reg + tlb_offset + 8, val);
997
998         /* Make sure hardware complete it */
999         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1000                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1001
1002         spin_unlock_irqrestore(&iommu->register_lock, flag);
1003
1004         /* check IOTLB invalidation granularity */
1005         if (DMA_TLB_IAIG(val) == 0)
1006                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1007         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1008                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1009                         (unsigned long long)DMA_TLB_IIRG(type),
1010                         (unsigned long long)DMA_TLB_IAIG(val));
1011 }
1012
1013 static struct device_domain_info *iommu_support_dev_iotlb(
1014         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1015 {
1016         int found = 0;
1017         unsigned long flags;
1018         struct device_domain_info *info;
1019         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1020
1021         if (!ecap_dev_iotlb_support(iommu->ecap))
1022                 return NULL;
1023
1024         if (!iommu->qi)
1025                 return NULL;
1026
1027         spin_lock_irqsave(&device_domain_lock, flags);
1028         list_for_each_entry(info, &domain->devices, link)
1029                 if (info->bus == bus && info->devfn == devfn) {
1030                         found = 1;
1031                         break;
1032                 }
1033         spin_unlock_irqrestore(&device_domain_lock, flags);
1034
1035         if (!found || !info->dev)
1036                 return NULL;
1037
1038         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1039                 return NULL;
1040
1041         if (!dmar_find_matched_atsr_unit(info->dev))
1042                 return NULL;
1043
1044         info->iommu = iommu;
1045
1046         return info;
1047 }
1048
1049 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1050 {
1051         if (!info)
1052                 return;
1053
1054         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1055 }
1056
1057 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1058 {
1059         if (!info->dev || !pci_ats_enabled(info->dev))
1060                 return;
1061
1062         pci_disable_ats(info->dev);
1063 }
1064
1065 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1066                                   u64 addr, unsigned mask)
1067 {
1068         u16 sid, qdep;
1069         unsigned long flags;
1070         struct device_domain_info *info;
1071
1072         spin_lock_irqsave(&device_domain_lock, flags);
1073         list_for_each_entry(info, &domain->devices, link) {
1074                 if (!info->dev || !pci_ats_enabled(info->dev))
1075                         continue;
1076
1077                 sid = info->bus << 8 | info->devfn;
1078                 qdep = pci_ats_queue_depth(info->dev);
1079                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1080         }
1081         spin_unlock_irqrestore(&device_domain_lock, flags);
1082 }
1083
1084 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1085                                   unsigned long pfn, unsigned int pages)
1086 {
1087         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1088         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1089
1090         BUG_ON(pages == 0);
1091
1092         /*
1093          * Fallback to domain selective flush if no PSI support or the size is
1094          * too big.
1095          * PSI requires page size to be 2 ^ x, and the base address is naturally
1096          * aligned to the size
1097          */
1098         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1099                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1100                                                 DMA_TLB_DSI_FLUSH);
1101         else
1102                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1103                                                 DMA_TLB_PSI_FLUSH);
1104
1105         /*
1106          * In caching mode, domain ID 0 is reserved for non-present to present
1107          * mapping flush. Device IOTLB doesn't need to be flushed in this case.
1108          */
1109         if (!cap_caching_mode(iommu->cap) || did)
1110                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1111 }
1112
1113 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1114 {
1115         u32 pmen;
1116         unsigned long flags;
1117
1118         spin_lock_irqsave(&iommu->register_lock, flags);
1119         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1120         pmen &= ~DMA_PMEN_EPM;
1121         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1122
1123         /* wait for the protected region status bit to clear */
1124         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1125                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1126
1127         spin_unlock_irqrestore(&iommu->register_lock, flags);
1128 }
1129
1130 static int iommu_enable_translation(struct intel_iommu *iommu)
1131 {
1132         u32 sts;
1133         unsigned long flags;
1134
1135         spin_lock_irqsave(&iommu->register_lock, flags);
1136         iommu->gcmd |= DMA_GCMD_TE;
1137         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1138
1139         /* Make sure hardware complete it */
1140         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1141                       readl, (sts & DMA_GSTS_TES), sts);
1142
1143         spin_unlock_irqrestore(&iommu->register_lock, flags);
1144         return 0;
1145 }
1146
1147 static int iommu_disable_translation(struct intel_iommu *iommu)
1148 {
1149         u32 sts;
1150         unsigned long flag;
1151
1152         spin_lock_irqsave(&iommu->register_lock, flag);
1153         iommu->gcmd &= ~DMA_GCMD_TE;
1154         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1155
1156         /* Make sure hardware complete it */
1157         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1158                       readl, (!(sts & DMA_GSTS_TES)), sts);
1159
1160         spin_unlock_irqrestore(&iommu->register_lock, flag);
1161         return 0;
1162 }
1163
1164
1165 static int iommu_init_domains(struct intel_iommu *iommu)
1166 {
1167         unsigned long ndomains;
1168         unsigned long nlongs;
1169
1170         ndomains = cap_ndoms(iommu->cap);
1171         pr_debug("Number of Domains supportd <%ld>\n", ndomains);
1172         nlongs = BITS_TO_LONGS(ndomains);
1173
1174         spin_lock_init(&iommu->lock);
1175
1176         /* TBD: there might be 64K domains,
1177          * consider other allocation for future chip
1178          */
1179         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1180         if (!iommu->domain_ids) {
1181                 printk(KERN_ERR "Allocating domain id array failed\n");
1182                 return -ENOMEM;
1183         }
1184         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1185                         GFP_KERNEL);
1186         if (!iommu->domains) {
1187                 printk(KERN_ERR "Allocating domain array failed\n");
1188                 return -ENOMEM;
1189         }
1190
1191         /*
1192          * if Caching mode is set, then invalid translations are tagged
1193          * with domainid 0. Hence we need to pre-allocate it.
1194          */
1195         if (cap_caching_mode(iommu->cap))
1196                 set_bit(0, iommu->domain_ids);
1197         return 0;
1198 }
1199
1200
1201 static void domain_exit(struct dmar_domain *domain);
1202 static void vm_domain_exit(struct dmar_domain *domain);
1203
1204 void free_dmar_iommu(struct intel_iommu *iommu)
1205 {
1206         struct dmar_domain *domain;
1207         int i;
1208         unsigned long flags;
1209
1210         if ((iommu->domains) && (iommu->domain_ids)) {
1211                 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1212                 for (; i < cap_ndoms(iommu->cap); ) {
1213                         domain = iommu->domains[i];
1214                         clear_bit(i, iommu->domain_ids);
1215
1216                         spin_lock_irqsave(&domain->iommu_lock, flags);
1217                         if (--domain->iommu_count == 0) {
1218                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1219                                         vm_domain_exit(domain);
1220                                 else
1221                                         domain_exit(domain);
1222                         }
1223                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1224
1225                         i = find_next_bit(iommu->domain_ids,
1226                                 cap_ndoms(iommu->cap), i+1);
1227                 }
1228         }
1229
1230         if (iommu->gcmd & DMA_GCMD_TE)
1231                 iommu_disable_translation(iommu);
1232
1233         if (iommu->irq) {
1234                 set_irq_data(iommu->irq, NULL);
1235                 /* This will mask the irq */
1236                 free_irq(iommu->irq, iommu);
1237                 destroy_irq(iommu->irq);
1238         }
1239
1240         kfree(iommu->domains);
1241         kfree(iommu->domain_ids);
1242
1243         g_iommus[iommu->seq_id] = NULL;
1244
1245         /* if all iommus are freed, free g_iommus */
1246         for (i = 0; i < g_num_of_iommus; i++) {
1247                 if (g_iommus[i])
1248                         break;
1249         }
1250
1251         if (i == g_num_of_iommus)
1252                 kfree(g_iommus);
1253
1254         /* free context mapping */
1255         free_context_table(iommu);
1256 }
1257
1258 static struct dmar_domain *alloc_domain(void)
1259 {
1260         struct dmar_domain *domain;
1261
1262         domain = alloc_domain_mem();
1263         if (!domain)
1264                 return NULL;
1265
1266         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1267         domain->flags = 0;
1268
1269         return domain;
1270 }
1271
1272 static int iommu_attach_domain(struct dmar_domain *domain,
1273                                struct intel_iommu *iommu)
1274 {
1275         int num;
1276         unsigned long ndomains;
1277         unsigned long flags;
1278
1279         ndomains = cap_ndoms(iommu->cap);
1280
1281         spin_lock_irqsave(&iommu->lock, flags);
1282
1283         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1284         if (num >= ndomains) {
1285                 spin_unlock_irqrestore(&iommu->lock, flags);
1286                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1287                 return -ENOMEM;
1288         }
1289
1290         domain->id = num;
1291         set_bit(num, iommu->domain_ids);
1292         set_bit(iommu->seq_id, &domain->iommu_bmp);
1293         iommu->domains[num] = domain;
1294         spin_unlock_irqrestore(&iommu->lock, flags);
1295
1296         return 0;
1297 }
1298
1299 static void iommu_detach_domain(struct dmar_domain *domain,
1300                                 struct intel_iommu *iommu)
1301 {
1302         unsigned long flags;
1303         int num, ndomains;
1304         int found = 0;
1305
1306         spin_lock_irqsave(&iommu->lock, flags);
1307         ndomains = cap_ndoms(iommu->cap);
1308         num = find_first_bit(iommu->domain_ids, ndomains);
1309         for (; num < ndomains; ) {
1310                 if (iommu->domains[num] == domain) {
1311                         found = 1;
1312                         break;
1313                 }
1314                 num = find_next_bit(iommu->domain_ids,
1315                                     cap_ndoms(iommu->cap), num+1);
1316         }
1317
1318         if (found) {
1319                 clear_bit(num, iommu->domain_ids);
1320                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1321                 iommu->domains[num] = NULL;
1322         }
1323         spin_unlock_irqrestore(&iommu->lock, flags);
1324 }
1325
1326 static struct iova_domain reserved_iova_list;
1327 static struct lock_class_key reserved_rbtree_key;
1328
1329 static void dmar_init_reserved_ranges(void)
1330 {
1331         struct pci_dev *pdev = NULL;
1332         struct iova *iova;
1333         int i;
1334
1335         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1336
1337         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1338                 &reserved_rbtree_key);
1339
1340         /* IOAPIC ranges shouldn't be accessed by DMA */
1341         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1342                 IOVA_PFN(IOAPIC_RANGE_END));
1343         if (!iova)
1344                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1345
1346         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1347         for_each_pci_dev(pdev) {
1348                 struct resource *r;
1349
1350                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1351                         r = &pdev->resource[i];
1352                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1353                                 continue;
1354                         iova = reserve_iova(&reserved_iova_list,
1355                                             IOVA_PFN(r->start),
1356                                             IOVA_PFN(r->end));
1357                         if (!iova)
1358                                 printk(KERN_ERR "Reserve iova failed\n");
1359                 }
1360         }
1361
1362 }
1363
1364 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1365 {
1366         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1367 }
1368
/*
 * Round the guest address width @gaw up to the next value of the form
 * 12 + 9 * n, with the result capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = gaw;

	if (rem != 0)
		agaw += 9 - rem;

	return agaw > 64 ? 64 : agaw;
}
1382
1383 static int domain_init(struct dmar_domain *domain, int guest_width)
1384 {
1385         struct intel_iommu *iommu;
1386         int adjust_width, agaw;
1387         unsigned long sagaw;
1388
1389         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1390         spin_lock_init(&domain->iommu_lock);
1391
1392         domain_reserve_special_ranges(domain);
1393
1394         /* calculate AGAW */
1395         iommu = domain_get_iommu(domain);
1396         if (guest_width > cap_mgaw(iommu->cap))
1397                 guest_width = cap_mgaw(iommu->cap);
1398         domain->gaw = guest_width;
1399         adjust_width = guestwidth_to_adjustwidth(guest_width);
1400         agaw = width_to_agaw(adjust_width);
1401         sagaw = cap_sagaw(iommu->cap);
1402         if (!test_bit(agaw, &sagaw)) {
1403                 /* hardware doesn't support it, choose a bigger one */
1404                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1405                 agaw = find_next_bit(&sagaw, 5, agaw);
1406                 if (agaw >= 5)
1407                         return -ENODEV;
1408         }
1409         domain->agaw = agaw;
1410         INIT_LIST_HEAD(&domain->devices);
1411
1412         if (ecap_coherent(iommu->ecap))
1413                 domain->iommu_coherency = 1;
1414         else
1415                 domain->iommu_coherency = 0;
1416
1417         if (ecap_sc_support(iommu->ecap))
1418                 domain->iommu_snooping = 1;
1419         else
1420                 domain->iommu_snooping = 0;
1421
1422         domain->iommu_count = 1;
1423
1424         /* always allocate the top pgd */
1425         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1426         if (!domain->pgd)
1427                 return -ENOMEM;
1428         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1429         return 0;
1430 }
1431
1432 static void domain_exit(struct dmar_domain *domain)
1433 {
1434         struct dmar_drhd_unit *drhd;
1435         struct intel_iommu *iommu;
1436
1437         /* Domain 0 is reserved, so dont process it */
1438         if (!domain)
1439                 return;
1440
1441         domain_remove_dev_info(domain);
1442         /* destroy iovas */
1443         put_iova_domain(&domain->iovad);
1444
1445         /* clear ptes */
1446         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1447
1448         /* free page tables */
1449         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1450
1451         for_each_active_iommu(iommu, drhd)
1452                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1453                         iommu_detach_domain(domain, iommu);
1454
1455         free_domain_mem(domain);
1456 }
1457
1458 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1459                                  u8 bus, u8 devfn, int translation)
1460 {
1461         struct context_entry *context;
1462         unsigned long flags;
1463         struct intel_iommu *iommu;
1464         struct dma_pte *pgd;
1465         unsigned long num;
1466         unsigned long ndomains;
1467         int id;
1468         int agaw;
1469         struct device_domain_info *info = NULL;
1470
1471         pr_debug("Set context mapping for %02x:%02x.%d\n",
1472                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1473
1474         BUG_ON(!domain->pgd);
1475         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1476                translation != CONTEXT_TT_MULTI_LEVEL);
1477
1478         iommu = device_to_iommu(segment, bus, devfn);
1479         if (!iommu)
1480                 return -ENODEV;
1481
1482         context = device_to_context_entry(iommu, bus, devfn);
1483         if (!context)
1484                 return -ENOMEM;
1485         spin_lock_irqsave(&iommu->lock, flags);
1486         if (context_present(context)) {
1487                 spin_unlock_irqrestore(&iommu->lock, flags);
1488                 return 0;
1489         }
1490
1491         id = domain->id;
1492         pgd = domain->pgd;
1493
1494         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1495             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1496                 int found = 0;
1497
1498                 /* find an available domain id for this device in iommu */
1499                 ndomains = cap_ndoms(iommu->cap);
1500                 num = find_first_bit(iommu->domain_ids, ndomains);
1501                 for (; num < ndomains; ) {
1502                         if (iommu->domains[num] == domain) {
1503                                 id = num;
1504                                 found = 1;
1505                                 break;
1506                         }
1507                         num = find_next_bit(iommu->domain_ids,
1508                                             cap_ndoms(iommu->cap), num+1);
1509                 }
1510
1511                 if (found == 0) {
1512                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1513                         if (num >= ndomains) {
1514                                 spin_unlock_irqrestore(&iommu->lock, flags);
1515                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1516                                 return -EFAULT;
1517                         }
1518
1519                         set_bit(num, iommu->domain_ids);
1520                         iommu->domains[num] = domain;
1521                         id = num;
1522                 }
1523
1524                 /* Skip top levels of page tables for
1525                  * iommu which has less agaw than default.
1526                  */
1527                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1528                         pgd = phys_to_virt(dma_pte_addr(pgd));
1529                         if (!dma_pte_present(pgd)) {
1530                                 spin_unlock_irqrestore(&iommu->lock, flags);
1531                                 return -ENOMEM;
1532                         }
1533                 }
1534         }
1535
1536         context_set_domain_id(context, id);
1537
1538         if (translation != CONTEXT_TT_PASS_THROUGH) {
1539                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1540                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1541                                      CONTEXT_TT_MULTI_LEVEL;
1542         }
1543         /*
1544          * In pass through mode, AW must be programmed to indicate the largest
1545          * AGAW value supported by hardware. And ASR is ignored by hardware.
1546          */
1547         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1548                 context_set_address_width(context, iommu->msagaw);
1549         else {
1550                 context_set_address_root(context, virt_to_phys(pgd));
1551                 context_set_address_width(context, iommu->agaw);
1552         }
1553
1554         context_set_translation_type(context, translation);
1555         context_set_fault_enable(context);
1556         context_set_present(context);
1557         domain_flush_cache(domain, context, sizeof(*context));
1558
1559         /*
1560          * It's a non-present to present mapping. If hardware doesn't cache
1561          * non-present entry we only need to flush the write-buffer. If the
1562          * _does_ cache non-present entries, then it does so in the special
1563          * domain #0, which we have to flush:
1564          */
1565         if (cap_caching_mode(iommu->cap)) {
1566                 iommu->flush.flush_context(iommu, 0,
1567                                            (((u16)bus) << 8) | devfn,
1568                                            DMA_CCMD_MASK_NOBIT,
1569                                            DMA_CCMD_DEVICE_INVL);
1570                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH);
1571         } else {
1572                 iommu_flush_write_buffer(iommu);
1573         }
1574         iommu_enable_dev_iotlb(info);
1575         spin_unlock_irqrestore(&iommu->lock, flags);
1576
1577         spin_lock_irqsave(&domain->iommu_lock, flags);
1578         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1579                 domain->iommu_count++;
1580                 domain_update_iommu_cap(domain);
1581         }
1582         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1583         return 0;
1584 }
1585
1586 static int
1587 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1588                         int translation)
1589 {
1590         int ret;
1591         struct pci_dev *tmp, *parent;
1592
1593         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1594                                          pdev->bus->number, pdev->devfn,
1595                                          translation);
1596         if (ret)
1597                 return ret;
1598
1599         /* dependent device mapping */
1600         tmp = pci_find_upstream_pcie_bridge(pdev);
1601         if (!tmp)
1602                 return 0;
1603         /* Secondary interface's bus number and devfn 0 */
1604         parent = pdev->bus->self;
1605         while (parent != tmp) {
1606                 ret = domain_context_mapping_one(domain,
1607                                                  pci_domain_nr(parent->bus),
1608                                                  parent->bus->number,
1609                                                  parent->devfn, translation);
1610                 if (ret)
1611                         return ret;
1612                 parent = parent->bus->self;
1613         }
1614         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1615                 return domain_context_mapping_one(domain,
1616                                         pci_domain_nr(tmp->subordinate),
1617                                         tmp->subordinate->number, 0,
1618                                         translation);
1619         else /* this is a legacy PCI bridge */
1620                 return domain_context_mapping_one(domain,
1621                                                   pci_domain_nr(tmp->bus),
1622                                                   tmp->bus->number,
1623                                                   tmp->devfn,
1624                                                   translation);
1625 }
1626
1627 static int domain_context_mapped(struct pci_dev *pdev)
1628 {
1629         int ret;
1630         struct pci_dev *tmp, *parent;
1631         struct intel_iommu *iommu;
1632
1633         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1634                                 pdev->devfn);
1635         if (!iommu)
1636                 return -ENODEV;
1637
1638         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1639         if (!ret)
1640                 return ret;
1641         /* dependent device mapping */
1642         tmp = pci_find_upstream_pcie_bridge(pdev);
1643         if (!tmp)
1644                 return ret;
1645         /* Secondary interface's bus number and devfn 0 */
1646         parent = pdev->bus->self;
1647         while (parent != tmp) {
1648                 ret = device_context_mapped(iommu, parent->bus->number,
1649                                             parent->devfn);
1650                 if (!ret)
1651                         return ret;
1652                 parent = parent->bus->self;
1653         }
1654         if (tmp->is_pcie)
1655                 return device_context_mapped(iommu, tmp->subordinate->number,
1656                                              0);
1657         else
1658                 return device_context_mapped(iommu, tmp->bus->number,
1659                                              tmp->devfn);
1660 }
1661
1662 /* Returns a number of VTD pages, but aligned to MM page size */
1663 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1664                                             size_t size)
1665 {
1666         host_addr &= ~PAGE_MASK;
1667         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1668 }
1669
1670 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1671                             struct scatterlist *sg, unsigned long phys_pfn,
1672                             unsigned long nr_pages, int prot)
1673 {
1674         struct dma_pte *first_pte = NULL, *pte = NULL;
1675         phys_addr_t uninitialized_var(pteval);
1676         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1677         unsigned long sg_res;
1678
1679         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1680
1681         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1682                 return -EINVAL;
1683
1684         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1685
1686         if (sg)
1687                 sg_res = 0;
1688         else {
1689                 sg_res = nr_pages + 1;
1690                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1691         }
1692
1693         while (nr_pages--) {
1694                 uint64_t tmp;
1695
1696                 if (!sg_res) {
1697                         sg_res = aligned_nrpages(sg->offset, sg->length);
1698                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1699                         sg->dma_length = sg->length;
1700                         pteval = page_to_phys(sg_page(sg)) | prot;
1701                 }
1702                 if (!pte) {
1703                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1704                         if (!pte)
1705                                 return -ENOMEM;
1706                 }
1707                 /* We don't need lock here, nobody else
1708                  * touches the iova range
1709                  */
1710                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1711                 if (tmp) {
1712                         static int dumps = 5;
1713                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1714                                iov_pfn, tmp, (unsigned long long)pteval);
1715                         if (dumps) {
1716                                 dumps--;
1717                                 debug_dma_dump_mappings(NULL);
1718                         }
1719                         WARN_ON(1);
1720                 }
1721                 pte++;
1722                 if (!nr_pages || first_pte_in_page(pte)) {
1723                         domain_flush_cache(domain, first_pte,
1724                                            (void *)pte - (void *)first_pte);
1725                         pte = NULL;
1726                 }
1727                 iov_pfn++;
1728                 pteval += VTD_PAGE_SIZE;
1729                 sg_res--;
1730                 if (!sg_res)
1731                         sg = sg_next(sg);
1732         }
1733         return 0;
1734 }
1735
/*
 * Map the scatterlist @sg into @domain at IOVA page frame @iov_pfn,
 * covering @nr_pages pages with protection @prot.  Thin wrapper around
 * __domain_mapping() with no physical page frame given.
 */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	const unsigned long no_phys_pfn = 0;

	return __domain_mapping(domain, iov_pfn, sg, no_phys_pfn, nr_pages,
				prot);
}
1742
1743 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1744                                      unsigned long phys_pfn, unsigned long nr_pages,
1745                                      int prot)
1746 {
1747         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1748 }
1749
1750 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1751 {
1752         if (!iommu)
1753                 return;
1754
1755         clear_context_table(iommu, bus, devfn);
1756         iommu->flush.flush_context(iommu, 0, 0, 0,
1757                                            DMA_CCMD_GLOBAL_INVL);
1758         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1759 }
1760
1761 static void domain_remove_dev_info(struct dmar_domain *domain)
1762 {
1763         struct device_domain_info *info;
1764         unsigned long flags;
1765         struct intel_iommu *iommu;
1766
1767         spin_lock_irqsave(&device_domain_lock, flags);
1768         while (!list_empty(&domain->devices)) {
1769                 info = list_entry(domain->devices.next,
1770                         struct device_domain_info, link);
1771                 list_del(&info->link);
1772                 list_del(&info->global);
1773                 if (info->dev)
1774                         info->dev->dev.archdata.iommu = NULL;
1775                 spin_unlock_irqrestore(&device_domain_lock, flags);
1776
1777                 iommu_disable_dev_iotlb(info);
1778                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1779                 iommu_detach_dev(iommu, info->bus, info->devfn);
1780                 free_devinfo_mem(info);
1781
1782                 spin_lock_irqsave(&device_domain_lock, flags);
1783         }
1784         spin_unlock_irqrestore(&device_domain_lock, flags);
1785 }
1786
1787 /*
1788  * find_domain
1789  * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1790  */
1791 static struct dmar_domain *
1792 find_domain(struct pci_dev *pdev)
1793 {
1794         struct device_domain_info *info;
1795
1796         /* No lock here, assumes no domain exit in normal case */
1797         info = pdev->dev.archdata.iommu;
1798         if (info)
1799                 return info->domain;
1800         return NULL;
1801 }
1802
1803 /* domain is initialized */
1804 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1805 {
1806         struct dmar_domain *domain, *found = NULL;
1807         struct intel_iommu *iommu;
1808         struct dmar_drhd_unit *drhd;
1809         struct device_domain_info *info, *tmp;
1810         struct pci_dev *dev_tmp;
1811         unsigned long flags;
1812         int bus = 0, devfn = 0;
1813         int segment;
1814         int ret;
1815
1816         domain = find_domain(pdev);
1817         if (domain)
1818                 return domain;
1819
1820         segment = pci_domain_nr(pdev->bus);
1821
1822         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1823         if (dev_tmp) {
1824                 if (dev_tmp->is_pcie) {
1825                         bus = dev_tmp->subordinate->number;
1826                         devfn = 0;
1827                 } else {
1828                         bus = dev_tmp->bus->number;
1829                         devfn = dev_tmp->devfn;
1830                 }
1831                 spin_lock_irqsave(&device_domain_lock, flags);
1832                 list_for_each_entry(info, &device_domain_list, global) {
1833                         if (info->segment == segment &&
1834                             info->bus == bus && info->devfn == devfn) {
1835                                 found = info->domain;
1836                                 break;
1837                         }
1838                 }
1839                 spin_unlock_irqrestore(&device_domain_lock, flags);
1840                 /* pcie-pci bridge already has a domain, uses it */
1841                 if (found) {
1842                         domain = found;
1843                         goto found_domain;
1844                 }
1845         }
1846
1847         domain = alloc_domain();
1848         if (!domain)
1849                 goto error;
1850
1851         /* Allocate new domain for the device */
1852         drhd = dmar_find_matched_drhd_unit(pdev);
1853         if (!drhd) {
1854                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1855                         pci_name(pdev));
1856                 return NULL;
1857         }
1858         iommu = drhd->iommu;
1859
1860         ret = iommu_attach_domain(domain, iommu);
1861         if (ret) {
1862                 domain_exit(domain);
1863                 goto error;
1864         }
1865
1866         if (domain_init(domain, gaw)) {
1867                 domain_exit(domain);
1868                 goto error;
1869         }
1870
1871         /* register pcie-to-pci device */
1872         if (dev_tmp) {
1873                 info = alloc_devinfo_mem();
1874                 if (!info) {
1875                         domain_exit(domain);
1876                         goto error;
1877                 }
1878                 info->segment = segment;
1879                 info->bus = bus;
1880                 info->devfn = devfn;
1881                 info->dev = NULL;
1882                 info->domain = domain;
1883                 /* This domain is shared by devices under p2p bridge */
1884                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1885
1886                 /* pcie-to-pci bridge already has a domain, uses it */
1887                 found = NULL;
1888                 spin_lock_irqsave(&device_domain_lock, flags);
1889                 list_for_each_entry(tmp, &device_domain_list, global) {
1890                         if (tmp->segment == segment &&
1891                             tmp->bus == bus && tmp->devfn == devfn) {
1892                                 found = tmp->domain;
1893                                 break;
1894                         }
1895                 }
1896                 if (found) {
1897                         free_devinfo_mem(info);
1898                         domain_exit(domain);
1899                         domain = found;
1900                 } else {
1901                         list_add(&info->link, &domain->devices);
1902                         list_add(&info->global, &device_domain_list);
1903                 }
1904                 spin_unlock_irqrestore(&device_domain_lock, flags);
1905         }
1906
1907 found_domain:
1908         info = alloc_devinfo_mem();
1909         if (!info)
1910                 goto error;
1911         info->segment = segment;
1912         info->bus = pdev->bus->number;
1913         info->devfn = pdev->devfn;
1914         info->dev = pdev;
1915         info->domain = domain;
1916         spin_lock_irqsave(&device_domain_lock, flags);
1917         /* somebody is fast */
1918         found = find_domain(pdev);
1919         if (found != NULL) {
1920                 spin_unlock_irqrestore(&device_domain_lock, flags);
1921                 if (found != domain) {
1922                         domain_exit(domain);
1923                         domain = found;
1924                 }
1925                 free_devinfo_mem(info);
1926                 return domain;
1927         }
1928         list_add(&info->link, &domain->devices);
1929         list_add(&info->global, &device_domain_list);
1930         pdev->dev.archdata.iommu = info;
1931         spin_unlock_irqrestore(&device_domain_lock, flags);
1932         return domain;
1933 error:
1934         /* recheck it here, maybe others set it */
1935         return find_domain(pdev);
1936 }
1937
/*
 * Bitmask of IDENTMAP_* flags selecting which devices are placed in the
 * static identity (1:1) domain.  Zero means identity mapping is not in use.
 */
static int iommu_identity_mapping;
/* Every device that passes the bridge/bus checks in iommu_should_identity_map() */
#define IDENTMAP_ALL            1
/* Devices matching IS_GFX_DEVICE() */
#define IDENTMAP_GFX            2
/* Devices matching IS_AZALIA() */
#define IDENTMAP_AZALIA         4
1942
1943 static int iommu_domain_identity_map(struct dmar_domain *domain,
1944                                      unsigned long long start,
1945                                      unsigned long long end)
1946 {
1947         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1948         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1949
1950         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1951                           dma_to_mm_pfn(last_vpfn))) {
1952                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1953                 return -ENOMEM;
1954         }
1955
1956         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1957                  start, end, domain->id);
1958         /*
1959          * RMRR range might have overlap with physical memory range,
1960          * clear it first
1961          */
1962         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1963
1964         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1965                                   last_vpfn - first_vpfn + 1,
1966                                   DMA_PTE_READ|DMA_PTE_WRITE);
1967 }
1968
1969 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1970                                       unsigned long long start,
1971                                       unsigned long long end)
1972 {
1973         struct dmar_domain *domain;
1974         int ret;
1975
1976         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1977         if (!domain)
1978                 return -ENOMEM;
1979
1980         /* For _hardware_ passthrough, don't bother. But for software
1981            passthrough, we do it anyway -- it may indicate a memory
1982            range which is reserved in E820, so which didn't get set
1983            up to start with in si_domain */
1984         if (domain == si_domain && hw_pass_through) {
1985                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1986                        pci_name(pdev), start, end);
1987                 return 0;
1988         }
1989
1990         printk(KERN_INFO
1991                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1992                pci_name(pdev), start, end);
1993         
1994         if (end >> agaw_to_width(domain->agaw)) {
1995                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1996                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1997                      agaw_to_width(domain->agaw),
1998                      dmi_get_system_info(DMI_BIOS_VENDOR),
1999                      dmi_get_system_info(DMI_BIOS_VERSION),
2000                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2001                 ret = -EIO;
2002                 goto error;
2003         }
2004
2005         ret = iommu_domain_identity_map(domain, start, end);
2006         if (ret)
2007                 goto error;
2008
2009         /* context entry init */
2010         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2011         if (ret)
2012                 goto error;
2013
2014         return 0;
2015
2016  error:
2017         domain_exit(domain);
2018         return ret;
2019 }
2020
2021 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2022         struct pci_dev *pdev)
2023 {
2024         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2025                 return 0;
2026         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2027                 rmrr->end_address + 1);
2028 }
2029
#ifdef CONFIG_DMAR_FLOPPY_WA
/*
 * Floppy workaround: give the ISA/LPC bridge a unity mapping of the first
 * 16MiB.  Failure is only logged.
 */
static inline void iommu_prepare_isa(void)
{
        struct pci_dev *pdev;
        int ret;

        pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
        if (!pdev)
                return;

        printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
        /*
         * The end argument is the last byte to map (inclusive), so the
         * 16MiB window ends at 16MiB - 1; passing 16MiB itself would map
         * one page too many.
         */
        ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);

        if (ret)
                printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
                       "floppy might not work\n");

}
#else
/* Without the floppy workaround there is nothing to prepare. */
static inline void iommu_prepare_isa(void)
{
        return;
}
#endif /* !CONFIG_DMAR_FLOPPY_WA */
2054
2055 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2056
2057 static int __init si_domain_work_fn(unsigned long start_pfn,
2058                                     unsigned long end_pfn, void *datax)
2059 {
2060         int *ret = datax;
2061
2062         *ret = iommu_domain_identity_map(si_domain,
2063                                          (uint64_t)start_pfn << PAGE_SHIFT,
2064                                          (uint64_t)end_pfn << PAGE_SHIFT);
2065         return *ret;
2066
2067 }
2068
2069 static int __init si_domain_init(int hw)
2070 {
2071         struct dmar_drhd_unit *drhd;
2072         struct intel_iommu *iommu;
2073         int nid, ret = 0;
2074
2075         si_domain = alloc_domain();
2076         if (!si_domain)
2077                 return -EFAULT;
2078
2079         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2080
2081         for_each_active_iommu(iommu, drhd) {
2082                 ret = iommu_attach_domain(si_domain, iommu);
2083                 if (ret) {
2084                         domain_exit(si_domain);
2085                         return -EFAULT;
2086                 }
2087         }
2088
2089         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2090                 domain_exit(si_domain);
2091                 return -EFAULT;
2092         }
2093
2094         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2095
2096         if (hw)
2097                 return 0;
2098
2099         for_each_online_node(nid) {
2100                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2101                 if (ret)
2102                         return ret;
2103         }
2104
2105         return 0;
2106 }
2107
2108 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2109                                           struct pci_dev *pdev);
2110 static int identity_mapping(struct pci_dev *pdev)
2111 {
2112         struct device_domain_info *info;
2113
2114         if (likely(!iommu_identity_mapping))
2115                 return 0;
2116
2117
2118         list_for_each_entry(info, &si_domain->devices, link)
2119                 if (info->dev == pdev)
2120                         return 1;
2121         return 0;
2122 }
2123
2124 static int domain_add_dev_info(struct dmar_domain *domain,
2125                                struct pci_dev *pdev,
2126                                int translation)
2127 {
2128         struct device_domain_info *info;
2129         unsigned long flags;
2130         int ret;
2131
2132         info = alloc_devinfo_mem();
2133         if (!info)
2134                 return -ENOMEM;
2135
2136         ret = domain_context_mapping(domain, pdev, translation);
2137         if (ret) {
2138                 free_devinfo_mem(info);
2139                 return ret;
2140         }
2141
2142         info->segment = pci_domain_nr(pdev->bus);
2143         info->bus = pdev->bus->number;
2144         info->devfn = pdev->devfn;
2145         info->dev = pdev;
2146         info->domain = domain;
2147
2148         spin_lock_irqsave(&device_domain_lock, flags);
2149         list_add(&info->link, &domain->devices);
2150         list_add(&info->global, &device_domain_list);
2151         pdev->dev.archdata.iommu = info;
2152         spin_unlock_irqrestore(&device_domain_lock, flags);
2153
2154         return 0;
2155 }
2156
2157 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2158 {
2159         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2160                 return 1;
2161
2162         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2163                 return 1;
2164
2165         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2166                 return 0;
2167
2168         /*
2169          * We want to start off with all devices in the 1:1 domain, and
2170          * take them out later if we find they can't access all of memory.
2171          *
2172          * However, we can't do this for PCI devices behind bridges,
2173          * because all PCI devices behind the same bridge will end up
2174          * with the same source-id on their transactions.
2175          *
2176          * Practically speaking, we can't change things around for these
2177          * devices at run-time, because we can't be sure there'll be no
2178          * DMA transactions in flight for any of their siblings.
2179          * 
2180          * So PCI devices (unless they're on the root bus) as well as
2181          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2182          * the 1:1 domain, just in _case_ one of their siblings turns out
2183          * not to be able to map all of memory.
2184          */
2185         if (!pdev->is_pcie) {
2186                 if (!pci_is_root_bus(pdev->bus))
2187                         return 0;
2188                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2189                         return 0;
2190         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2191                 return 0;
2192
2193         /* 
2194          * At boot time, we don't yet know if devices will be 64-bit capable.
2195          * Assume that they will -- if they turn out not to be, then we can 
2196          * take them out of the 1:1 domain later.
2197          */
2198         if (!startup)
2199                 return pdev->dma_mask > DMA_BIT_MASK(32);
2200
2201         return 1;
2202 }
2203
2204 static int __init iommu_prepare_static_identity_mapping(int hw)
2205 {
2206         struct pci_dev *pdev = NULL;
2207         int ret;
2208
2209         ret = si_domain_init(hw);
2210         if (ret)
2211                 return -EFAULT;
2212
2213         for_each_pci_dev(pdev) {
2214                 if (iommu_should_identity_map(pdev, 1)) {
2215                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2216                                hw ? "hardware" : "software", pci_name(pdev));
2217
2218                         ret = domain_add_dev_info(si_domain, pdev,
2219                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2220                                                      CONTEXT_TT_MULTI_LEVEL);
2221                         if (ret)
2222                                 return ret;
2223                 }
2224         }
2225
2226         return 0;
2227 }
2228
2229 int __init init_dmars(void)
2230 {
2231         struct dmar_drhd_unit *drhd;
2232         struct dmar_rmrr_unit *rmrr;
2233         struct pci_dev *pdev;
2234         struct intel_iommu *iommu;
2235         int i, ret;
2236
2237         /*
2238          * for each drhd
2239          *    allocate root
2240          *    initialize and program root entry to not present
2241          * endfor
2242          */
2243         for_each_drhd_unit(drhd) {
2244                 g_num_of_iommus++;
2245                 /*
2246                  * lock not needed as this is only incremented in the single
2247                  * threaded kernel __init code path all other access are read
2248                  * only
2249                  */
2250         }
2251
2252         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2253                         GFP_KERNEL);
2254         if (!g_iommus) {
2255                 printk(KERN_ERR "Allocating global iommu array failed\n");
2256                 ret = -ENOMEM;
2257                 goto error;
2258         }
2259
2260         deferred_flush = kzalloc(g_num_of_iommus *
2261                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2262         if (!deferred_flush) {
2263                 ret = -ENOMEM;
2264                 goto error;
2265         }
2266
2267         for_each_drhd_unit(drhd) {
2268                 if (drhd->ignored)
2269                         continue;
2270
2271                 iommu = drhd->iommu;
2272                 g_iommus[iommu->seq_id] = iommu;
2273
2274                 ret = iommu_init_domains(iommu);
2275                 if (ret)
2276                         goto error;
2277
2278                 /*
2279                  * TBD:
2280                  * we could share the same root & context tables
2281                  * amoung all IOMMU's. Need to Split it later.
2282                  */
2283                 ret = iommu_alloc_root_entry(iommu);
2284                 if (ret) {
2285                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2286                         goto error;
2287                 }
2288                 if (!ecap_pass_through(iommu->ecap))
2289                         hw_pass_through = 0;
2290         }
2291
2292         /*
2293          * Start from the sane iommu hardware state.
2294          */
2295         for_each_drhd_unit(drhd) {
2296                 if (drhd->ignored)
2297                         continue;
2298
2299                 iommu = drhd->iommu;
2300
2301                 /*
2302                  * If the queued invalidation is already initialized by us
2303                  * (for example, while enabling interrupt-remapping) then
2304                  * we got the things already rolling from a sane state.
2305                  */
2306                 if (iommu->qi)
2307                         continue;
2308
2309                 /*
2310                  * Clear any previous faults.
2311                  */
2312                 dmar_fault(-1, iommu);
2313                 /*
2314                  * Disable queued invalidation if supported and already enabled
2315                  * before OS handover.
2316                  */
2317                 dmar_disable_qi(iommu);
2318         }
2319
2320         for_each_drhd_unit(drhd) {
2321                 if (drhd->ignored)
2322                         continue;
2323
2324                 iommu = drhd->iommu;
2325
2326                 if (dmar_enable_qi(iommu)) {
2327                         /*
2328                          * Queued Invalidate not enabled, use Register Based
2329                          * Invalidate
2330                          */
2331                         iommu->flush.flush_context = __iommu_flush_context;
2332                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2333                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2334                                "invalidation\n",
2335                                (unsigned long long)drhd->reg_base_addr);
2336                 } else {
2337                         iommu->flush.flush_context = qi_flush_context;
2338                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2339                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2340                                "invalidation\n",
2341                                (unsigned long long)drhd->reg_base_addr);
2342                 }
2343         }
2344
2345         if (iommu_pass_through)
2346                 iommu_identity_mapping |= IDENTMAP_ALL;
2347
2348 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2349         iommu_identity_mapping |= IDENTMAP_GFX;
2350 #endif
2351
2352         check_tylersburg_isoch();
2353
2354         /*
2355          * If pass through is not set or not enabled, setup context entries for
2356          * identity mappings for rmrr, gfx, and isa and may fall back to static
2357          * identity mapping if iommu_identity_mapping is set.
2358          */
2359         if (iommu_identity_mapping) {
2360                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2361                 if (ret) {
2362                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2363                         goto error;
2364                 }
2365         }
2366         /*
2367          * For each rmrr
2368          *   for each dev attached to rmrr
2369          *   do
2370          *     locate drhd for dev, alloc domain for dev
2371          *     allocate free domain
2372          *     allocate page table entries for rmrr
2373          *     if context not allocated for bus
2374          *           allocate and init context
2375          *           set present in root table for this bus
2376          *     init context with domain, translation etc
2377          *    endfor
2378          * endfor
2379          */
2380         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2381         for_each_rmrr_units(rmrr) {
2382                 for (i = 0; i < rmrr->devices_cnt; i++) {
2383                         pdev = rmrr->devices[i];
2384                         /*
2385                          * some BIOS lists non-exist devices in DMAR
2386                          * table.
2387                          */
2388                         if (!pdev)
2389                                 continue;
2390                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2391                         if (ret)
2392                                 printk(KERN_ERR
2393                                        "IOMMU: mapping reserved region failed\n");
2394                 }
2395         }
2396
2397         iommu_prepare_isa();
2398
2399         /*
2400          * for each drhd
2401          *   enable fault log
2402          *   global invalidate context cache
2403          *   global invalidate iotlb
2404          *   enable translation
2405          */
2406         for_each_drhd_unit(drhd) {
2407                 if (drhd->ignored)
2408                         continue;
2409                 iommu = drhd->iommu;
2410
2411                 iommu_flush_write_buffer(iommu);
2412
2413                 ret = dmar_set_interrupt(iommu);
2414                 if (ret)
2415                         goto error;
2416
2417                 iommu_set_root_entry(iommu);
2418
2419                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2420                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2421
2422                 ret = iommu_enable_translation(iommu);
2423                 if (ret)
2424                         goto error;
2425
2426                 iommu_disable_protect_mem_regions(iommu);
2427         }
2428
2429         return 0;
2430 error:
2431         for_each_drhd_unit(drhd) {
2432                 if (drhd->ignored)
2433                         continue;
2434                 iommu = drhd->iommu;
2435                 free_iommu(iommu);
2436         }
2437         kfree(g_iommus);
2438         return ret;
2439 }
2440
2441 /* This takes a number of _MM_ pages, not VTD pages */
2442 static struct iova *intel_alloc_iova(struct device *dev,
2443                                      struct dmar_domain *domain,
2444                                      unsigned long nrpages, uint64_t dma_mask)
2445 {
2446         struct pci_dev *pdev = to_pci_dev(dev);
2447         struct iova *iova = NULL;
2448
2449         /* Restrict dma_mask to the width that the iommu can handle */
2450         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2451
2452         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2453                 /*
2454                  * First try to allocate an io virtual address in
2455                  * DMA_BIT_MASK(32) and if that fails then try allocating
2456                  * from higher range
2457                  */
2458                 iova = alloc_iova(&domain->iovad, nrpages,
2459                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2460                 if (iova)
2461                         return iova;
2462         }
2463         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2464         if (unlikely(!iova)) {
2465                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2466                        nrpages, pci_name(pdev));
2467                 return NULL;
2468         }
2469
2470         return iova;
2471 }
2472
2473 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2474 {
2475         struct dmar_domain *domain;
2476         int ret;
2477
2478         domain = get_domain_for_dev(pdev,
2479                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2480         if (!domain) {
2481                 printk(KERN_ERR
2482                         "Allocating domain for %s failed", pci_name(pdev));
2483                 return NULL;
2484         }
2485
2486         /* make sure context mapping is ok */
2487         if (unlikely(!domain_context_mapped(pdev))) {
2488                 ret = domain_context_mapping(domain, pdev,
2489                                              CONTEXT_TT_MULTI_LEVEL);
2490                 if (ret) {
2491                         printk(KERN_ERR
2492                                 "Domain context map for %s failed",
2493                                 pci_name(pdev));
2494                         return NULL;
2495                 }
2496         }
2497
2498         return domain;
2499 }
2500
2501 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2502 {
2503         struct device_domain_info *info;
2504
2505         /* No lock here, assumes no domain exit in normal case */
2506         info = dev->dev.archdata.iommu;
2507         if (likely(info))
2508                 return info->domain;
2509
2510         return __get_valid_domain_for_dev(dev);
2511 }
2512
2513 static int iommu_dummy(struct pci_dev *pdev)
2514 {
2515         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2516 }
2517
2518 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2519 static int iommu_no_mapping(struct device *dev)
2520 {
2521         struct pci_dev *pdev;
2522         int found;
2523
2524         if (unlikely(dev->bus != &pci_bus_type))
2525                 return 1;
2526
2527         pdev = to_pci_dev(dev);
2528         if (iommu_dummy(pdev))
2529                 return 1;
2530
2531         if (!iommu_identity_mapping)
2532                 return 0;
2533
2534         found = identity_mapping(pdev);
2535         if (found) {
2536                 if (iommu_should_identity_map(pdev, 0))
2537                         return 1;
2538                 else {
2539                         /*
2540                          * 32 bit DMA is removed from si_domain and fall back
2541                          * to non-identity mapping.
2542                          */
2543                         domain_remove_one_dev_info(si_domain, pdev);
2544                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2545                                pci_name(pdev));
2546                         return 0;
2547                 }
2548         } else {
2549                 /*
2550                  * In case of a detached 64 bit DMA device from vm, the device
2551                  * is put into si_domain for identity mapping.
2552                  */
2553                 if (iommu_should_identity_map(pdev, 0)) {
2554                         int ret;
2555                         ret = domain_add_dev_info(si_domain, pdev,
2556                                                   hw_pass_through ?
2557                                                   CONTEXT_TT_PASS_THROUGH :
2558                                                   CONTEXT_TT_MULTI_LEVEL);
2559                         if (!ret) {
2560                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2561                                        pci_name(pdev));
2562                                 return 1;
2563                         }
2564                 }
2565         }
2566
2567         return 0;
2568 }
2569
2570 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2571                                      size_t size, int dir, u64 dma_mask)
2572 {
2573         struct pci_dev *pdev = to_pci_dev(hwdev);
2574         struct dmar_domain *domain;
2575         phys_addr_t start_paddr;
2576         struct iova *iova;
2577         int prot = 0;
2578         int ret;
2579         struct intel_iommu *iommu;
2580         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2581
2582         BUG_ON(dir == DMA_NONE);
2583
2584         if (iommu_no_mapping(hwdev))
2585                 return paddr;
2586
2587         domain = get_valid_domain_for_dev(pdev);
2588         if (!domain)
2589                 return 0;
2590
2591         iommu = domain_get_iommu(domain);
2592         size = aligned_nrpages(paddr, size);
2593
2594         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2595                                 pdev->dma_mask);
2596         if (!iova)
2597                 goto error;
2598
2599         /*
2600          * Check if DMAR supports zero-length reads on write only
2601          * mappings..
2602          */
2603         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2604                         !cap_zlr(iommu->cap))
2605                 prot |= DMA_PTE_READ;
2606         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2607                 prot |= DMA_PTE_WRITE;
2608         /*
2609          * paddr - (paddr + size) might be partial page, we should map the whole
2610          * page.  Note: if two part of one page are separately mapped, we
2611          * might have two guest_addr mapping to the same host paddr, but this
2612          * is not a big problem
2613          */
2614         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2615                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2616         if (ret)
2617                 goto error;
2618
2619         /* it's a non-present to present mapping. Only flush if caching mode */
2620         if (cap_caching_mode(iommu->cap))
2621                 iommu_flush_iotlb_psi(iommu, 0, mm_to_dma_pfn(iova->pfn_lo), size);
2622         else
2623                 iommu_flush_write_buffer(iommu);
2624
2625         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2626         start_paddr += paddr & ~PAGE_MASK;
2627         return start_paddr;
2628
2629 error:
2630         if (iova)
2631                 __free_iova(&domain->iovad, iova);
2632         printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2633                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2634         return 0;
2635 }
2636
2637 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2638                                  unsigned long offset, size_t size,
2639                                  enum dma_data_direction dir,
2640                                  struct dma_attrs *attrs)
2641 {
2642         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2643                                   dir, to_pci_dev(dev)->dma_mask);
2644 }
2645
2646 static void flush_unmaps(void)
2647 {
2648         int i, j;
2649
2650         timer_on = 0;
2651
2652         /* just flush them all */
2653         for (i = 0; i < g_num_of_iommus; i++) {
2654                 struct intel_iommu *iommu = g_iommus[i];
2655                 if (!iommu)
2656                         continue;
2657
2658                 if (!deferred_flush[i].next)
2659                         continue;
2660
2661                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2662                                          DMA_TLB_GLOBAL_FLUSH);
2663                 for (j = 0; j < deferred_flush[i].next; j++) {
2664                         unsigned long mask;
2665                         struct iova *iova = deferred_flush[i].iova[j];
2666
2667                         mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2668                         iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2669                                         (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2670                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2671                 }
2672                 deferred_flush[i].next = 0;
2673         }
2674
2675         list_size = 0;
2676 }
2677
2678 static void flush_unmaps_timeout(unsigned long data)
2679 {
2680         unsigned long flags;
2681
2682         spin_lock_irqsave(&async_umap_flush_lock, flags);
2683         flush_unmaps();
2684         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2685 }
2686
2687 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2688 {
2689         unsigned long flags;
2690         int next, iommu_id;
2691         struct intel_iommu *iommu;
2692
2693         spin_lock_irqsave(&async_umap_flush_lock, flags);
2694         if (list_size == HIGH_WATER_MARK)
2695                 flush_unmaps();
2696
2697         iommu = domain_get_iommu(dom);
2698         iommu_id = iommu->seq_id;
2699
2700         next = deferred_flush[iommu_id].next;
2701         deferred_flush[iommu_id].domain[next] = dom;
2702         deferred_flush[iommu_id].iova[next] = iova;
2703         deferred_flush[iommu_id].next++;
2704
2705         if (!timer_on) {
2706                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2707                 timer_on = 1;
2708         }
2709         list_size++;
2710         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2711 }
2712
2713 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2714                              size_t size, enum dma_data_direction dir,
2715                              struct dma_attrs *attrs)
2716 {
2717         struct pci_dev *pdev = to_pci_dev(dev);
2718         struct dmar_domain *domain;
2719         unsigned long start_pfn, last_pfn;
2720         struct iova *iova;
2721         struct intel_iommu *iommu;
2722
2723         if (iommu_no_mapping(dev))
2724                 return;
2725
2726         domain = find_domain(pdev);
2727         BUG_ON(!domain);
2728
2729         iommu = domain_get_iommu(domain);
2730
2731         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2732         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2733                       (unsigned long long)dev_addr))
2734                 return;
2735
2736         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2737         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2738
2739         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2740                  pci_name(pdev), start_pfn, last_pfn);
2741
2742         /*  clear the whole page */
2743         dma_pte_clear_range(domain, start_pfn, last_pfn);
2744
2745         /* free page tables */
2746         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2747
2748         if (intel_iommu_strict) {
2749                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2750                                       last_pfn - start_pfn + 1);
2751                 /* free iova */
2752                 __free_iova(&domain->iovad, iova);
2753         } else {
2754                 add_unmap(domain, iova);
2755                 /*
2756                  * queue up the release of the unmap to save the 1/6th of the
2757                  * cpu used up by the iotlb flush operation...
2758                  */
2759         }
2760 }
2761
2762 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2763                                   dma_addr_t *dma_handle, gfp_t flags)
2764 {
2765         void *vaddr;
2766         int order;
2767
2768         size = PAGE_ALIGN(size);
2769         order = get_order(size);
2770         flags &= ~(GFP_DMA | GFP_DMA32);
2771
2772         vaddr = (void *)__get_free_pages(flags, order);
2773         if (!vaddr)
2774                 return NULL;
2775         memset(vaddr, 0, size);
2776
2777         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2778                                          DMA_BIDIRECTIONAL,
2779                                          hwdev->coherent_dma_mask);
2780         if (*dma_handle)
2781                 return vaddr;
2782         free_pages((unsigned long)vaddr, order);
2783         return NULL;
2784 }
2785
2786 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2787                                 dma_addr_t dma_handle)
2788 {
2789         int order;
2790
2791         size = PAGE_ALIGN(size);
2792         order = get_order(size);
2793
2794         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2795         free_pages((unsigned long)vaddr, order);
2796 }
2797
2798 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2799                            int nelems, enum dma_data_direction dir,
2800                            struct dma_attrs *attrs)
2801 {
2802         struct pci_dev *pdev = to_pci_dev(hwdev);
2803         struct dmar_domain *domain;
2804         unsigned long start_pfn, last_pfn;
2805         struct iova *iova;
2806         struct intel_iommu *iommu;
2807
2808         if (iommu_no_mapping(hwdev))
2809                 return;
2810
2811         domain = find_domain(pdev);
2812         BUG_ON(!domain);
2813
2814         iommu = domain_get_iommu(domain);
2815
2816         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2817         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2818                       (unsigned long long)sglist[0].dma_address))
2819                 return;
2820
2821         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2822         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2823
2824         /*  clear the whole page */
2825         dma_pte_clear_range(domain, start_pfn, last_pfn);
2826
2827         /* free page tables */
2828         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2829
2830         if (intel_iommu_strict) {
2831                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2832                                       last_pfn - start_pfn + 1);
2833                 /* free iova */
2834                 __free_iova(&domain->iovad, iova);
2835         } else {
2836                 add_unmap(domain, iova);
2837                 /*
2838                  * queue up the release of the unmap to save the 1/6th of the
2839                  * cpu used up by the iotlb flush operation...
2840                  */
2841         }
2842 }
2843
2844 static int intel_nontranslate_map_sg(struct device *hddev,
2845         struct scatterlist *sglist, int nelems, int dir)
2846 {
2847         int i;
2848         struct scatterlist *sg;
2849
2850         for_each_sg(sglist, sg, nelems, i) {
2851                 BUG_ON(!sg_page(sg));
2852                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2853                 sg->dma_length = sg->length;
2854         }
2855         return nelems;
2856 }
2857
2858 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2859                         enum dma_data_direction dir, struct dma_attrs *attrs)
2860 {
2861         int i;
2862         struct pci_dev *pdev = to_pci_dev(hwdev);
2863         struct dmar_domain *domain;
2864         size_t size = 0;
2865         int prot = 0;
2866         size_t offset_pfn = 0;
2867         struct iova *iova = NULL;
2868         int ret;
2869         struct scatterlist *sg;
2870         unsigned long start_vpfn;
2871         struct intel_iommu *iommu;
2872
2873         BUG_ON(dir == DMA_NONE);
2874         if (iommu_no_mapping(hwdev))
2875                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2876
2877         domain = get_valid_domain_for_dev(pdev);
2878         if (!domain)
2879                 return 0;
2880
2881         iommu = domain_get_iommu(domain);
2882
2883         for_each_sg(sglist, sg, nelems, i)
2884                 size += aligned_nrpages(sg->offset, sg->length);
2885
2886         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2887                                 pdev->dma_mask);
2888         if (!iova) {
2889                 sglist->dma_length = 0;
2890                 return 0;
2891         }
2892
2893         /*
2894          * Check if DMAR supports zero-length reads on write only
2895          * mappings..
2896          */
2897         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2898                         !cap_zlr(iommu->cap))
2899                 prot |= DMA_PTE_READ;
2900         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2901                 prot |= DMA_PTE_WRITE;
2902
2903         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2904
2905         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2906         if (unlikely(ret)) {
2907                 /*  clear the page */
2908                 dma_pte_clear_range(domain, start_vpfn,
2909                                     start_vpfn + size - 1);
2910                 /* free page tables */
2911                 dma_pte_free_pagetable(domain, start_vpfn,
2912                                        start_vpfn + size - 1);
2913                 /* free iova */
2914                 __free_iova(&domain->iovad, iova);
2915                 return 0;
2916         }
2917
2918         /* it's a non-present to present mapping. Only flush if caching mode */
2919         if (cap_caching_mode(iommu->cap))
2920                 iommu_flush_iotlb_psi(iommu, 0, start_vpfn, offset_pfn);
2921         else
2922                 iommu_flush_write_buffer(iommu);
2923
2924         return nelems;
2925 }
2926
2927 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2928 {
2929         return !dma_addr;
2930 }
2931
2932 struct dma_map_ops intel_dma_ops = {
2933         .alloc_coherent = intel_alloc_coherent,
2934         .free_coherent = intel_free_coherent,
2935         .map_sg = intel_map_sg,
2936         .unmap_sg = intel_unmap_sg,
2937         .map_page = intel_map_page,
2938         .unmap_page = intel_unmap_page,
2939         .mapping_error = intel_mapping_error,
2940 };
2941
2942 static inline int iommu_domain_cache_init(void)
2943 {
2944         int ret = 0;
2945
2946         iommu_domain_cache = kmem_cache_create("iommu_domain",
2947                                          sizeof(struct dmar_domain),
2948                                          0,
2949                                          SLAB_HWCACHE_ALIGN,
2950
2951                                          NULL);
2952         if (!iommu_domain_cache) {
2953                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2954                 ret = -ENOMEM;
2955         }
2956
2957         return ret;
2958 }
2959
2960 static inline int iommu_devinfo_cache_init(void)
2961 {
2962         int ret = 0;
2963
2964         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2965                                          sizeof(struct device_domain_info),
2966                                          0,
2967                                          SLAB_HWCACHE_ALIGN,
2968                                          NULL);
2969         if (!iommu_devinfo_cache) {
2970                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2971                 ret = -ENOMEM;
2972         }
2973
2974         return ret;
2975 }
2976
2977 static inline int iommu_iova_cache_init(void)
2978 {
2979         int ret = 0;
2980
2981         iommu_iova_cache = kmem_cache_create("iommu_iova",
2982                                          sizeof(struct iova),
2983                                          0,
2984                                          SLAB_HWCACHE_ALIGN,
2985                                          NULL);
2986         if (!iommu_iova_cache) {
2987                 printk(KERN_ERR "Couldn't create iova cache\n");
2988                 ret = -ENOMEM;
2989         }
2990
2991         return ret;
2992 }
2993
2994 static int __init iommu_init_mempool(void)
2995 {
2996         int ret;
2997         ret = iommu_iova_cache_init();
2998         if (ret)
2999                 return ret;
3000
3001         ret = iommu_domain_cache_init();
3002         if (ret)
3003                 goto domain_error;
3004
3005         ret = iommu_devinfo_cache_init();
3006         if (!ret)
3007                 return ret;
3008
3009         kmem_cache_destroy(iommu_domain_cache);
3010 domain_error:
3011         kmem_cache_destroy(iommu_iova_cache);
3012
3013         return -ENOMEM;
3014 }
3015
3016 static void __init iommu_exit_mempool(void)
3017 {
3018         kmem_cache_destroy(iommu_devinfo_cache);
3019         kmem_cache_destroy(iommu_domain_cache);
3020         kmem_cache_destroy(iommu_iova_cache);
3021
3022 }
3023
3024 static void __init init_no_remapping_devices(void)
3025 {
3026         struct dmar_drhd_unit *drhd;
3027
3028         for_each_drhd_unit(drhd) {
3029                 if (!drhd->include_all) {
3030                         int i;
3031                         for (i = 0; i < drhd->devices_cnt; i++)
3032                                 if (drhd->devices[i] != NULL)
3033                                         break;
3034                         /* ignore DMAR unit if no pci devices exist */
3035                         if (i == drhd->devices_cnt)
3036                                 drhd->ignored = 1;
3037                 }
3038         }
3039
3040         if (dmar_map_gfx)
3041                 return;
3042
3043         for_each_drhd_unit(drhd) {
3044                 int i;
3045                 if (drhd->ignored || drhd->include_all)
3046                         continue;
3047
3048                 for (i = 0; i < drhd->devices_cnt; i++)
3049                         if (drhd->devices[i] &&
3050                                 !IS_GFX_DEVICE(drhd->devices[i]))
3051                                 break;
3052
3053                 if (i < drhd->devices_cnt)
3054                         continue;
3055
3056                 /* bypass IOMMU if it is just for gfx devices */
3057                 drhd->ignored = 1;
3058                 for (i = 0; i < drhd->devices_cnt; i++) {
3059                         if (!drhd->devices[i])
3060                                 continue;
3061                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3062                 }
3063         }
3064 }
3065
3066 #ifdef CONFIG_SUSPEND
3067 static int init_iommu_hw(void)
3068 {
3069         struct dmar_drhd_unit *drhd;
3070         struct intel_iommu *iommu = NULL;
3071
3072         for_each_active_iommu(iommu, drhd)
3073                 if (iommu->qi)
3074                         dmar_reenable_qi(iommu);
3075
3076         for_each_active_iommu(iommu, drhd) {
3077                 iommu_flush_write_buffer(iommu);
3078
3079                 iommu_set_root_entry(iommu);
3080
3081                 iommu->flush.flush_context(iommu, 0, 0, 0,
3082                                            DMA_CCMD_GLOBAL_INVL);
3083                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3084                                          DMA_TLB_GLOBAL_FLUSH);
3085                 iommu_enable_translation(iommu);
3086                 iommu_disable_protect_mem_regions(iommu);
3087         }
3088
3089         return 0;
3090 }
3091
3092 static void iommu_flush_all(void)
3093 {
3094         struct dmar_drhd_unit *drhd;
3095         struct intel_iommu *iommu;
3096
3097         for_each_active_iommu(iommu, drhd) {
3098                 iommu->flush.flush_context(iommu, 0, 0, 0,
3099                                            DMA_CCMD_GLOBAL_INVL);
3100                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3101                                          DMA_TLB_GLOBAL_FLUSH);
3102         }
3103 }
3104
3105 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
3106 {
3107         struct dmar_drhd_unit *drhd;
3108         struct intel_iommu *iommu = NULL;
3109         unsigned long flag;
3110
3111         for_each_active_iommu(iommu, drhd) {
3112                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3113                                                  GFP_ATOMIC);
3114                 if (!iommu->iommu_state)
3115                         goto nomem;
3116         }
3117
3118         iommu_flush_all();
3119
3120         for_each_active_iommu(iommu, drhd) {
3121                 iommu_disable_translation(iommu);
3122
3123                 spin_lock_irqsave(&iommu->register_lock, flag);
3124
3125                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3126                         readl(iommu->reg + DMAR_FECTL_REG);
3127                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3128                         readl(iommu->reg + DMAR_FEDATA_REG);
3129                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3130                         readl(iommu->reg + DMAR_FEADDR_REG);
3131                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3132                         readl(iommu->reg + DMAR_FEUADDR_REG);
3133
3134                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3135         }
3136         return 0;
3137
3138 nomem:
3139         for_each_active_iommu(iommu, drhd)
3140                 kfree(iommu->iommu_state);
3141
3142         return -ENOMEM;
3143 }
3144
3145 static int iommu_resume(struct sys_device *dev)
3146 {
3147         struct dmar_drhd_unit *drhd;
3148         struct intel_iommu *iommu = NULL;
3149         unsigned long flag;
3150
3151         if (init_iommu_hw()) {
3152                 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3153                 return -EIO;
3154         }
3155
3156         for_each_active_iommu(iommu, drhd) {
3157
3158                 spin_lock_irqsave(&iommu->register_lock, flag);
3159
3160                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3161                         iommu->reg + DMAR_FECTL_REG);
3162                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3163                         iommu->reg + DMAR_FEDATA_REG);
3164                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3165                         iommu->reg + DMAR_FEADDR_REG);
3166                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3167                         iommu->reg + DMAR_FEUADDR_REG);
3168
3169                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3170         }
3171
3172         for_each_active_iommu(iommu, drhd)
3173                 kfree(iommu->iommu_state);
3174
3175         return 0;
3176 }
3177
3178 static struct sysdev_class iommu_sysclass = {
3179         .name           = "iommu",
3180         .resume         = iommu_resume,
3181         .suspend        = iommu_suspend,
3182 };
3183
3184 static struct sys_device device_iommu = {
3185         .cls    = &iommu_sysclass,
3186 };
3187
3188 static int __init init_iommu_sysfs(void)
3189 {
3190         int error;
3191
3192         error = sysdev_class_register(&iommu_sysclass);
3193         if (error)
3194                 return error;
3195
3196         error = sysdev_register(&device_iommu);
3197         if (error)
3198                 sysdev_class_unregister(&iommu_sysclass);
3199
3200         return error;
3201 }
3202
3203 #else
3204 static int __init init_iommu_sysfs(void)
3205 {
3206         return 0;
3207 }
#endif  /* CONFIG_SUSPEND */
3209
3210 int __init intel_iommu_init(void)
3211 {
3212         int ret = 0;
3213         int force_on = 0;
3214
3215         /* VT-d is required for a TXT/tboot launch, so enforce that */
3216         force_on = tboot_force_iommu();
3217
3218         if (dmar_table_init()) {
3219                 if (force_on)
3220                         panic("tboot: Failed to initialize DMAR table\n");
3221                 return  -ENODEV;
3222         }
3223
3224         if (dmar_dev_scope_init()) {
3225                 if (force_on)
3226                         panic("tboot: Failed to initialize DMAR device scope\n");
3227                 return  -ENODEV;
3228         }
3229
3230         /*
3231          * Check the need for DMA-remapping initialization now.
3232          * Above initialization will also be used by Interrupt-remapping.
3233          */
3234         if (no_iommu || dmar_disabled)
3235                 return -ENODEV;
3236
3237         iommu_init_mempool();
3238         dmar_init_reserved_ranges();
3239
3240         init_no_remapping_devices();
3241
3242         ret = init_dmars();
3243         if (ret) {
3244                 if (force_on)
3245                         panic("tboot: Failed to initialize DMARs\n");
3246                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3247                 put_iova_domain(&reserved_iova_list);
3248                 iommu_exit_mempool();
3249                 return ret;
3250         }
3251         printk(KERN_INFO
3252         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3253
3254         init_timer(&unmap_timer);
3255 #ifdef CONFIG_SWIOTLB
3256         swiotlb = 0;
3257 #endif
3258         dma_ops = &intel_dma_ops;
3259
3260         init_iommu_sysfs();
3261
3262         register_iommu(&intel_iommu_ops);
3263
3264         return 0;
3265 }
3266
3267 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3268                                            struct pci_dev *pdev)
3269 {
3270         struct pci_dev *tmp, *parent;
3271
3272         if (!iommu || !pdev)
3273                 return;
3274
3275         /* dependent device detach */
3276         tmp = pci_find_upstream_pcie_bridge(pdev);
3277         /* Secondary interface's bus number and devfn 0 */
3278         if (tmp) {
3279                 parent = pdev->bus->self;
3280                 while (parent != tmp) {
3281                         iommu_detach_dev(iommu, parent->bus->number,
3282                                          parent->devfn);
3283                         parent = parent->bus->self;
3284                 }
3285                 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
3286                         iommu_detach_dev(iommu,
3287                                 tmp->subordinate->number, 0);
3288                 else /* this is a legacy PCI bridge */
3289                         iommu_detach_dev(iommu, tmp->bus->number,
3290                                          tmp->devfn);
3291         }
3292 }
3293
3294 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3295                                           struct pci_dev *pdev)
3296 {
3297         struct device_domain_info *info;
3298         struct intel_iommu *iommu;
3299         unsigned long flags;
3300         int found = 0;
3301         struct list_head *entry, *tmp;
3302
3303         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3304                                 pdev->devfn);
3305         if (!iommu)
3306                 return;
3307
3308         spin_lock_irqsave(&device_domain_lock, flags);
3309         list_for_each_safe(entry, tmp, &domain->devices) {
3310                 info = list_entry(entry, struct device_domain_info, link);
3311                 /* No need to compare PCI domain; it has to be the same */
3312                 if (info->bus == pdev->bus->number &&
3313                     info->devfn == pdev->devfn) {
3314                         list_del(&info->link);
3315                         list_del(&info->global);
3316                         if (info->dev)
3317                                 info->dev->dev.archdata.iommu = NULL;
3318                         spin_unlock_irqrestore(&device_domain_lock, flags);
3319
3320                         iommu_disable_dev_iotlb(info);
3321                         iommu_detach_dev(iommu, info->bus, info->devfn);
3322                         iommu_detach_dependent_devices(iommu, pdev);
3323                         free_devinfo_mem(info);
3324
3325                         spin_lock_irqsave(&device_domain_lock, flags);
3326
3327                         if (found)
3328                                 break;
3329                         else
3330                                 continue;
3331                 }
3332
3333                 /* if there is no other devices under the same iommu
3334                  * owned by this domain, clear this iommu in iommu_bmp
3335                  * update iommu count and coherency
3336                  */
3337                 if (iommu == device_to_iommu(info->segment, info->bus,
3338                                             info->devfn))
3339                         found = 1;
3340         }
3341
3342         if (found == 0) {
3343                 unsigned long tmp_flags;
3344                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3345                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3346                 domain->iommu_count--;
3347                 domain_update_iommu_cap(domain);
3348                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3349         }
3350
3351         spin_unlock_irqrestore(&device_domain_lock, flags);
3352 }
3353
3354 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3355 {
3356         struct device_domain_info *info;
3357         struct intel_iommu *iommu;
3358         unsigned long flags1, flags2;
3359
3360         spin_lock_irqsave(&device_domain_lock, flags1);
3361         while (!list_empty(&domain->devices)) {
3362                 info = list_entry(domain->devices.next,
3363                         struct device_domain_info, link);
3364                 list_del(&info->link);
3365                 list_del(&info->global);
3366                 if (info->dev)
3367                         info->dev->dev.archdata.iommu = NULL;
3368
3369                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3370
3371                 iommu_disable_dev_iotlb(info);
3372                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3373                 iommu_detach_dev(iommu, info->bus, info->devfn);
3374                 iommu_detach_dependent_devices(iommu, info->dev);
3375
3376                 /* clear this iommu in iommu_bmp, update iommu count
3377                  * and capabilities
3378                  */
3379                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3380                 if (test_and_clear_bit(iommu->seq_id,
3381                                        &domain->iommu_bmp)) {
3382                         domain->iommu_count--;
3383                         domain_update_iommu_cap(domain);
3384                 }
3385                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3386
3387                 free_devinfo_mem(info);
3388                 spin_lock_irqsave(&device_domain_lock, flags1);
3389         }
3390         spin_unlock_irqrestore(&device_domain_lock, flags1);
3391 }
3392
/*
 * Next domain id to hand out to a virtual machine domain; this id is not
 * set in the context entries.
 */
static unsigned long vm_domid;
3395
3396 static int vm_domain_min_agaw(struct dmar_domain *domain)
3397 {
3398         int i;
3399         int min_agaw = domain->agaw;
3400
3401         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
3402         for (; i < g_num_of_iommus; ) {
3403                 if (min_agaw > g_iommus[i]->agaw)
3404                         min_agaw = g_iommus[i]->agaw;
3405
3406                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
3407         }
3408
3409         return min_agaw;
3410 }
3411
3412 static struct dmar_domain *iommu_alloc_vm_domain(void)
3413 {
3414         struct dmar_domain *domain;
3415
3416         domain = alloc_domain_mem();
3417         if (!domain)
3418                 return NULL;
3419
3420         domain->id = vm_domid++;
3421         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3422         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3423
3424         return domain;
3425 }
3426
3427 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3428 {
3429         int adjust_width;
3430
3431         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3432         spin_lock_init(&domain->iommu_lock);
3433
3434         domain_reserve_special_ranges(domain);
3435
3436         /* calculate AGAW */
3437         domain->gaw = guest_width;
3438         adjust_width = guestwidth_to_adjustwidth(guest_width);
3439         domain->agaw = width_to_agaw(adjust_width);
3440
3441         INIT_LIST_HEAD(&domain->devices);
3442
3443         domain->iommu_count = 0;
3444         domain->iommu_coherency = 0;
3445         domain->iommu_snooping = 0;
3446         domain->max_addr = 0;
3447
3448         /* always allocate the top pgd */
3449         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
3450         if (!domain->pgd)
3451                 return -ENOMEM;
3452         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3453         return 0;
3454 }
3455
3456 static void iommu_free_vm_domain(struct dmar_domain *domain)
3457 {
3458         unsigned long flags;
3459         struct dmar_drhd_unit *drhd;
3460         struct intel_iommu *iommu;
3461         unsigned long i;
3462         unsigned long ndomains;
3463
3464         for_each_drhd_unit(drhd) {
3465                 if (drhd->ignored)
3466                         continue;
3467                 iommu = drhd->iommu;
3468
3469                 ndomains = cap_ndoms(iommu->cap);
3470                 i = find_first_bit(iommu->domain_ids, ndomains);
3471                 for (; i < ndomains; ) {
3472                         if (iommu->domains[i] == domain) {
3473                                 spin_lock_irqsave(&iommu->lock, flags);
3474                                 clear_bit(i, iommu->domain_ids);
3475                                 iommu->domains[i] = NULL;
3476                                 spin_unlock_irqrestore(&iommu->lock, flags);
3477                                 break;
3478                         }
3479                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3480                 }
3481         }
3482 }
3483
3484 static void vm_domain_exit(struct dmar_domain *domain)
3485 {
3486         /* Domain 0 is reserved, so dont process it */
3487         if (!domain)
3488                 return;
3489
3490         vm_domain_remove_all_dev_info(domain);
3491         /* destroy iovas */
3492         put_iova_domain(&domain->iovad);
3493
3494         /* clear ptes */
3495         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3496
3497         /* free page tables */
3498         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3499
3500         iommu_free_vm_domain(domain);
3501         free_domain_mem(domain);
3502 }
3503
3504 static int intel_iommu_domain_init(struct iommu_domain *domain)
3505 {
3506         struct dmar_domain *dmar_domain;
3507
3508         dmar_domain = iommu_alloc_vm_domain();
3509         if (!dmar_domain) {
3510                 printk(KERN_ERR
3511                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3512                 return -ENOMEM;
3513         }
3514         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3515                 printk(KERN_ERR
3516                         "intel_iommu_domain_init() failed\n");
3517                 vm_domain_exit(dmar_domain);
3518                 return -ENOMEM;
3519         }
3520         domain->priv = dmar_domain;
3521
3522         return 0;
3523 }
3524
3525 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3526 {
3527         struct dmar_domain *dmar_domain = domain->priv;
3528
3529         domain->priv = NULL;
3530         vm_domain_exit(dmar_domain);
3531 }
3532
3533 static int intel_iommu_attach_device(struct iommu_domain *domain,
3534                                      struct device *dev)
3535 {
3536         struct dmar_domain *dmar_domain = domain->priv;
3537         struct pci_dev *pdev = to_pci_dev(dev);
3538         struct intel_iommu *iommu;
3539         int addr_width;
3540         u64 end;
3541
3542         /* normally pdev is not mapped */
3543         if (unlikely(domain_context_mapped(pdev))) {
3544                 struct dmar_domain *old_domain;
3545
3546                 old_domain = find_domain(pdev);
3547                 if (old_domain) {
3548                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3549                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3550                                 domain_remove_one_dev_info(old_domain, pdev);
3551                         else
3552                                 domain_remove_dev_info(old_domain);
3553                 }
3554         }
3555
3556         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3557                                 pdev->devfn);
3558         if (!iommu)
3559                 return -ENODEV;
3560
3561         /* check if this iommu agaw is sufficient for max mapped address */
3562         addr_width = agaw_to_width(iommu->agaw);
3563         end = DOMAIN_MAX_ADDR(addr_width);
3564         end = end & VTD_PAGE_MASK;
3565         if (end < dmar_domain->max_addr) {
3566                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3567                        "sufficient for the mapped address (%llx)\n",
3568                        __func__, iommu->agaw, dmar_domain->max_addr);
3569                 return -EFAULT;
3570         }
3571
3572         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3573 }
3574
3575 static void intel_iommu_detach_device(struct iommu_domain *domain,
3576                                       struct device *dev)
3577 {
3578         struct dmar_domain *dmar_domain = domain->priv;
3579         struct pci_dev *pdev = to_pci_dev(dev);
3580
3581         domain_remove_one_dev_info(dmar_domain, pdev);
3582 }
3583
3584 static int intel_iommu_map_range(struct iommu_domain *domain,
3585                                  unsigned long iova, phys_addr_t hpa,
3586                                  size_t size, int iommu_prot)
3587 {
3588         struct dmar_domain *dmar_domain = domain->priv;
3589         u64 max_addr;
3590         int addr_width;
3591         int prot = 0;
3592         int ret;
3593
3594         if (iommu_prot & IOMMU_READ)
3595                 prot |= DMA_PTE_READ;
3596         if (iommu_prot & IOMMU_WRITE)
3597                 prot |= DMA_PTE_WRITE;
3598         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3599                 prot |= DMA_PTE_SNP;
3600
3601         max_addr = iova + size;
3602         if (dmar_domain->max_addr < max_addr) {
3603                 int min_agaw;
3604                 u64 end;
3605
3606                 /* check if minimum agaw is sufficient for mapped address */
3607                 min_agaw = vm_domain_min_agaw(dmar_domain);
3608                 addr_width = agaw_to_width(min_agaw);
3609                 end = DOMAIN_MAX_ADDR(addr_width);
3610                 end = end & VTD_PAGE_MASK;
3611                 if (end < max_addr) {
3612                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3613                                "sufficient for the mapped address (%llx)\n",
3614                                __func__, min_agaw, max_addr);
3615                         return -EFAULT;
3616                 }
3617                 dmar_domain->max_addr = max_addr;
3618         }
3619         /* Round up size to next multiple of PAGE_SIZE, if it and
3620            the low bits of hpa would take us onto the next page */
3621         size = aligned_nrpages(hpa, size);
3622         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3623                                  hpa >> VTD_PAGE_SHIFT, size, prot);
3624         return ret;
3625 }
3626
3627 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3628                                     unsigned long iova, size_t size)
3629 {
3630         struct dmar_domain *dmar_domain = domain->priv;
3631
3632         if (!size)
3633                 return;
3634
3635         dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3636                             (iova + size - 1) >> VTD_PAGE_SHIFT);
3637
3638         if (dmar_domain->max_addr == iova + size)
3639                 dmar_domain->max_addr = iova;
3640 }
3641
3642 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3643                                             unsigned long iova)
3644 {
3645         struct dmar_domain *dmar_domain = domain->priv;
3646         struct dma_pte *pte;
3647         u64 phys = 0;
3648
3649         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3650         if (pte)
3651                 phys = dma_pte_addr(pte);
3652
3653         return phys;
3654 }
3655
3656 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3657                                       unsigned long cap)
3658 {
3659         struct dmar_domain *dmar_domain = domain->priv;
3660
3661         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3662                 return dmar_domain->iommu_snooping;
3663
3664         return 0;
3665 }
3666
3667 static struct iommu_ops intel_iommu_ops = {
3668         .domain_init    = intel_iommu_domain_init,
3669         .domain_destroy = intel_iommu_domain_destroy,
3670         .attach_dev     = intel_iommu_attach_device,
3671         .detach_dev     = intel_iommu_detach_device,
3672         .map            = intel_iommu_map_range,
3673         .unmap          = intel_iommu_unmap_range,
3674         .iova_to_phys   = intel_iommu_iova_to_phys,
3675         .domain_has_cap = intel_iommu_domain_has_cap,
3676 };
3677
3678 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3679 {
3680         /*
3681          * Mobile 4 Series Chipset neglects to set RWBF capability,
3682          * but needs it:
3683          */
3684         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3685         rwbf_quirk = 1;
3686 }
3687
3688 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3689
3690 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3691    ISOCH DMAR unit for the Azalia sound device, but not give it any
3692    TLB entries, which causes it to deadlock. Check for that.  We do
3693    this in a function called from init_dmars(), instead of in a PCI
3694    quirk, because we don't want to print the obnoxious "BIOS broken"
3695    message if VT-d is actually disabled.
3696 */
3697 static void __init check_tylersburg_isoch(void)
3698 {
3699         struct pci_dev *pdev;
3700         uint32_t vtisochctrl;
3701
3702         /* If there's no Azalia in the system anyway, forget it. */
3703         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3704         if (!pdev)
3705                 return;
3706         pci_dev_put(pdev);
3707
3708         /* System Management Registers. Might be hidden, in which case
3709            we can't do the sanity check. But that's OK, because the
3710            known-broken BIOSes _don't_ actually hide it, so far. */
3711         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3712         if (!pdev)
3713                 return;
3714
3715         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3716                 pci_dev_put(pdev);
3717                 return;
3718         }
3719
3720         pci_dev_put(pdev);
3721
3722         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3723         if (vtisochctrl & 1)
3724                 return;
3725
3726         /* Drop all bits other than the number of TLB entries */
3727         vtisochctrl &= 0x1c;
3728
3729         /* If we have the recommended number of TLB entries (16), fine. */
3730         if (vtisochctrl == 0x10)
3731                 return;
3732
3733         /* Zero TLB entries? You get to ride the short bus to school. */
3734         if (!vtisochctrl) {
3735                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3736                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3737                      dmi_get_system_info(DMI_BIOS_VENDOR),
3738                      dmi_get_system_info(DMI_BIOS_VERSION),
3739                      dmi_get_system_info(DMI_PRODUCT_VERSION));
3740                 iommu_identity_mapping |= IDENTMAP_AZALIA;
3741                 return;
3742         }
3743         
3744         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3745                vtisochctrl);
3746 }