8d6159426311bda6a5bcab491b282e5427d6eff2
[linux-2.6.git] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
45
/* Byte sizes used when flushing the root table and a context table. */
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

/*
 * PCI device classification helpers.
 * The argument is parenthesized on every use so that any pointer-valued
 * expression (for example "p + 1") may be passed, not just a plain name.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
/* Matches vendor 0x8086, device 0x3a3e. */
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define MAX_AGAW_WIDTH 64
60
/*
 * Highest page frame number / highest byte address representable with a
 * guest address width of 'gaw' bits.  'gaw' is parenthesized so that an
 * arbitrary expression can be passed without precedence surprises.
 */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* Convert a byte address into an IOVA page frame number. */
#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
#define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
#define DMA_64BIT_PFN		IOVA_PFN(DMA_BIT_MASK(64))
73
74
75 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
76    are never going to work. */
77 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
78 {
79         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
80 }
81
82 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
83 {
84         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
85 }
/* VT-d page frame number of the given struct page. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	unsigned long mm_pfn = page_to_pfn(pg);

	return mm_to_dma_pfn(mm_pfn);
}
/* VT-d page frame number of the page backing kernel virtual address 'p'. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	struct page *pg = virt_to_page(p);

	return page_to_dma_pfn(pg);
}
94
95 /* global iommu list, set NULL for ignored DMAR units */
96 static struct intel_iommu **g_iommus;
97
98 static void __init check_tylersburg_isoch(void);
99 static int rwbf_quirk;
100
101 /*
102  * 0: Present
103  * 1-11: Reserved
104  * 12-63: Context Ptr (12 - (haw-1))
105  * 64-127: Reserved
106  */
107 struct root_entry {
108         u64     val;
109         u64     rsvd1;
110 };
111 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
112 static inline bool root_present(struct root_entry *root)
113 {
114         return (root->val & 1);
115 }
116 static inline void set_root_present(struct root_entry *root)
117 {
118         root->val |= 1;
119 }
120 static inline void set_root_value(struct root_entry *root, unsigned long value)
121 {
122         root->val |= value & VTD_PAGE_MASK;
123 }
124
125 static inline struct context_entry *
126 get_context_addr_from_root(struct root_entry *root)
127 {
128         return (struct context_entry *)
129                 (root_present(root)?phys_to_virt(
130                 root->val & VTD_PAGE_MASK) :
131                 NULL);
132 }
133
134 /*
135  * low 64 bits:
136  * 0: present
137  * 1: fault processing disable
138  * 2-3: translation type
139  * 12-63: address space root
140  * high 64 bits:
141  * 0-2: address width
142  * 3-6: aval
143  * 8-23: domain id
144  */
145 struct context_entry {
146         u64 lo;
147         u64 hi;
148 };
149
150 static inline bool context_present(struct context_entry *context)
151 {
152         return (context->lo & 1);
153 }
154 static inline void context_set_present(struct context_entry *context)
155 {
156         context->lo |= 1;
157 }
158
159 static inline void context_set_fault_enable(struct context_entry *context)
160 {
161         context->lo &= (((u64)-1) << 2) | 1;
162 }
163
164 static inline void context_set_translation_type(struct context_entry *context,
165                                                 unsigned long value)
166 {
167         context->lo &= (((u64)-1) << 4) | 3;
168         context->lo |= (value & 3) << 2;
169 }
170
171 static inline void context_set_address_root(struct context_entry *context,
172                                             unsigned long value)
173 {
174         context->lo |= value & VTD_PAGE_MASK;
175 }
176
177 static inline void context_set_address_width(struct context_entry *context,
178                                              unsigned long value)
179 {
180         context->hi |= value & 7;
181 }
182
183 static inline void context_set_domain_id(struct context_entry *context,
184                                          unsigned long value)
185 {
186         context->hi |= (value & ((1 << 16) - 1)) << 8;
187 }
188
189 static inline void context_clear_entry(struct context_entry *context)
190 {
191         context->lo = 0;
192         context->hi = 0;
193 }
194
195 /*
196  * 0: readable
197  * 1: writable
198  * 2-6: reserved
199  * 7: super page
200  * 8-10: available
201  * 11: snoop behavior
202  * 12-63: Host physcial address
203  */
204 struct dma_pte {
205         u64 val;
206 };
207
208 static inline void dma_clear_pte(struct dma_pte *pte)
209 {
210         pte->val = 0;
211 }
212
213 static inline void dma_set_pte_readable(struct dma_pte *pte)
214 {
215         pte->val |= DMA_PTE_READ;
216 }
217
218 static inline void dma_set_pte_writable(struct dma_pte *pte)
219 {
220         pte->val |= DMA_PTE_WRITE;
221 }
222
223 static inline void dma_set_pte_snp(struct dma_pte *pte)
224 {
225         pte->val |= DMA_PTE_SNP;
226 }
227
228 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
229 {
230         pte->val = (pte->val & ~3) | (prot & 3);
231 }
232
233 static inline u64 dma_pte_addr(struct dma_pte *pte)
234 {
235 #ifdef CONFIG_64BIT
236         return pte->val & VTD_PAGE_MASK;
237 #else
238         /* Must have a full atomic 64-bit read */
239         return  __cmpxchg64(pte, 0ULL, 0ULL) & VTD_PAGE_MASK;
240 #endif
241 }
242
243 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
244 {
245         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
246 }
247
248 static inline bool dma_pte_present(struct dma_pte *pte)
249 {
250         return (pte->val & 3) != 0;
251 }
252
253 static inline int first_pte_in_page(struct dma_pte *pte)
254 {
255         return !((unsigned long)pte & ~VTD_PAGE_MASK);
256 }
257
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/*
 * Values for dmar_domain.flags.
 */

/* devices under the same p2p bridge are owned in one domain */
#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)

/*
 * domain represents a virtual machine; more than one device
 * across iommus may be owned in one domain, e.g. kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 1)

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 2)
277
278 struct dmar_domain {
279         int     id;                     /* domain id */
280         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
281
282         struct list_head devices;       /* all devices' list */
283         struct iova_domain iovad;       /* iova's that belong to this domain */
284
285         struct dma_pte  *pgd;           /* virtual address */
286         int             gaw;            /* max guest address width */
287
288         /* adjusted guest address width, 0 is level 2 30-bit */
289         int             agaw;
290
291         int             flags;          /* flags to find out type of domain */
292
293         int             iommu_coherency;/* indicate coherency of iommu access */
294         int             iommu_snooping; /* indicate snooping control feature*/
295         int             iommu_count;    /* reference count of iommu */
296         spinlock_t      iommu_lock;     /* protect iommu set in domain */
297         u64             max_addr;       /* maximum mapped address */
298 };
299
300 /* PCI domain-device relationship */
301 struct device_domain_info {
302         struct list_head link;  /* link to domain siblings */
303         struct list_head global; /* link to global list */
304         int segment;            /* PCI domain */
305         u8 bus;                 /* PCI bus number */
306         u8 devfn;               /* PCI devfn number */
307         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
308         struct intel_iommu *iommu; /* IOMMU used by this device */
309         struct dmar_domain *domain; /* pointer to domain */
310 };
311
312 static void flush_unmaps_timeout(unsigned long data);
313
314 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
315
316 #define HIGH_WATER_MARK 250
317 struct deferred_flush_tables {
318         int next;
319         struct iova *iova[HIGH_WATER_MARK];
320         struct dmar_domain *domain[HIGH_WATER_MARK];
321 };
322
323 static struct deferred_flush_tables *deferred_flush;
324
/*
 * Number of iommus; used as the upper bound when scanning a domain's
 * iommu_bmp and when indexing g_iommus[].
 */
static int g_num_of_iommus;

static DEFINE_SPINLOCK(async_umap_flush_lock);
static LIST_HEAD(unmaps_to_do);

/* Bookkeeping for the deferred unmap flush machinery. */
static int timer_on;
static long list_size;

static void domain_remove_dev_info(struct dmar_domain *domain);
335
336 #ifdef CONFIG_DMAR_DEFAULT_ON
337 int dmar_disabled = 0;
338 #else
339 int dmar_disabled = 1;
340 #endif /*CONFIG_DMAR_DEFAULT_ON*/
341
342 static int __initdata dmar_map_gfx = 1;
343 static int dmar_forcedac;
344 static int intel_iommu_strict;
345
346 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
347 static DEFINE_SPINLOCK(device_domain_lock);
348 static LIST_HEAD(device_domain_list);
349
350 static struct iommu_ops intel_iommu_ops;
351
352 static int __init intel_iommu_setup(char *str)
353 {
354         if (!str)
355                 return -EINVAL;
356         while (*str) {
357                 if (!strncmp(str, "on", 2)) {
358                         dmar_disabled = 0;
359                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
360                 } else if (!strncmp(str, "off", 3)) {
361                         dmar_disabled = 1;
362                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
363                 } else if (!strncmp(str, "igfx_off", 8)) {
364                         dmar_map_gfx = 0;
365                         printk(KERN_INFO
366                                 "Intel-IOMMU: disable GFX device mapping\n");
367                 } else if (!strncmp(str, "forcedac", 8)) {
368                         printk(KERN_INFO
369                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
370                         dmar_forcedac = 1;
371                 } else if (!strncmp(str, "strict", 6)) {
372                         printk(KERN_INFO
373                                 "Intel-IOMMU: disable batched IOTLB flush\n");
374                         intel_iommu_strict = 1;
375                 }
376
377                 str += strcspn(str, ",");
378                 while (*str == ',')
379                         str++;
380         }
381         return 0;
382 }
383 __setup("intel_iommu=", intel_iommu_setup);
384
/* Slab caches backing the domain, devinfo and iova allocators below. */
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;
388
389 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
390 {
391         unsigned int flags;
392         void *vaddr;
393
394         /* trying to avoid low memory issues */
395         flags = current->flags & PF_MEMALLOC;
396         current->flags |= PF_MEMALLOC;
397         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
398         current->flags &= (~PF_MEMALLOC | flags);
399         return vaddr;
400 }
401
402
403 static inline void *alloc_pgtable_page(void)
404 {
405         unsigned int flags;
406         void *vaddr;
407
408         /* trying to avoid low memory issues */
409         flags = current->flags & PF_MEMALLOC;
410         current->flags |= PF_MEMALLOC;
411         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
412         current->flags &= (~PF_MEMALLOC | flags);
413         return vaddr;
414 }
415
/* Release a page obtained from alloc_pgtable_page(). */
static inline void free_pgtable_page(void *vaddr)
{
	unsigned long addr = (unsigned long)vaddr;

	free_page(addr);
}
420
421 static inline void *alloc_domain_mem(void)
422 {
423         return iommu_kmem_cache_alloc(iommu_domain_cache);
424 }
425
426 static void free_domain_mem(void *vaddr)
427 {
428         kmem_cache_free(iommu_domain_cache, vaddr);
429 }
430
431 static inline void * alloc_devinfo_mem(void)
432 {
433         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
434 }
435
436 static inline void free_devinfo_mem(void *vaddr)
437 {
438         kmem_cache_free(iommu_devinfo_cache, vaddr);
439 }
440
441 struct iova *alloc_iova_mem(void)
442 {
443         return iommu_kmem_cache_alloc(iommu_iova_cache);
444 }
445
446 void free_iova_mem(struct iova *iova)
447 {
448         kmem_cache_free(iommu_iova_cache, iova);
449 }
450
451
452 static inline int width_to_agaw(int width);
453
454 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
455 {
456         unsigned long sagaw;
457         int agaw = -1;
458
459         sagaw = cap_sagaw(iommu->cap);
460         for (agaw = width_to_agaw(max_gaw);
461              agaw >= 0; agaw--) {
462                 if (test_bit(agaw, &sagaw))
463                         break;
464         }
465
466         return agaw;
467 }
468
469 /*
470  * Calculate max SAGAW for each iommu.
471  */
472 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
473 {
474         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
475 }
476
477 /*
478  * calculate agaw for each iommu.
479  * "SAGAW" may be different across iommus, use a default agaw, and
480  * get a supported less agaw for iommus that don't support the default agaw.
481  */
482 int iommu_calculate_agaw(struct intel_iommu *iommu)
483 {
484         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
485 }
486
487 /* This functionin only returns single iommu in a domain */
488 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
489 {
490         int iommu_id;
491
492         /* si_domain and vm domain should not get here. */
493         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
494         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
495
496         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
497         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
498                 return NULL;
499
500         return g_iommus[iommu_id];
501 }
502
503 static void domain_update_iommu_coherency(struct dmar_domain *domain)
504 {
505         int i;
506
507         domain->iommu_coherency = 1;
508
509         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
510         for (; i < g_num_of_iommus; ) {
511                 if (!ecap_coherent(g_iommus[i]->ecap)) {
512                         domain->iommu_coherency = 0;
513                         break;
514                 }
515                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
516         }
517 }
518
519 static void domain_update_iommu_snooping(struct dmar_domain *domain)
520 {
521         int i;
522
523         domain->iommu_snooping = 1;
524
525         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
526         for (; i < g_num_of_iommus; ) {
527                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
528                         domain->iommu_snooping = 0;
529                         break;
530                 }
531                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
532         }
533 }
534
/*
 * Some capabilities may differ across iommus; refresh the domain-wide
 * coherency and snooping flags from the iommus currently in the domain.
 */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain_update_iommu_snooping(domain);
}
541
542 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
543 {
544         struct dmar_drhd_unit *drhd = NULL;
545         int i;
546
547         for_each_drhd_unit(drhd) {
548                 if (drhd->ignored)
549                         continue;
550                 if (segment != drhd->segment)
551                         continue;
552
553                 for (i = 0; i < drhd->devices_cnt; i++) {
554                         if (drhd->devices[i] &&
555                             drhd->devices[i]->bus->number == bus &&
556                             drhd->devices[i]->devfn == devfn)
557                                 return drhd->iommu;
558                         if (drhd->devices[i] &&
559                             drhd->devices[i]->subordinate &&
560                             drhd->devices[i]->subordinate->number <= bus &&
561                             drhd->devices[i]->subordinate->subordinate >= bus)
562                                 return drhd->iommu;
563                 }
564
565                 if (drhd->include_all)
566                         return drhd->iommu;
567         }
568
569         return NULL;
570 }
571
572 static void domain_flush_cache(struct dmar_domain *domain,
573                                void *addr, int size)
574 {
575         if (!domain->iommu_coherency)
576                 clflush_cache_range(addr, size);
577 }
578
579 /* Gets context entry for a given bus and devfn */
580 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
581                 u8 bus, u8 devfn)
582 {
583         struct root_entry *root;
584         struct context_entry *context;
585         unsigned long phy_addr;
586         unsigned long flags;
587
588         spin_lock_irqsave(&iommu->lock, flags);
589         root = &iommu->root_entry[bus];
590         context = get_context_addr_from_root(root);
591         if (!context) {
592                 context = (struct context_entry *)alloc_pgtable_page();
593                 if (!context) {
594                         spin_unlock_irqrestore(&iommu->lock, flags);
595                         return NULL;
596                 }
597                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
598                 phy_addr = virt_to_phys((void *)context);
599                 set_root_value(root, phy_addr);
600                 set_root_present(root);
601                 __iommu_flush_cache(iommu, root, sizeof(*root));
602         }
603         spin_unlock_irqrestore(&iommu->lock, flags);
604         return &context[devfn];
605 }
606
607 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
608 {
609         struct root_entry *root;
610         struct context_entry *context;
611         int ret;
612         unsigned long flags;
613
614         spin_lock_irqsave(&iommu->lock, flags);
615         root = &iommu->root_entry[bus];
616         context = get_context_addr_from_root(root);
617         if (!context) {
618                 ret = 0;
619                 goto out;
620         }
621         ret = context_present(&context[devfn]);
622 out:
623         spin_unlock_irqrestore(&iommu->lock, flags);
624         return ret;
625 }
626
627 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
628 {
629         struct root_entry *root;
630         struct context_entry *context;
631         unsigned long flags;
632
633         spin_lock_irqsave(&iommu->lock, flags);
634         root = &iommu->root_entry[bus];
635         context = get_context_addr_from_root(root);
636         if (context) {
637                 context_clear_entry(&context[devfn]);
638                 __iommu_flush_cache(iommu, &context[devfn], \
639                         sizeof(*context));
640         }
641         spin_unlock_irqrestore(&iommu->lock, flags);
642 }
643
644 static void free_context_table(struct intel_iommu *iommu)
645 {
646         struct root_entry *root;
647         int i;
648         unsigned long flags;
649         struct context_entry *context;
650
651         spin_lock_irqsave(&iommu->lock, flags);
652         if (!iommu->root_entry) {
653                 goto out;
654         }
655         for (i = 0; i < ROOT_ENTRY_NR; i++) {
656                 root = &iommu->root_entry[i];
657                 context = get_context_addr_from_root(root);
658                 if (context)
659                         free_pgtable_page(context);
660         }
661         free_pgtable_page(iommu->root_entry);
662         iommu->root_entry = NULL;
663 out:
664         spin_unlock_irqrestore(&iommu->lock, flags);
665 }
666
/*
 * page table handling
 *
 * Each page-table level decodes LEVEL_STRIDE bits of the page frame
 * number; LEVEL_MASK extracts one level's index.
 */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
670
/* Number of page-table levels for a given agaw (agaw 0 means 2 levels). */
static inline int agaw_to_level(int agaw)
{
	int levels = 2 + agaw;

	return levels;
}
675
676 static inline int agaw_to_width(int agaw)
677 {
678         return 30 + agaw * LEVEL_STRIDE;
679
680 }
681
682 static inline int width_to_agaw(int width)
683 {
684         return (width - 30) / LEVEL_STRIDE;
685 }
686
687 static inline unsigned int level_to_offset_bits(int level)
688 {
689         return (level - 1) * LEVEL_STRIDE;
690 }
691
692 static inline int pfn_level_offset(unsigned long pfn, int level)
693 {
694         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
695 }
696
/* Mask that clears the pfn bits below those indexed at @level. */
static inline unsigned long level_mask(int level)
{
	unsigned int bits = level_to_offset_bits(level);

	return ~0UL << bits;
}
701
/* Number of pfns covered by a single entry at @level. */
static inline unsigned long level_size(int level)
{
	unsigned int bits = level_to_offset_bits(level);

	return 1UL << bits;
}
706
/* Round @pfn up to the next multiple of level_size(@level). */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	unsigned long bumped = pfn + level_size(level) - 1;

	return bumped & level_mask(level);
}
711
712 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
713                                       unsigned long pfn)
714 {
715         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
716         struct dma_pte *parent, *pte = NULL;
717         int level = agaw_to_level(domain->agaw);
718         int offset;
719
720         BUG_ON(!domain->pgd);
721         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
722         parent = domain->pgd;
723
724         while (level > 0) {
725                 void *tmp_page;
726
727                 offset = pfn_level_offset(pfn, level);
728                 pte = &parent[offset];
729                 if (level == 1)
730                         break;
731
732                 if (!dma_pte_present(pte)) {
733                         uint64_t pteval;
734
735                         tmp_page = alloc_pgtable_page();
736
737                         if (!tmp_page)
738                                 return NULL;
739
740                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
741                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
742                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
743                                 /* Someone else set it while we were thinking; use theirs. */
744                                 free_pgtable_page(tmp_page);
745                         } else {
746                                 dma_pte_addr(pte);
747                                 domain_flush_cache(domain, pte, sizeof(*pte));
748                         }
749                 }
750                 parent = phys_to_virt(dma_pte_addr(pte));
751                 level--;
752         }
753
754         return pte;
755 }
756
757 /* return address's pte at specific level */
758 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
759                                          unsigned long pfn,
760                                          int level)
761 {
762         struct dma_pte *parent, *pte = NULL;
763         int total = agaw_to_level(domain->agaw);
764         int offset;
765
766         parent = domain->pgd;
767         while (level <= total) {
768                 offset = pfn_level_offset(pfn, total);
769                 pte = &parent[offset];
770                 if (level == total)
771                         return pte;
772
773                 if (!dma_pte_present(pte))
774                         break;
775                 parent = phys_to_virt(dma_pte_addr(pte));
776                 total--;
777         }
778         return NULL;
779 }
780
781 /* clear last level pte, a tlb flush should be followed */
782 static void dma_pte_clear_range(struct dmar_domain *domain,
783                                 unsigned long start_pfn,
784                                 unsigned long last_pfn)
785 {
786         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
787         struct dma_pte *first_pte, *pte;
788
789         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
790         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
791         BUG_ON(start_pfn > last_pfn);
792
793         /* we don't need lock here; nobody else touches the iova range */
794         do {
795                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
796                 if (!pte) {
797                         start_pfn = align_to_level(start_pfn + 1, 2);
798                         continue;
799                 }
800                 do { 
801                         dma_clear_pte(pte);
802                         start_pfn++;
803                         pte++;
804                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
805
806                 domain_flush_cache(domain, first_pte,
807                                    (void *)pte - (void *)first_pte);
808
809         } while (start_pfn && start_pfn <= last_pfn);
810 }
811
812 /* free page table pages. last level pte should already be cleared */
813 static void dma_pte_free_pagetable(struct dmar_domain *domain,
814                                    unsigned long start_pfn,
815                                    unsigned long last_pfn)
816 {
817         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
818         struct dma_pte *first_pte, *pte;
819         int total = agaw_to_level(domain->agaw);
820         int level;
821         unsigned long tmp;
822
823         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
824         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
825         BUG_ON(start_pfn > last_pfn);
826
827         /* We don't need lock here; nobody else touches the iova range */
828         level = 2;
829         while (level <= total) {
830                 tmp = align_to_level(start_pfn, level);
831
832                 /* If we can't even clear one PTE at this level, we're done */
833                 if (tmp + level_size(level) - 1 > last_pfn)
834                         return;
835
836                 do {
837                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
838                         if (!pte) {
839                                 tmp = align_to_level(tmp + 1, level + 1);
840                                 continue;
841                         }
842                         do {
843                                 if (dma_pte_present(pte)) {
844                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
845                                         dma_clear_pte(pte);
846                                 }
847                                 pte++;
848                                 tmp += level_size(level);
849                         } while (!first_pte_in_page(pte) &&
850                                  tmp + level_size(level) - 1 <= last_pfn);
851
852                         domain_flush_cache(domain, first_pte,
853                                            (void *)pte - (void *)first_pte);
854                         
855                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
856                 level++;
857         }
858         /* free pgd */
859         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
860                 free_pgtable_page(domain->pgd);
861                 domain->pgd = NULL;
862         }
863 }
864
865 /* iommu handling */
866 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
867 {
868         struct root_entry *root;
869         unsigned long flags;
870
871         root = (struct root_entry *)alloc_pgtable_page();
872         if (!root)
873                 return -ENOMEM;
874
875         __iommu_flush_cache(iommu, root, ROOT_SIZE);
876
877         spin_lock_irqsave(&iommu->lock, flags);
878         iommu->root_entry = root;
879         spin_unlock_irqrestore(&iommu->lock, flags);
880
881         return 0;
882 }
883
884 static void iommu_set_root_entry(struct intel_iommu *iommu)
885 {
886         void *addr;
887         u32 sts;
888         unsigned long flag;
889
890         addr = iommu->root_entry;
891
892         spin_lock_irqsave(&iommu->register_lock, flag);
893         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
894
895         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
896
897         /* Make sure hardware complete it */
898         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
899                       readl, (sts & DMA_GSTS_RTPS), sts);
900
901         spin_unlock_irqrestore(&iommu->register_lock, flag);
902 }
903
904 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
905 {
906         u32 val;
907         unsigned long flag;
908
909         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
910                 return;
911
912         spin_lock_irqsave(&iommu->register_lock, flag);
913         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
914
915         /* Make sure hardware complete it */
916         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
917                       readl, (!(val & DMA_GSTS_WBFS)), val);
918
919         spin_unlock_irqrestore(&iommu->register_lock, flag);
920 }
921
922 /* return value determine if we need a write buffer flush */
923 static void __iommu_flush_context(struct intel_iommu *iommu,
924                                   u16 did, u16 source_id, u8 function_mask,
925                                   u64 type)
926 {
927         u64 val = 0;
928         unsigned long flag;
929
930         switch (type) {
931         case DMA_CCMD_GLOBAL_INVL:
932                 val = DMA_CCMD_GLOBAL_INVL;
933                 break;
934         case DMA_CCMD_DOMAIN_INVL:
935                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
936                 break;
937         case DMA_CCMD_DEVICE_INVL:
938                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
939                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
940                 break;
941         default:
942                 BUG();
943         }
944         val |= DMA_CCMD_ICC;
945
946         spin_lock_irqsave(&iommu->register_lock, flag);
947         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
948
949         /* Make sure hardware complete it */
950         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
951                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
952
953         spin_unlock_irqrestore(&iommu->register_lock, flag);
954 }
955
956 /* return value determine if we need a write buffer flush */
957 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
958                                 u64 addr, unsigned int size_order, u64 type)
959 {
960         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
961         u64 val = 0, val_iva = 0;
962         unsigned long flag;
963
964         switch (type) {
965         case DMA_TLB_GLOBAL_FLUSH:
966                 /* global flush doesn't need set IVA_REG */
967                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
968                 break;
969         case DMA_TLB_DSI_FLUSH:
970                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
971                 break;
972         case DMA_TLB_PSI_FLUSH:
973                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
974                 /* Note: always flush non-leaf currently */
975                 val_iva = size_order | addr;
976                 break;
977         default:
978                 BUG();
979         }
980         /* Note: set drain read/write */
981 #if 0
982         /*
983          * This is probably to be super secure.. Looks like we can
984          * ignore it without any impact.
985          */
986         if (cap_read_drain(iommu->cap))
987                 val |= DMA_TLB_READ_DRAIN;
988 #endif
989         if (cap_write_drain(iommu->cap))
990                 val |= DMA_TLB_WRITE_DRAIN;
991
992         spin_lock_irqsave(&iommu->register_lock, flag);
993         /* Note: Only uses first TLB reg currently */
994         if (val_iva)
995                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
996         dmar_writeq(iommu->reg + tlb_offset + 8, val);
997
998         /* Make sure hardware complete it */
999         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1000                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1001
1002         spin_unlock_irqrestore(&iommu->register_lock, flag);
1003
1004         /* check IOTLB invalidation granularity */
1005         if (DMA_TLB_IAIG(val) == 0)
1006                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1007         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1008                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1009                         (unsigned long long)DMA_TLB_IIRG(type),
1010                         (unsigned long long)DMA_TLB_IAIG(val));
1011 }
1012
1013 static struct device_domain_info *iommu_support_dev_iotlb(
1014         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1015 {
1016         int found = 0;
1017         unsigned long flags;
1018         struct device_domain_info *info;
1019         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1020
1021         if (!ecap_dev_iotlb_support(iommu->ecap))
1022                 return NULL;
1023
1024         if (!iommu->qi)
1025                 return NULL;
1026
1027         spin_lock_irqsave(&device_domain_lock, flags);
1028         list_for_each_entry(info, &domain->devices, link)
1029                 if (info->bus == bus && info->devfn == devfn) {
1030                         found = 1;
1031                         break;
1032                 }
1033         spin_unlock_irqrestore(&device_domain_lock, flags);
1034
1035         if (!found || !info->dev)
1036                 return NULL;
1037
1038         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1039                 return NULL;
1040
1041         if (!dmar_find_matched_atsr_unit(info->dev))
1042                 return NULL;
1043
1044         info->iommu = iommu;
1045
1046         return info;
1047 }
1048
1049 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1050 {
1051         if (!info)
1052                 return;
1053
1054         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1055 }
1056
1057 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1058 {
1059         if (!info->dev || !pci_ats_enabled(info->dev))
1060                 return;
1061
1062         pci_disable_ats(info->dev);
1063 }
1064
1065 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1066                                   u64 addr, unsigned mask)
1067 {
1068         u16 sid, qdep;
1069         unsigned long flags;
1070         struct device_domain_info *info;
1071
1072         spin_lock_irqsave(&device_domain_lock, flags);
1073         list_for_each_entry(info, &domain->devices, link) {
1074                 if (!info->dev || !pci_ats_enabled(info->dev))
1075                         continue;
1076
1077                 sid = info->bus << 8 | info->devfn;
1078                 qdep = pci_ats_queue_depth(info->dev);
1079                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1080         }
1081         spin_unlock_irqrestore(&device_domain_lock, flags);
1082 }
1083
1084 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1085                                   unsigned long pfn, unsigned int pages)
1086 {
1087         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1088         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1089
1090         BUG_ON(pages == 0);
1091
1092         /*
1093          * Fallback to domain selective flush if no PSI support or the size is
1094          * too big.
1095          * PSI requires page size to be 2 ^ x, and the base address is naturally
1096          * aligned to the size
1097          */
1098         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1099                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1100                                                 DMA_TLB_DSI_FLUSH);
1101         else
1102                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1103                                                 DMA_TLB_PSI_FLUSH);
1104
1105         /*
1106          * In caching mode, domain ID 0 is reserved for non-present to present
1107          * mapping flush. Device IOTLB doesn't need to be flushed in this case.
1108          */
1109         if (!cap_caching_mode(iommu->cap) || did)
1110                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1111 }
1112
1113 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1114 {
1115         u32 pmen;
1116         unsigned long flags;
1117
1118         spin_lock_irqsave(&iommu->register_lock, flags);
1119         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1120         pmen &= ~DMA_PMEN_EPM;
1121         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1122
1123         /* wait for the protected region status bit to clear */
1124         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1125                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1126
1127         spin_unlock_irqrestore(&iommu->register_lock, flags);
1128 }
1129
1130 static int iommu_enable_translation(struct intel_iommu *iommu)
1131 {
1132         u32 sts;
1133         unsigned long flags;
1134
1135         spin_lock_irqsave(&iommu->register_lock, flags);
1136         iommu->gcmd |= DMA_GCMD_TE;
1137         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1138
1139         /* Make sure hardware complete it */
1140         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1141                       readl, (sts & DMA_GSTS_TES), sts);
1142
1143         spin_unlock_irqrestore(&iommu->register_lock, flags);
1144         return 0;
1145 }
1146
1147 static int iommu_disable_translation(struct intel_iommu *iommu)
1148 {
1149         u32 sts;
1150         unsigned long flag;
1151
1152         spin_lock_irqsave(&iommu->register_lock, flag);
1153         iommu->gcmd &= ~DMA_GCMD_TE;
1154         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1155
1156         /* Make sure hardware complete it */
1157         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1158                       readl, (!(sts & DMA_GSTS_TES)), sts);
1159
1160         spin_unlock_irqrestore(&iommu->register_lock, flag);
1161         return 0;
1162 }
1163
1164
1165 static int iommu_init_domains(struct intel_iommu *iommu)
1166 {
1167         unsigned long ndomains;
1168         unsigned long nlongs;
1169
1170         ndomains = cap_ndoms(iommu->cap);
1171         pr_debug("Number of Domains supportd <%ld>\n", ndomains);
1172         nlongs = BITS_TO_LONGS(ndomains);
1173
1174         spin_lock_init(&iommu->lock);
1175
1176         /* TBD: there might be 64K domains,
1177          * consider other allocation for future chip
1178          */
1179         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1180         if (!iommu->domain_ids) {
1181                 printk(KERN_ERR "Allocating domain id array failed\n");
1182                 return -ENOMEM;
1183         }
1184         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1185                         GFP_KERNEL);
1186         if (!iommu->domains) {
1187                 printk(KERN_ERR "Allocating domain array failed\n");
1188                 return -ENOMEM;
1189         }
1190
1191         /*
1192          * if Caching mode is set, then invalid translations are tagged
1193          * with domainid 0. Hence we need to pre-allocate it.
1194          */
1195         if (cap_caching_mode(iommu->cap))
1196                 set_bit(0, iommu->domain_ids);
1197         return 0;
1198 }
1199
1200
1201 static void domain_exit(struct dmar_domain *domain);
1202 static void vm_domain_exit(struct dmar_domain *domain);
1203
1204 void free_dmar_iommu(struct intel_iommu *iommu)
1205 {
1206         struct dmar_domain *domain;
1207         int i;
1208         unsigned long flags;
1209
1210         if ((iommu->domains) && (iommu->domain_ids)) {
1211                 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1212                 for (; i < cap_ndoms(iommu->cap); ) {
1213                         domain = iommu->domains[i];
1214                         clear_bit(i, iommu->domain_ids);
1215
1216                         spin_lock_irqsave(&domain->iommu_lock, flags);
1217                         if (--domain->iommu_count == 0) {
1218                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1219                                         vm_domain_exit(domain);
1220                                 else
1221                                         domain_exit(domain);
1222                         }
1223                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1224
1225                         i = find_next_bit(iommu->domain_ids,
1226                                 cap_ndoms(iommu->cap), i+1);
1227                 }
1228         }
1229
1230         if (iommu->gcmd & DMA_GCMD_TE)
1231                 iommu_disable_translation(iommu);
1232
1233         if (iommu->irq) {
1234                 set_irq_data(iommu->irq, NULL);
1235                 /* This will mask the irq */
1236                 free_irq(iommu->irq, iommu);
1237                 destroy_irq(iommu->irq);
1238         }
1239
1240         kfree(iommu->domains);
1241         kfree(iommu->domain_ids);
1242
1243         g_iommus[iommu->seq_id] = NULL;
1244
1245         /* if all iommus are freed, free g_iommus */
1246         for (i = 0; i < g_num_of_iommus; i++) {
1247                 if (g_iommus[i])
1248                         break;
1249         }
1250
1251         if (i == g_num_of_iommus)
1252                 kfree(g_iommus);
1253
1254         /* free context mapping */
1255         free_context_table(iommu);
1256 }
1257
1258 static struct dmar_domain *alloc_domain(void)
1259 {
1260         struct dmar_domain *domain;
1261
1262         domain = alloc_domain_mem();
1263         if (!domain)
1264                 return NULL;
1265
1266         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1267         domain->flags = 0;
1268
1269         return domain;
1270 }
1271
1272 static int iommu_attach_domain(struct dmar_domain *domain,
1273                                struct intel_iommu *iommu)
1274 {
1275         int num;
1276         unsigned long ndomains;
1277         unsigned long flags;
1278
1279         ndomains = cap_ndoms(iommu->cap);
1280
1281         spin_lock_irqsave(&iommu->lock, flags);
1282
1283         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1284         if (num >= ndomains) {
1285                 spin_unlock_irqrestore(&iommu->lock, flags);
1286                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1287                 return -ENOMEM;
1288         }
1289
1290         domain->id = num;
1291         set_bit(num, iommu->domain_ids);
1292         set_bit(iommu->seq_id, &domain->iommu_bmp);
1293         iommu->domains[num] = domain;
1294         spin_unlock_irqrestore(&iommu->lock, flags);
1295
1296         return 0;
1297 }
1298
1299 static void iommu_detach_domain(struct dmar_domain *domain,
1300                                 struct intel_iommu *iommu)
1301 {
1302         unsigned long flags;
1303         int num, ndomains;
1304         int found = 0;
1305
1306         spin_lock_irqsave(&iommu->lock, flags);
1307         ndomains = cap_ndoms(iommu->cap);
1308         num = find_first_bit(iommu->domain_ids, ndomains);
1309         for (; num < ndomains; ) {
1310                 if (iommu->domains[num] == domain) {
1311                         found = 1;
1312                         break;
1313                 }
1314                 num = find_next_bit(iommu->domain_ids,
1315                                     cap_ndoms(iommu->cap), num+1);
1316         }
1317
1318         if (found) {
1319                 clear_bit(num, iommu->domain_ids);
1320                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1321                 iommu->domains[num] = NULL;
1322         }
1323         spin_unlock_irqrestore(&iommu->lock, flags);
1324 }
1325
1326 static struct iova_domain reserved_iova_list;
1327 static struct lock_class_key reserved_rbtree_key;
1328
1329 static void dmar_init_reserved_ranges(void)
1330 {
1331         struct pci_dev *pdev = NULL;
1332         struct iova *iova;
1333         int i;
1334
1335         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1336
1337         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1338                 &reserved_rbtree_key);
1339
1340         /* IOAPIC ranges shouldn't be accessed by DMA */
1341         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1342                 IOVA_PFN(IOAPIC_RANGE_END));
1343         if (!iova)
1344                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1345
1346         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1347         for_each_pci_dev(pdev) {
1348                 struct resource *r;
1349
1350                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1351                         r = &pdev->resource[i];
1352                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1353                                 continue;
1354                         iova = reserve_iova(&reserved_iova_list,
1355                                             IOVA_PFN(r->start),
1356                                             IOVA_PFN(r->end));
1357                         if (!iova)
1358                                 printk(KERN_ERR "Reserve iova failed\n");
1359                 }
1360         }
1361
1362 }
1363
1364 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1365 {
1366         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1367 }
1368
/*
 * Convert a guest address width to an adjusted width.
 *
 * For @gaw >= 12 the result is @gaw rounded up to the next value of the
 * form 12 + 9 * k.  The result never exceeds 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1382
1383 static int domain_init(struct dmar_domain *domain, int guest_width)
1384 {
1385         struct intel_iommu *iommu;
1386         int adjust_width, agaw;
1387         unsigned long sagaw;
1388
1389         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1390         spin_lock_init(&domain->iommu_lock);
1391
1392         domain_reserve_special_ranges(domain);
1393
1394         /* calculate AGAW */
1395         iommu = domain_get_iommu(domain);
1396         if (guest_width > cap_mgaw(iommu->cap))
1397                 guest_width = cap_mgaw(iommu->cap);
1398         domain->gaw = guest_width;
1399         adjust_width = guestwidth_to_adjustwidth(guest_width);
1400         agaw = width_to_agaw(adjust_width);
1401         sagaw = cap_sagaw(iommu->cap);
1402         if (!test_bit(agaw, &sagaw)) {
1403                 /* hardware doesn't support it, choose a bigger one */
1404                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1405                 agaw = find_next_bit(&sagaw, 5, agaw);
1406                 if (agaw >= 5)
1407                         return -ENODEV;
1408         }
1409         domain->agaw = agaw;
1410         INIT_LIST_HEAD(&domain->devices);
1411
1412         if (ecap_coherent(iommu->ecap))
1413                 domain->iommu_coherency = 1;
1414         else
1415                 domain->iommu_coherency = 0;
1416
1417         if (ecap_sc_support(iommu->ecap))
1418                 domain->iommu_snooping = 1;
1419         else
1420                 domain->iommu_snooping = 0;
1421
1422         domain->iommu_count = 1;
1423
1424         /* always allocate the top pgd */
1425         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1426         if (!domain->pgd)
1427                 return -ENOMEM;
1428         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1429         return 0;
1430 }
1431
1432 static void domain_exit(struct dmar_domain *domain)
1433 {
1434         struct dmar_drhd_unit *drhd;
1435         struct intel_iommu *iommu;
1436
1437         /* Domain 0 is reserved, so dont process it */
1438         if (!domain)
1439                 return;
1440
1441         domain_remove_dev_info(domain);
1442         /* destroy iovas */
1443         put_iova_domain(&domain->iovad);
1444
1445         /* clear ptes */
1446         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1447
1448         /* free page tables */
1449         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1450
1451         for_each_active_iommu(iommu, drhd)
1452                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1453                         iommu_detach_domain(domain, iommu);
1454
1455         free_domain_mem(domain);
1456 }
1457
1458 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1459                                  u8 bus, u8 devfn, int translation)
1460 {
1461         struct context_entry *context;
1462         unsigned long flags;
1463         struct intel_iommu *iommu;
1464         struct dma_pte *pgd;
1465         unsigned long num;
1466         unsigned long ndomains;
1467         int id;
1468         int agaw;
1469         struct device_domain_info *info = NULL;
1470
1471         pr_debug("Set context mapping for %02x:%02x.%d\n",
1472                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1473
1474         BUG_ON(!domain->pgd);
1475         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1476                translation != CONTEXT_TT_MULTI_LEVEL);
1477
1478         iommu = device_to_iommu(segment, bus, devfn);
1479         if (!iommu)
1480                 return -ENODEV;
1481
1482         context = device_to_context_entry(iommu, bus, devfn);
1483         if (!context)
1484                 return -ENOMEM;
1485         spin_lock_irqsave(&iommu->lock, flags);
1486         if (context_present(context)) {
1487                 spin_unlock_irqrestore(&iommu->lock, flags);
1488                 return 0;
1489         }
1490
1491         id = domain->id;
1492         pgd = domain->pgd;
1493
1494         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1495             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1496                 int found = 0;
1497
1498                 /* find an available domain id for this device in iommu */
1499                 ndomains = cap_ndoms(iommu->cap);
1500                 num = find_first_bit(iommu->domain_ids, ndomains);
1501                 for (; num < ndomains; ) {
1502                         if (iommu->domains[num] == domain) {
1503                                 id = num;
1504                                 found = 1;
1505                                 break;
1506                         }
1507                         num = find_next_bit(iommu->domain_ids,
1508                                             cap_ndoms(iommu->cap), num+1);
1509                 }
1510
1511                 if (found == 0) {
1512                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1513                         if (num >= ndomains) {
1514                                 spin_unlock_irqrestore(&iommu->lock, flags);
1515                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1516                                 return -EFAULT;
1517                         }
1518
1519                         set_bit(num, iommu->domain_ids);
1520                         iommu->domains[num] = domain;
1521                         id = num;
1522                 }
1523
1524                 /* Skip top levels of page tables for
1525                  * iommu which has less agaw than default.
1526                  */
1527                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1528                         pgd = phys_to_virt(dma_pte_addr(pgd));
1529                         if (!dma_pte_present(pgd)) {
1530                                 spin_unlock_irqrestore(&iommu->lock, flags);
1531                                 return -ENOMEM;
1532                         }
1533                 }
1534         }
1535
1536         context_set_domain_id(context, id);
1537
1538         if (translation != CONTEXT_TT_PASS_THROUGH) {
1539                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1540                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1541                                      CONTEXT_TT_MULTI_LEVEL;
1542         }
1543         /*
1544          * In pass through mode, AW must be programmed to indicate the largest
1545          * AGAW value supported by hardware. And ASR is ignored by hardware.
1546          */
1547         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1548                 context_set_address_width(context, iommu->msagaw);
1549         else {
1550                 context_set_address_root(context, virt_to_phys(pgd));
1551                 context_set_address_width(context, iommu->agaw);
1552         }
1553
1554         context_set_translation_type(context, translation);
1555         context_set_fault_enable(context);
1556         context_set_present(context);
1557         domain_flush_cache(domain, context, sizeof(*context));
1558
1559         /*
1560          * It's a non-present to present mapping. If hardware doesn't cache
1561          * non-present entry we only need to flush the write-buffer. If the
1562          * _does_ cache non-present entries, then it does so in the special
1563          * domain #0, which we have to flush:
1564          */
1565         if (cap_caching_mode(iommu->cap)) {
1566                 iommu->flush.flush_context(iommu, 0,
1567                                            (((u16)bus) << 8) | devfn,
1568                                            DMA_CCMD_MASK_NOBIT,
1569                                            DMA_CCMD_DEVICE_INVL);
1570                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH);
1571         } else {
1572                 iommu_flush_write_buffer(iommu);
1573         }
1574         iommu_enable_dev_iotlb(info);
1575         spin_unlock_irqrestore(&iommu->lock, flags);
1576
1577         spin_lock_irqsave(&domain->iommu_lock, flags);
1578         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1579                 domain->iommu_count++;
1580                 domain_update_iommu_cap(domain);
1581         }
1582         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1583         return 0;
1584 }
1585
1586 static int
1587 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1588                         int translation)
1589 {
1590         int ret;
1591         struct pci_dev *tmp, *parent;
1592
1593         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1594                                          pdev->bus->number, pdev->devfn,
1595                                          translation);
1596         if (ret)
1597                 return ret;
1598
1599         /* dependent device mapping */
1600         tmp = pci_find_upstream_pcie_bridge(pdev);
1601         if (!tmp)
1602                 return 0;
1603         /* Secondary interface's bus number and devfn 0 */
1604         parent = pdev->bus->self;
1605         while (parent != tmp) {
1606                 ret = domain_context_mapping_one(domain,
1607                                                  pci_domain_nr(parent->bus),
1608                                                  parent->bus->number,
1609                                                  parent->devfn, translation);
1610                 if (ret)
1611                         return ret;
1612                 parent = parent->bus->self;
1613         }
1614         if (pci_is_pcie(tmp)) /* this is a PCIE-to-PCI bridge */
1615                 return domain_context_mapping_one(domain,
1616                                         pci_domain_nr(tmp->subordinate),
1617                                         tmp->subordinate->number, 0,
1618                                         translation);
1619         else /* this is a legacy PCI bridge */
1620                 return domain_context_mapping_one(domain,
1621                                                   pci_domain_nr(tmp->bus),
1622                                                   tmp->bus->number,
1623                                                   tmp->devfn,
1624                                                   translation);
1625 }
1626
1627 static int domain_context_mapped(struct pci_dev *pdev)
1628 {
1629         int ret;
1630         struct pci_dev *tmp, *parent;
1631         struct intel_iommu *iommu;
1632
1633         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1634                                 pdev->devfn);
1635         if (!iommu)
1636                 return -ENODEV;
1637
1638         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1639         if (!ret)
1640                 return ret;
1641         /* dependent device mapping */
1642         tmp = pci_find_upstream_pcie_bridge(pdev);
1643         if (!tmp)
1644                 return ret;
1645         /* Secondary interface's bus number and devfn 0 */
1646         parent = pdev->bus->self;
1647         while (parent != tmp) {
1648                 ret = device_context_mapped(iommu, parent->bus->number,
1649                                             parent->devfn);
1650                 if (!ret)
1651                         return ret;
1652                 parent = parent->bus->self;
1653         }
1654         if (pci_is_pcie(tmp))
1655                 return device_context_mapped(iommu, tmp->subordinate->number,
1656                                              0);
1657         else
1658                 return device_context_mapped(iommu, tmp->bus->number,
1659                                              tmp->devfn);
1660 }
1661
1662 /* Returns a number of VTD pages, but aligned to MM page size */
1663 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1664                                             size_t size)
1665 {
1666         host_addr &= ~PAGE_MASK;
1667         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1668 }
1669
1670 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1671                             struct scatterlist *sg, unsigned long phys_pfn,
1672                             unsigned long nr_pages, int prot)
1673 {
1674         struct dma_pte *first_pte = NULL, *pte = NULL;
1675         phys_addr_t uninitialized_var(pteval);
1676         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1677         unsigned long sg_res;
1678
1679         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1680
1681         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1682                 return -EINVAL;
1683
1684         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1685
1686         if (sg)
1687                 sg_res = 0;
1688         else {
1689                 sg_res = nr_pages + 1;
1690                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1691         }
1692
1693         while (nr_pages--) {
1694                 uint64_t tmp;
1695
1696                 if (!sg_res) {
1697                         sg_res = aligned_nrpages(sg->offset, sg->length);
1698                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1699                         sg->dma_length = sg->length;
1700                         pteval = page_to_phys(sg_page(sg)) | prot;
1701                 }
1702                 if (!pte) {
1703                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1704                         if (!pte)
1705                                 return -ENOMEM;
1706                 }
1707                 /* We don't need lock here, nobody else
1708                  * touches the iova range
1709                  */
1710                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1711                 if (tmp) {
1712                         static int dumps = 5;
1713                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1714                                iov_pfn, tmp, (unsigned long long)pteval);
1715                         if (dumps) {
1716                                 dumps--;
1717                                 debug_dma_dump_mappings(NULL);
1718                         }
1719                         WARN_ON(1);
1720                 }
1721                 pte++;
1722                 if (!nr_pages || first_pte_in_page(pte)) {
1723                         domain_flush_cache(domain, first_pte,
1724                                            (void *)pte - (void *)first_pte);
1725                         pte = NULL;
1726                 }
1727                 iov_pfn++;
1728                 pteval += VTD_PAGE_SIZE;
1729                 sg_res--;
1730                 if (!sg_res)
1731                         sg = sg_next(sg);
1732         }
1733         return 0;
1734 }
1735
/*
 * Map @nr_pages pages described by scatterlist @sg at IOVA page frame
 * @iov_pfn.  Thin wrapper around __domain_mapping() with no explicit
 * physical page frame; returns its result.
 */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	const unsigned long no_phys_pfn = 0;

	return __domain_mapping(domain, iov_pfn, sg, no_phys_pfn, nr_pages, prot);
}
1742
1743 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1744                                      unsigned long phys_pfn, unsigned long nr_pages,
1745                                      int prot)
1746 {
1747         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1748 }
1749
1750 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1751 {
1752         if (!iommu)
1753                 return;
1754
1755         clear_context_table(iommu, bus, devfn);
1756         iommu->flush.flush_context(iommu, 0, 0, 0,
1757                                            DMA_CCMD_GLOBAL_INVL);
1758         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1759 }
1760
1761 static void domain_remove_dev_info(struct dmar_domain *domain)
1762 {
1763         struct device_domain_info *info;
1764         unsigned long flags;
1765         struct intel_iommu *iommu;
1766
1767         spin_lock_irqsave(&device_domain_lock, flags);
1768         while (!list_empty(&domain->devices)) {
1769                 info = list_entry(domain->devices.next,
1770                         struct device_domain_info, link);
1771                 list_del(&info->link);
1772                 list_del(&info->global);
1773                 if (info->dev)
1774                         info->dev->dev.archdata.iommu = NULL;
1775                 spin_unlock_irqrestore(&device_domain_lock, flags);
1776
1777                 iommu_disable_dev_iotlb(info);
1778                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1779                 iommu_detach_dev(iommu, info->bus, info->devfn);
1780                 free_devinfo_mem(info);
1781
1782                 spin_lock_irqsave(&device_domain_lock, flags);
1783         }
1784         spin_unlock_irqrestore(&device_domain_lock, flags);
1785 }
1786
1787 /*
1788  * find_domain
1789  * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1790  */
1791 static struct dmar_domain *
1792 find_domain(struct pci_dev *pdev)
1793 {
1794         struct device_domain_info *info;
1795
1796         /* No lock here, assumes no domain exit in normal case */
1797         info = pdev->dev.archdata.iommu;
1798         if (info)
1799                 return info->domain;
1800         return NULL;
1801 }
1802
1803 /* domain is initialized */
1804 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1805 {
1806         struct dmar_domain *domain, *found = NULL;
1807         struct intel_iommu *iommu;
1808         struct dmar_drhd_unit *drhd;
1809         struct device_domain_info *info, *tmp;
1810         struct pci_dev *dev_tmp;
1811         unsigned long flags;
1812         int bus = 0, devfn = 0;
1813         int segment;
1814         int ret;
1815
1816         domain = find_domain(pdev);
1817         if (domain)
1818                 return domain;
1819
1820         segment = pci_domain_nr(pdev->bus);
1821
1822         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1823         if (dev_tmp) {
1824                 if (pci_is_pcie(dev_tmp)) {
1825                         bus = dev_tmp->subordinate->number;
1826                         devfn = 0;
1827                 } else {
1828                         bus = dev_tmp->bus->number;
1829                         devfn = dev_tmp->devfn;
1830                 }
1831                 spin_lock_irqsave(&device_domain_lock, flags);
1832                 list_for_each_entry(info, &device_domain_list, global) {
1833                         if (info->segment == segment &&
1834                             info->bus == bus && info->devfn == devfn) {
1835                                 found = info->domain;
1836                                 break;
1837                         }
1838                 }
1839                 spin_unlock_irqrestore(&device_domain_lock, flags);
1840                 /* pcie-pci bridge already has a domain, uses it */
1841                 if (found) {
1842                         domain = found;
1843                         goto found_domain;
1844                 }
1845         }
1846
1847         domain = alloc_domain();
1848         if (!domain)
1849                 goto error;
1850
1851         /* Allocate new domain for the device */
1852         drhd = dmar_find_matched_drhd_unit(pdev);
1853         if (!drhd) {
1854                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1855                         pci_name(pdev));
1856                 return NULL;
1857         }
1858         iommu = drhd->iommu;
1859
1860         ret = iommu_attach_domain(domain, iommu);
1861         if (ret) {
1862                 domain_exit(domain);
1863                 goto error;
1864         }
1865
1866         if (domain_init(domain, gaw)) {
1867                 domain_exit(domain);
1868                 goto error;
1869         }
1870
1871         /* register pcie-to-pci device */
1872         if (dev_tmp) {
1873                 info = alloc_devinfo_mem();
1874                 if (!info) {
1875                         domain_exit(domain);
1876                         goto error;
1877                 }
1878                 info->segment = segment;
1879                 info->bus = bus;
1880                 info->devfn = devfn;
1881                 info->dev = NULL;
1882                 info->domain = domain;
1883                 /* This domain is shared by devices under p2p bridge */
1884                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1885
1886                 /* pcie-to-pci bridge already has a domain, uses it */
1887                 found = NULL;
1888                 spin_lock_irqsave(&device_domain_lock, flags);
1889                 list_for_each_entry(tmp, &device_domain_list, global) {
1890                         if (tmp->segment == segment &&
1891                             tmp->bus == bus && tmp->devfn == devfn) {
1892                                 found = tmp->domain;
1893                                 break;
1894                         }
1895                 }
1896                 if (found) {
1897                         free_devinfo_mem(info);
1898                         domain_exit(domain);
1899                         domain = found;
1900                 } else {
1901                         list_add(&info->link, &domain->devices);
1902                         list_add(&info->global, &device_domain_list);
1903                 }
1904                 spin_unlock_irqrestore(&device_domain_lock, flags);
1905         }
1906
1907 found_domain:
1908         info = alloc_devinfo_mem();
1909         if (!info)
1910                 goto error;
1911         info->segment = segment;
1912         info->bus = pdev->bus->number;
1913         info->devfn = pdev->devfn;
1914         info->dev = pdev;
1915         info->domain = domain;
1916         spin_lock_irqsave(&device_domain_lock, flags);
1917         /* somebody is fast */
1918         found = find_domain(pdev);
1919         if (found != NULL) {
1920                 spin_unlock_irqrestore(&device_domain_lock, flags);
1921                 if (found != domain) {
1922                         domain_exit(domain);
1923                         domain = found;
1924                 }
1925                 free_devinfo_mem(info);
1926                 return domain;
1927         }
1928         list_add(&info->link, &domain->devices);
1929         list_add(&info->global, &device_domain_list);
1930         pdev->dev.archdata.iommu = info;
1931         spin_unlock_irqrestore(&device_domain_lock, flags);
1932         return domain;
1933 error:
1934         /* recheck it here, maybe others set it */
1935         return find_domain(pdev);
1936 }
1937
/*
 * Bitmask of IDENTMAP_* flags selecting which devices are placed in the
 * static identity (1:1) domain; zero means identity mapping is not in use.
 */
static int iommu_identity_mapping;
/* Identity-map every eligible device (set when iommu_pass_through is on). */
#define IDENTMAP_ALL            1
/* Identity-map graphics devices (see IS_GFX_DEVICE). */
#define IDENTMAP_GFX            2
/* Identity-map the Azalia device (see IS_AZALIA). */
#define IDENTMAP_AZALIA         4
1942
1943 static int iommu_domain_identity_map(struct dmar_domain *domain,
1944                                      unsigned long long start,
1945                                      unsigned long long end)
1946 {
1947         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1948         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1949
1950         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1951                           dma_to_mm_pfn(last_vpfn))) {
1952                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1953                 return -ENOMEM;
1954         }
1955
1956         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1957                  start, end, domain->id);
1958         /*
1959          * RMRR range might have overlap with physical memory range,
1960          * clear it first
1961          */
1962         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1963
1964         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1965                                   last_vpfn - first_vpfn + 1,
1966                                   DMA_PTE_READ|DMA_PTE_WRITE);
1967 }
1968
1969 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1970                                       unsigned long long start,
1971                                       unsigned long long end)
1972 {
1973         struct dmar_domain *domain;
1974         int ret;
1975
1976         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1977         if (!domain)
1978                 return -ENOMEM;
1979
1980         /* For _hardware_ passthrough, don't bother. But for software
1981            passthrough, we do it anyway -- it may indicate a memory
1982            range which is reserved in E820, so which didn't get set
1983            up to start with in si_domain */
1984         if (domain == si_domain && hw_pass_through) {
1985                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1986                        pci_name(pdev), start, end);
1987                 return 0;
1988         }
1989
1990         printk(KERN_INFO
1991                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1992                pci_name(pdev), start, end);
1993         
1994         if (end >> agaw_to_width(domain->agaw)) {
1995                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1996                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1997                      agaw_to_width(domain->agaw),
1998                      dmi_get_system_info(DMI_BIOS_VENDOR),
1999                      dmi_get_system_info(DMI_BIOS_VERSION),
2000                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2001                 ret = -EIO;
2002                 goto error;
2003         }
2004
2005         ret = iommu_domain_identity_map(domain, start, end);
2006         if (ret)
2007                 goto error;
2008
2009         /* context entry init */
2010         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2011         if (ret)
2012                 goto error;
2013
2014         return 0;
2015
2016  error:
2017         domain_exit(domain);
2018         return ret;
2019 }
2020
2021 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2022         struct pci_dev *pdev)
2023 {
2024         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2025                 return 0;
2026         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2027                 rmrr->end_address + 1);
2028 }
2029
#ifdef CONFIG_DMAR_FLOPPY_WA
/*
 * Floppy workaround: give the ISA/LPC bridge an identity mapping of the
 * first 16MiB.  Does nothing when no ISA bridge is present.  Failure is
 * only logged.
 */
static inline void iommu_prepare_isa(void)
{
        struct pci_dev *pdev;
        int ret;

        pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
        if (!pdev)
                return;

        printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
        /*
         * 'end' is the last byte to be mapped, hence the - 1; without it
         * the page starting at 16MiB would be mapped as well.
         */
        ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);

        if (ret)
                printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
                       "floppy might not work\n");

        /* Drop the reference taken by pci_get_class(). */
        pci_dev_put(pdev);
}
#else
static inline void iommu_prepare_isa(void)
{
        return;
}
#endif /* !CONFIG_DMAR_FLOPPY_WA */
2054
2055 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2056
2057 static int __init si_domain_work_fn(unsigned long start_pfn,
2058                                     unsigned long end_pfn, void *datax)
2059 {
2060         int *ret = datax;
2061
2062         *ret = iommu_domain_identity_map(si_domain,
2063                                          (uint64_t)start_pfn << PAGE_SHIFT,
2064                                          (uint64_t)end_pfn << PAGE_SHIFT);
2065         return *ret;
2066
2067 }
2068
2069 static int __init si_domain_init(int hw)
2070 {
2071         struct dmar_drhd_unit *drhd;
2072         struct intel_iommu *iommu;
2073         int nid, ret = 0;
2074
2075         si_domain = alloc_domain();
2076         if (!si_domain)
2077                 return -EFAULT;
2078
2079         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2080
2081         for_each_active_iommu(iommu, drhd) {
2082                 ret = iommu_attach_domain(si_domain, iommu);
2083                 if (ret) {
2084                         domain_exit(si_domain);
2085                         return -EFAULT;
2086                 }
2087         }
2088
2089         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2090                 domain_exit(si_domain);
2091                 return -EFAULT;
2092         }
2093
2094         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2095
2096         if (hw)
2097                 return 0;
2098
2099         for_each_online_node(nid) {
2100                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2101                 if (ret)
2102                         return ret;
2103         }
2104
2105         return 0;
2106 }
2107
2108 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2109                                           struct pci_dev *pdev);
2110 static int identity_mapping(struct pci_dev *pdev)
2111 {
2112         struct device_domain_info *info;
2113
2114         if (likely(!iommu_identity_mapping))
2115                 return 0;
2116
2117
2118         list_for_each_entry(info, &si_domain->devices, link)
2119                 if (info->dev == pdev)
2120                         return 1;
2121         return 0;
2122 }
2123
2124 static int domain_add_dev_info(struct dmar_domain *domain,
2125                                struct pci_dev *pdev,
2126                                int translation)
2127 {
2128         struct device_domain_info *info;
2129         unsigned long flags;
2130         int ret;
2131
2132         info = alloc_devinfo_mem();
2133         if (!info)
2134                 return -ENOMEM;
2135
2136         ret = domain_context_mapping(domain, pdev, translation);
2137         if (ret) {
2138                 free_devinfo_mem(info);
2139                 return ret;
2140         }
2141
2142         info->segment = pci_domain_nr(pdev->bus);
2143         info->bus = pdev->bus->number;
2144         info->devfn = pdev->devfn;
2145         info->dev = pdev;
2146         info->domain = domain;
2147
2148         spin_lock_irqsave(&device_domain_lock, flags);
2149         list_add(&info->link, &domain->devices);
2150         list_add(&info->global, &device_domain_list);
2151         pdev->dev.archdata.iommu = info;
2152         spin_unlock_irqrestore(&device_domain_lock, flags);
2153
2154         return 0;
2155 }
2156
2157 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2158 {
2159         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2160                 return 1;
2161
2162         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2163                 return 1;
2164
2165         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2166                 return 0;
2167
2168         /*
2169          * We want to start off with all devices in the 1:1 domain, and
2170          * take them out later if we find they can't access all of memory.
2171          *
2172          * However, we can't do this for PCI devices behind bridges,
2173          * because all PCI devices behind the same bridge will end up
2174          * with the same source-id on their transactions.
2175          *
2176          * Practically speaking, we can't change things around for these
2177          * devices at run-time, because we can't be sure there'll be no
2178          * DMA transactions in flight for any of their siblings.
2179          * 
2180          * So PCI devices (unless they're on the root bus) as well as
2181          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2182          * the 1:1 domain, just in _case_ one of their siblings turns out
2183          * not to be able to map all of memory.
2184          */
2185         if (!pci_is_pcie(pdev)) {
2186                 if (!pci_is_root_bus(pdev->bus))
2187                         return 0;
2188                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2189                         return 0;
2190         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2191                 return 0;
2192
2193         /* 
2194          * At boot time, we don't yet know if devices will be 64-bit capable.
2195          * Assume that they will -- if they turn out not to be, then we can 
2196          * take them out of the 1:1 domain later.
2197          */
2198         if (!startup)
2199                 return pdev->dma_mask > DMA_BIT_MASK(32);
2200
2201         return 1;
2202 }
2203
2204 static int __init iommu_prepare_static_identity_mapping(int hw)
2205 {
2206         struct pci_dev *pdev = NULL;
2207         int ret;
2208
2209         ret = si_domain_init(hw);
2210         if (ret)
2211                 return -EFAULT;
2212
2213         for_each_pci_dev(pdev) {
2214                 if (iommu_should_identity_map(pdev, 1)) {
2215                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2216                                hw ? "hardware" : "software", pci_name(pdev));
2217
2218                         ret = domain_add_dev_info(si_domain, pdev,
2219                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2220                                                      CONTEXT_TT_MULTI_LEVEL);
2221                         if (ret)
2222                                 return ret;
2223                 }
2224         }
2225
2226         return 0;
2227 }
2228
2229 int __init init_dmars(void)
2230 {
2231         struct dmar_drhd_unit *drhd;
2232         struct dmar_rmrr_unit *rmrr;
2233         struct pci_dev *pdev;
2234         struct intel_iommu *iommu;
2235         int i, ret;
2236
2237         /*
2238          * for each drhd
2239          *    allocate root
2240          *    initialize and program root entry to not present
2241          * endfor
2242          */
2243         for_each_drhd_unit(drhd) {
2244                 g_num_of_iommus++;
2245                 /*
2246                  * lock not needed as this is only incremented in the single
2247                  * threaded kernel __init code path all other access are read
2248                  * only
2249                  */
2250         }
2251
2252         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2253                         GFP_KERNEL);
2254         if (!g_iommus) {
2255                 printk(KERN_ERR "Allocating global iommu array failed\n");
2256                 ret = -ENOMEM;
2257                 goto error;
2258         }
2259
2260         deferred_flush = kzalloc(g_num_of_iommus *
2261                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2262         if (!deferred_flush) {
2263                 ret = -ENOMEM;
2264                 goto error;
2265         }
2266
2267         for_each_drhd_unit(drhd) {
2268                 if (drhd->ignored)
2269                         continue;
2270
2271                 iommu = drhd->iommu;
2272                 g_iommus[iommu->seq_id] = iommu;
2273
2274                 ret = iommu_init_domains(iommu);
2275                 if (ret)
2276                         goto error;
2277
2278                 /*
2279                  * TBD:
2280                  * we could share the same root & context tables
2281                  * amoung all IOMMU's. Need to Split it later.
2282                  */
2283                 ret = iommu_alloc_root_entry(iommu);
2284                 if (ret) {
2285                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2286                         goto error;
2287                 }
2288                 if (!ecap_pass_through(iommu->ecap))
2289                         hw_pass_through = 0;
2290         }
2291
2292         /*
2293          * Start from the sane iommu hardware state.
2294          */
2295         for_each_drhd_unit(drhd) {
2296                 if (drhd->ignored)
2297                         continue;
2298
2299                 iommu = drhd->iommu;
2300
2301                 /*
2302                  * If the queued invalidation is already initialized by us
2303                  * (for example, while enabling interrupt-remapping) then
2304                  * we got the things already rolling from a sane state.
2305                  */
2306                 if (iommu->qi)
2307                         continue;
2308
2309                 /*
2310                  * Clear any previous faults.
2311                  */
2312                 dmar_fault(-1, iommu);
2313                 /*
2314                  * Disable queued invalidation if supported and already enabled
2315                  * before OS handover.
2316                  */
2317                 dmar_disable_qi(iommu);
2318         }
2319
2320         for_each_drhd_unit(drhd) {
2321                 if (drhd->ignored)
2322                         continue;
2323
2324                 iommu = drhd->iommu;
2325
2326                 if (dmar_enable_qi(iommu)) {
2327                         /*
2328                          * Queued Invalidate not enabled, use Register Based
2329                          * Invalidate
2330                          */
2331                         iommu->flush.flush_context = __iommu_flush_context;
2332                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2333                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2334                                "invalidation\n",
2335                                (unsigned long long)drhd->reg_base_addr);
2336                 } else {
2337                         iommu->flush.flush_context = qi_flush_context;
2338                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2339                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2340                                "invalidation\n",
2341                                (unsigned long long)drhd->reg_base_addr);
2342                 }
2343         }
2344
2345         if (iommu_pass_through)
2346                 iommu_identity_mapping |= IDENTMAP_ALL;
2347
2348 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2349         iommu_identity_mapping |= IDENTMAP_GFX;
2350 #endif
2351
2352         check_tylersburg_isoch();
2353
2354         /*
2355          * If pass through is not set or not enabled, setup context entries for
2356          * identity mappings for rmrr, gfx, and isa and may fall back to static
2357          * identity mapping if iommu_identity_mapping is set.
2358          */
2359         if (iommu_identity_mapping) {
2360                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2361                 if (ret) {
2362                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2363                         goto error;
2364                 }
2365         }
2366         /*
2367          * For each rmrr
2368          *   for each dev attached to rmrr
2369          *   do
2370          *     locate drhd for dev, alloc domain for dev
2371          *     allocate free domain
2372          *     allocate page table entries for rmrr
2373          *     if context not allocated for bus
2374          *           allocate and init context
2375          *           set present in root table for this bus
2376          *     init context with domain, translation etc
2377          *    endfor
2378          * endfor
2379          */
2380         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2381         for_each_rmrr_units(rmrr) {
2382                 for (i = 0; i < rmrr->devices_cnt; i++) {
2383                         pdev = rmrr->devices[i];
2384                         /*
2385                          * some BIOS lists non-exist devices in DMAR
2386                          * table.
2387                          */
2388                         if (!pdev)
2389                                 continue;
2390                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2391                         if (ret)
2392                                 printk(KERN_ERR
2393                                        "IOMMU: mapping reserved region failed\n");
2394                 }
2395         }
2396
2397         iommu_prepare_isa();
2398
2399         /*
2400          * for each drhd
2401          *   enable fault log
2402          *   global invalidate context cache
2403          *   global invalidate iotlb
2404          *   enable translation
2405          */
2406         for_each_drhd_unit(drhd) {
2407                 if (drhd->ignored)
2408                         continue;
2409                 iommu = drhd->iommu;
2410
2411                 iommu_flush_write_buffer(iommu);
2412
2413                 ret = dmar_set_interrupt(iommu);
2414                 if (ret)
2415                         goto error;
2416
2417                 iommu_set_root_entry(iommu);
2418
2419                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2420                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2421
2422                 ret = iommu_enable_translation(iommu);
2423                 if (ret)
2424                         goto error;
2425
2426                 iommu_disable_protect_mem_regions(iommu);
2427         }
2428
2429         return 0;
2430 error:
2431         for_each_drhd_unit(drhd) {
2432                 if (drhd->ignored)
2433                         continue;
2434                 iommu = drhd->iommu;
2435                 free_iommu(iommu);
2436         }
2437         kfree(g_iommus);
2438         return ret;
2439 }
2440
2441 /* This takes a number of _MM_ pages, not VTD pages */
2442 static struct iova *intel_alloc_iova(struct device *dev,
2443                                      struct dmar_domain *domain,
2444                                      unsigned long nrpages, uint64_t dma_mask)
2445 {
2446         struct pci_dev *pdev = to_pci_dev(dev);
2447         struct iova *iova = NULL;
2448
2449         /* Restrict dma_mask to the width that the iommu can handle */
2450         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2451
2452         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2453                 /*
2454                  * First try to allocate an io virtual address in
2455                  * DMA_BIT_MASK(32) and if that fails then try allocating
2456                  * from higher range
2457                  */
2458                 iova = alloc_iova(&domain->iovad, nrpages,
2459                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2460                 if (iova)
2461                         return iova;
2462         }
2463         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2464         if (unlikely(!iova)) {
2465                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2466                        nrpages, pci_name(pdev));
2467                 return NULL;
2468         }
2469
2470         return iova;
2471 }
2472
2473 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2474 {
2475         struct dmar_domain *domain;
2476         int ret;
2477
2478         domain = get_domain_for_dev(pdev,
2479                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2480         if (!domain) {
2481                 printk(KERN_ERR
2482                         "Allocating domain for %s failed", pci_name(pdev));
2483                 return NULL;
2484         }
2485
2486         /* make sure context mapping is ok */
2487         if (unlikely(!domain_context_mapped(pdev))) {
2488                 ret = domain_context_mapping(domain, pdev,
2489                                              CONTEXT_TT_MULTI_LEVEL);
2490                 if (ret) {
2491                         printk(KERN_ERR
2492                                 "Domain context map for %s failed",
2493                                 pci_name(pdev));
2494                         return NULL;
2495                 }
2496         }
2497
2498         return domain;
2499 }
2500
2501 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2502 {
2503         struct device_domain_info *info;
2504
2505         /* No lock here, assumes no domain exit in normal case */
2506         info = dev->dev.archdata.iommu;
2507         if (likely(info))
2508                 return info->domain;
2509
2510         return __get_valid_domain_for_dev(dev);
2511 }
2512
2513 static int iommu_dummy(struct pci_dev *pdev)
2514 {
2515         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2516 }
2517
2518 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2519 static int iommu_no_mapping(struct device *dev)
2520 {
2521         struct pci_dev *pdev;
2522         int found;
2523
2524         if (unlikely(dev->bus != &pci_bus_type))
2525                 return 1;
2526
2527         pdev = to_pci_dev(dev);
2528         if (iommu_dummy(pdev))
2529                 return 1;
2530
2531         if (!iommu_identity_mapping)
2532                 return 0;
2533
2534         found = identity_mapping(pdev);
2535         if (found) {
2536                 if (iommu_should_identity_map(pdev, 0))
2537                         return 1;
2538                 else {
2539                         /*
2540                          * 32 bit DMA is removed from si_domain and fall back
2541                          * to non-identity mapping.
2542                          */
2543                         domain_remove_one_dev_info(si_domain, pdev);
2544                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2545                                pci_name(pdev));
2546                         return 0;
2547                 }
2548         } else {
2549                 /*
2550                  * In case of a detached 64 bit DMA device from vm, the device
2551                  * is put into si_domain for identity mapping.
2552                  */
2553                 if (iommu_should_identity_map(pdev, 0)) {
2554                         int ret;
2555                         ret = domain_add_dev_info(si_domain, pdev,
2556                                                   hw_pass_through ?
2557                                                   CONTEXT_TT_PASS_THROUGH :
2558                                                   CONTEXT_TT_MULTI_LEVEL);
2559                         if (!ret) {
2560                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2561                                        pci_name(pdev));
2562                                 return 1;
2563                         }
2564                 }
2565         }
2566
2567         return 0;
2568 }
2569
2570 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2571                                      size_t size, int dir, u64 dma_mask)
2572 {
2573         struct pci_dev *pdev = to_pci_dev(hwdev);
2574         struct dmar_domain *domain;
2575         phys_addr_t start_paddr;
2576         struct iova *iova;
2577         int prot = 0;
2578         int ret;
2579         struct intel_iommu *iommu;
2580         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2581
2582         BUG_ON(dir == DMA_NONE);
2583
2584         if (iommu_no_mapping(hwdev))
2585                 return paddr;
2586
2587         domain = get_valid_domain_for_dev(pdev);
2588         if (!domain)
2589                 return 0;
2590
2591         iommu = domain_get_iommu(domain);
2592         size = aligned_nrpages(paddr, size);
2593
2594         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2595                                 pdev->dma_mask);
2596         if (!iova)
2597                 goto error;
2598
2599         /*
2600          * Check if DMAR supports zero-length reads on write only
2601          * mappings..
2602          */
2603         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2604                         !cap_zlr(iommu->cap))
2605                 prot |= DMA_PTE_READ;
2606         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2607                 prot |= DMA_PTE_WRITE;
2608         /*
2609          * paddr - (paddr + size) might be partial page, we should map the whole
2610          * page.  Note: if two part of one page are separately mapped, we
2611          * might have two guest_addr mapping to the same host paddr, but this
2612          * is not a big problem
2613          */
2614         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2615                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2616         if (ret)
2617                 goto error;
2618
2619         /* it's a non-present to present mapping. Only flush if caching mode */
2620         if (cap_caching_mode(iommu->cap))
2621                 iommu_flush_iotlb_psi(iommu, 0, mm_to_dma_pfn(iova->pfn_lo), size);
2622         else
2623                 iommu_flush_write_buffer(iommu);
2624
2625         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2626         start_paddr += paddr & ~PAGE_MASK;
2627         return start_paddr;
2628
2629 error:
2630         if (iova)
2631                 __free_iova(&domain->iovad, iova);
2632         printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2633                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2634         return 0;
2635 }
2636
2637 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2638                                  unsigned long offset, size_t size,
2639                                  enum dma_data_direction dir,
2640                                  struct dma_attrs *attrs)
2641 {
2642         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2643                                   dir, to_pci_dev(dev)->dma_mask);
2644 }
2645
2646 static void flush_unmaps(void)
2647 {
2648         int i, j;
2649
2650         timer_on = 0;
2651
2652         /* just flush them all */
2653         for (i = 0; i < g_num_of_iommus; i++) {
2654                 struct intel_iommu *iommu = g_iommus[i];
2655                 if (!iommu)
2656                         continue;
2657
2658                 if (!deferred_flush[i].next)
2659                         continue;
2660
2661                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2662                                          DMA_TLB_GLOBAL_FLUSH);
2663                 for (j = 0; j < deferred_flush[i].next; j++) {
2664                         unsigned long mask;
2665                         struct iova *iova = deferred_flush[i].iova[j];
2666
2667                         mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2668                         iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2669                                         (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2670                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2671                 }
2672                 deferred_flush[i].next = 0;
2673         }
2674
2675         list_size = 0;
2676 }
2677
2678 static void flush_unmaps_timeout(unsigned long data)
2679 {
2680         unsigned long flags;
2681
2682         spin_lock_irqsave(&async_umap_flush_lock, flags);
2683         flush_unmaps();
2684         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2685 }
2686
2687 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2688 {
2689         unsigned long flags;
2690         int next, iommu_id;
2691         struct intel_iommu *iommu;
2692
2693         spin_lock_irqsave(&async_umap_flush_lock, flags);
2694         if (list_size == HIGH_WATER_MARK)
2695                 flush_unmaps();
2696
2697         iommu = domain_get_iommu(dom);
2698         iommu_id = iommu->seq_id;
2699
2700         next = deferred_flush[iommu_id].next;
2701         deferred_flush[iommu_id].domain[next] = dom;
2702         deferred_flush[iommu_id].iova[next] = iova;
2703         deferred_flush[iommu_id].next++;
2704
2705         if (!timer_on) {
2706                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2707                 timer_on = 1;
2708         }
2709         list_size++;
2710         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2711 }
2712
2713 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2714                              size_t size, enum dma_data_direction dir,
2715                              struct dma_attrs *attrs)
2716 {
2717         struct pci_dev *pdev = to_pci_dev(dev);
2718         struct dmar_domain *domain;
2719         unsigned long start_pfn, last_pfn;
2720         struct iova *iova;
2721         struct intel_iommu *iommu;
2722
2723         if (iommu_no_mapping(dev))
2724                 return;
2725
2726         domain = find_domain(pdev);
2727         BUG_ON(!domain);
2728
2729         iommu = domain_get_iommu(domain);
2730
2731         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2732         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2733                       (unsigned long long)dev_addr))
2734                 return;
2735
2736         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2737         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2738
2739         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2740                  pci_name(pdev), start_pfn, last_pfn);
2741
2742         /*  clear the whole page */
2743         dma_pte_clear_range(domain, start_pfn, last_pfn);
2744
2745         /* free page tables */
2746         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2747
2748         if (intel_iommu_strict) {
2749                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2750                                       last_pfn - start_pfn + 1);
2751                 /* free iova */
2752                 __free_iova(&domain->iovad, iova);
2753         } else {
2754                 add_unmap(domain, iova);
2755                 /*
2756                  * queue up the release of the unmap to save the 1/6th of the
2757                  * cpu used up by the iotlb flush operation...
2758                  */
2759         }
2760 }
2761
2762 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2763                                   dma_addr_t *dma_handle, gfp_t flags)
2764 {
2765         void *vaddr;
2766         int order;
2767
2768         size = PAGE_ALIGN(size);
2769         order = get_order(size);
2770
2771         if (!iommu_no_mapping(hwdev))
2772                 flags &= ~(GFP_DMA | GFP_DMA32);
2773         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2774                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2775                         flags |= GFP_DMA;
2776                 else
2777                         flags |= GFP_DMA32;
2778         }
2779
2780         vaddr = (void *)__get_free_pages(flags, order);
2781         if (!vaddr)
2782                 return NULL;
2783         memset(vaddr, 0, size);
2784
2785         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2786                                          DMA_BIDIRECTIONAL,
2787                                          hwdev->coherent_dma_mask);
2788         if (*dma_handle)
2789                 return vaddr;
2790         free_pages((unsigned long)vaddr, order);
2791         return NULL;
2792 }
2793
2794 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2795                                 dma_addr_t dma_handle)
2796 {
2797         int order;
2798
2799         size = PAGE_ALIGN(size);
2800         order = get_order(size);
2801
2802         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2803         free_pages((unsigned long)vaddr, order);
2804 }
2805
2806 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2807                            int nelems, enum dma_data_direction dir,
2808                            struct dma_attrs *attrs)
2809 {
2810         struct pci_dev *pdev = to_pci_dev(hwdev);
2811         struct dmar_domain *domain;
2812         unsigned long start_pfn, last_pfn;
2813         struct iova *iova;
2814         struct intel_iommu *iommu;
2815
2816         if (iommu_no_mapping(hwdev))
2817                 return;
2818
2819         domain = find_domain(pdev);
2820         BUG_ON(!domain);
2821
2822         iommu = domain_get_iommu(domain);
2823
2824         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2825         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2826                       (unsigned long long)sglist[0].dma_address))
2827                 return;
2828
2829         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2830         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2831
2832         /*  clear the whole page */
2833         dma_pte_clear_range(domain, start_pfn, last_pfn);
2834
2835         /* free page tables */
2836         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2837
2838         if (intel_iommu_strict) {
2839                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2840                                       last_pfn - start_pfn + 1);
2841                 /* free iova */
2842                 __free_iova(&domain->iovad, iova);
2843         } else {
2844                 add_unmap(domain, iova);
2845                 /*
2846                  * queue up the release of the unmap to save the 1/6th of the
2847                  * cpu used up by the iotlb flush operation...
2848                  */
2849         }
2850 }
2851
2852 static int intel_nontranslate_map_sg(struct device *hddev,
2853         struct scatterlist *sglist, int nelems, int dir)
2854 {
2855         int i;
2856         struct scatterlist *sg;
2857
2858         for_each_sg(sglist, sg, nelems, i) {
2859                 BUG_ON(!sg_page(sg));
2860                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2861                 sg->dma_length = sg->length;
2862         }
2863         return nelems;
2864 }
2865
2866 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2867                         enum dma_data_direction dir, struct dma_attrs *attrs)
2868 {
2869         int i;
2870         struct pci_dev *pdev = to_pci_dev(hwdev);
2871         struct dmar_domain *domain;
2872         size_t size = 0;
2873         int prot = 0;
2874         size_t offset_pfn = 0;
2875         struct iova *iova = NULL;
2876         int ret;
2877         struct scatterlist *sg;
2878         unsigned long start_vpfn;
2879         struct intel_iommu *iommu;
2880
2881         BUG_ON(dir == DMA_NONE);
2882         if (iommu_no_mapping(hwdev))
2883                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2884
2885         domain = get_valid_domain_for_dev(pdev);
2886         if (!domain)
2887                 return 0;
2888
2889         iommu = domain_get_iommu(domain);
2890
2891         for_each_sg(sglist, sg, nelems, i)
2892                 size += aligned_nrpages(sg->offset, sg->length);
2893
2894         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2895                                 pdev->dma_mask);
2896         if (!iova) {
2897                 sglist->dma_length = 0;
2898                 return 0;
2899         }
2900
2901         /*
2902          * Check if DMAR supports zero-length reads on write only
2903          * mappings..
2904          */
2905         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2906                         !cap_zlr(iommu->cap))
2907                 prot |= DMA_PTE_READ;
2908         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2909                 prot |= DMA_PTE_WRITE;
2910
2911         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2912
2913         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2914         if (unlikely(ret)) {
2915                 /*  clear the page */
2916                 dma_pte_clear_range(domain, start_vpfn,
2917                                     start_vpfn + size - 1);
2918                 /* free page tables */
2919                 dma_pte_free_pagetable(domain, start_vpfn,
2920                                        start_vpfn + size - 1);
2921                 /* free iova */
2922                 __free_iova(&domain->iovad, iova);
2923                 return 0;
2924         }
2925
2926         /* it's a non-present to present mapping. Only flush if caching mode */
2927         if (cap_caching_mode(iommu->cap))
2928                 iommu_flush_iotlb_psi(iommu, 0, start_vpfn, offset_pfn);
2929         else
2930                 iommu_flush_write_buffer(iommu);
2931
2932         return nelems;
2933 }
2934
2935 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2936 {
2937         return !dma_addr;
2938 }
2939
2940 struct dma_map_ops intel_dma_ops = {
2941         .alloc_coherent = intel_alloc_coherent,
2942         .free_coherent = intel_free_coherent,
2943         .map_sg = intel_map_sg,
2944         .unmap_sg = intel_unmap_sg,
2945         .map_page = intel_map_page,
2946         .unmap_page = intel_unmap_page,
2947         .mapping_error = intel_mapping_error,
2948 };
2949
2950 static inline int iommu_domain_cache_init(void)
2951 {
2952         int ret = 0;
2953
2954         iommu_domain_cache = kmem_cache_create("iommu_domain",
2955                                          sizeof(struct dmar_domain),
2956                                          0,
2957                                          SLAB_HWCACHE_ALIGN,
2958
2959                                          NULL);
2960         if (!iommu_domain_cache) {
2961                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2962                 ret = -ENOMEM;
2963         }
2964
2965         return ret;
2966 }
2967
2968 static inline int iommu_devinfo_cache_init(void)
2969 {
2970         int ret = 0;
2971
2972         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2973                                          sizeof(struct device_domain_info),
2974                                          0,
2975                                          SLAB_HWCACHE_ALIGN,
2976                                          NULL);
2977         if (!iommu_devinfo_cache) {
2978                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2979                 ret = -ENOMEM;
2980         }
2981
2982         return ret;
2983 }
2984
2985 static inline int iommu_iova_cache_init(void)
2986 {
2987         int ret = 0;
2988
2989         iommu_iova_cache = kmem_cache_create("iommu_iova",
2990                                          sizeof(struct iova),
2991                                          0,
2992                                          SLAB_HWCACHE_ALIGN,
2993                                          NULL);
2994         if (!iommu_iova_cache) {
2995                 printk(KERN_ERR "Couldn't create iova cache\n");
2996                 ret = -ENOMEM;
2997         }
2998
2999         return ret;
3000 }
3001
3002 static int __init iommu_init_mempool(void)
3003 {
3004         int ret;
3005         ret = iommu_iova_cache_init();
3006         if (ret)
3007                 return ret;
3008
3009         ret = iommu_domain_cache_init();
3010         if (ret)
3011                 goto domain_error;
3012
3013         ret = iommu_devinfo_cache_init();
3014         if (!ret)
3015                 return ret;
3016
3017         kmem_cache_destroy(iommu_domain_cache);
3018 domain_error:
3019         kmem_cache_destroy(iommu_iova_cache);
3020
3021         return -ENOMEM;
3022 }
3023
3024 static void __init iommu_exit_mempool(void)
3025 {
3026         kmem_cache_destroy(iommu_devinfo_cache);
3027         kmem_cache_destroy(iommu_domain_cache);
3028         kmem_cache_destroy(iommu_iova_cache);
3029
3030 }
3031
3032 static void __init init_no_remapping_devices(void)
3033 {
3034         struct dmar_drhd_unit *drhd;
3035
3036         for_each_drhd_unit(drhd) {
3037                 if (!drhd->include_all) {
3038                         int i;
3039                         for (i = 0; i < drhd->devices_cnt; i++)
3040                                 if (drhd->devices[i] != NULL)
3041                                         break;
3042                         /* ignore DMAR unit if no pci devices exist */
3043                         if (i == drhd->devices_cnt)
3044                                 drhd->ignored = 1;
3045                 }
3046         }
3047
3048         if (dmar_map_gfx)
3049                 return;
3050
3051         for_each_drhd_unit(drhd) {
3052                 int i;
3053                 if (drhd->ignored || drhd->include_all)
3054                         continue;
3055
3056                 for (i = 0; i < drhd->devices_cnt; i++)
3057                         if (drhd->devices[i] &&
3058                                 !IS_GFX_DEVICE(drhd->devices[i]))
3059                                 break;
3060
3061                 if (i < drhd->devices_cnt)
3062                         continue;
3063
3064                 /* bypass IOMMU if it is just for gfx devices */
3065                 drhd->ignored = 1;
3066                 for (i = 0; i < drhd->devices_cnt; i++) {
3067                         if (!drhd->devices[i])
3068                                 continue;
3069                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3070                 }
3071         }
3072 }
3073
3074 #ifdef CONFIG_SUSPEND
3075 static int init_iommu_hw(void)
3076 {
3077         struct dmar_drhd_unit *drhd;
3078         struct intel_iommu *iommu = NULL;
3079
3080         for_each_active_iommu(iommu, drhd)
3081                 if (iommu->qi)
3082                         dmar_reenable_qi(iommu);
3083
3084         for_each_active_iommu(iommu, drhd) {
3085                 iommu_flush_write_buffer(iommu);
3086
3087                 iommu_set_root_entry(iommu);
3088
3089                 iommu->flush.flush_context(iommu, 0, 0, 0,
3090                                            DMA_CCMD_GLOBAL_INVL);
3091                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3092                                          DMA_TLB_GLOBAL_FLUSH);
3093                 iommu_enable_translation(iommu);
3094                 iommu_disable_protect_mem_regions(iommu);
3095         }
3096
3097         return 0;
3098 }
3099
3100 static void iommu_flush_all(void)
3101 {
3102         struct dmar_drhd_unit *drhd;
3103         struct intel_iommu *iommu;
3104
3105         for_each_active_iommu(iommu, drhd) {
3106                 iommu->flush.flush_context(iommu, 0, 0, 0,
3107                                            DMA_CCMD_GLOBAL_INVL);
3108                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3109                                          DMA_TLB_GLOBAL_FLUSH);
3110         }
3111 }
3112
3113 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
3114 {
3115         struct dmar_drhd_unit *drhd;
3116         struct intel_iommu *iommu = NULL;
3117         unsigned long flag;
3118
3119         for_each_active_iommu(iommu, drhd) {
3120                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3121                                                  GFP_ATOMIC);
3122                 if (!iommu->iommu_state)
3123                         goto nomem;
3124         }
3125
3126         iommu_flush_all();
3127
3128         for_each_active_iommu(iommu, drhd) {
3129                 iommu_disable_translation(iommu);
3130
3131                 spin_lock_irqsave(&iommu->register_lock, flag);
3132
3133                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3134                         readl(iommu->reg + DMAR_FECTL_REG);
3135                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3136                         readl(iommu->reg + DMAR_FEDATA_REG);
3137                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3138                         readl(iommu->reg + DMAR_FEADDR_REG);
3139                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3140                         readl(iommu->reg + DMAR_FEUADDR_REG);
3141
3142                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3143         }
3144         return 0;
3145
3146 nomem:
3147         for_each_active_iommu(iommu, drhd)
3148                 kfree(iommu->iommu_state);
3149
3150         return -ENOMEM;
3151 }
3152
3153 static int iommu_resume(struct sys_device *dev)
3154 {
3155         struct dmar_drhd_unit *drhd;
3156         struct intel_iommu *iommu = NULL;
3157         unsigned long flag;
3158
3159         if (init_iommu_hw()) {
3160                 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3161                 return -EIO;
3162         }
3163
3164         for_each_active_iommu(iommu, drhd) {
3165
3166                 spin_lock_irqsave(&iommu->register_lock, flag);
3167
3168                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3169                         iommu->reg + DMAR_FECTL_REG);
3170                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3171                         iommu->reg + DMAR_FEDATA_REG);
3172                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3173                         iommu->reg + DMAR_FEADDR_REG);
3174                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3175                         iommu->reg + DMAR_FEUADDR_REG);
3176
3177                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3178         }
3179
3180         for_each_active_iommu(iommu, drhd)
3181                 kfree(iommu->iommu_state);
3182
3183         return 0;
3184 }
3185
3186 static struct sysdev_class iommu_sysclass = {
3187         .name           = "iommu",
3188         .resume         = iommu_resume,
3189         .suspend        = iommu_suspend,
3190 };
3191
3192 static struct sys_device device_iommu = {
3193         .cls    = &iommu_sysclass,
3194 };
3195
3196 static int __init init_iommu_sysfs(void)
3197 {
3198         int error;
3199
3200         error = sysdev_class_register(&iommu_sysclass);
3201         if (error)
3202                 return error;
3203
3204         error = sysdev_register(&device_iommu);
3205         if (error)
3206                 sysdev_class_unregister(&iommu_sysclass);
3207
3208         return error;
3209 }
3210
3211 #else
3212 static int __init init_iommu_sysfs(void)
3213 {
3214         return 0;
3215 }
#endif  /* CONFIG_SUSPEND */
3217
3218 /*
3219  * Here we only respond to action of unbound device from driver.
3220  *
3221  * Added device is not attached to its DMAR domain here yet. That will happen
3222  * when mapping the device to iova.
3223  */
3224 static int device_notifier(struct notifier_block *nb,
3225                                   unsigned long action, void *data)
3226 {
3227         struct device *dev = data;
3228         struct pci_dev *pdev = to_pci_dev(dev);
3229         struct dmar_domain *domain;
3230
3231         domain = find_domain(pdev);
3232         if (!domain)
3233                 return 0;
3234
3235         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through)
3236                 domain_remove_one_dev_info(domain, pdev);
3237
3238         return 0;
3239 }
3240
3241 static struct notifier_block device_nb = {
3242         .notifier_call = device_notifier,
3243 };
3244
3245 int __init intel_iommu_init(void)
3246 {
3247         int ret = 0;
3248         int force_on = 0;
3249
3250         /* VT-d is required for a TXT/tboot launch, so enforce that */
3251         force_on = tboot_force_iommu();
3252
3253         if (dmar_table_init()) {
3254                 if (force_on)
3255                         panic("tboot: Failed to initialize DMAR table\n");
3256                 return  -ENODEV;
3257         }
3258
3259         if (dmar_dev_scope_init()) {
3260                 if (force_on)
3261                         panic("tboot: Failed to initialize DMAR device scope\n");
3262                 return  -ENODEV;
3263         }
3264
3265         /*
3266          * Check the need for DMA-remapping initialization now.
3267          * Above initialization will also be used by Interrupt-remapping.
3268          */
3269         if (no_iommu || dmar_disabled)
3270                 return -ENODEV;
3271
3272         iommu_init_mempool();
3273         dmar_init_reserved_ranges();
3274
3275         init_no_remapping_devices();
3276
3277         ret = init_dmars();
3278         if (ret) {
3279                 if (force_on)
3280                         panic("tboot: Failed to initialize DMARs\n");
3281                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3282                 put_iova_domain(&reserved_iova_list);
3283                 iommu_exit_mempool();
3284                 return ret;
3285         }
3286         printk(KERN_INFO
3287         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3288
3289         init_timer(&unmap_timer);
3290 #ifdef CONFIG_SWIOTLB
3291         swiotlb = 0;
3292 #endif
3293         dma_ops = &intel_dma_ops;
3294
3295         init_iommu_sysfs();
3296
3297         register_iommu(&intel_iommu_ops);
3298
3299         bus_register_notifier(&pci_bus_type, &device_nb);
3300
3301         return 0;
3302 }
3303
3304 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3305                                            struct pci_dev *pdev)
3306 {
3307         struct pci_dev *tmp, *parent;
3308
3309         if (!iommu || !pdev)
3310                 return;
3311
3312         /* dependent device detach */
3313         tmp = pci_find_upstream_pcie_bridge(pdev);
3314         /* Secondary interface's bus number and devfn 0 */
3315         if (tmp) {
3316                 parent = pdev->bus->self;
3317                 while (parent != tmp) {
3318                         iommu_detach_dev(iommu, parent->bus->number,
3319                                          parent->devfn);
3320                         parent = parent->bus->self;
3321                 }
3322                 if (pci_is_pcie(tmp)) /* this is a PCIE-to-PCI bridge */
3323                         iommu_detach_dev(iommu,
3324                                 tmp->subordinate->number, 0);
3325                 else /* this is a legacy PCI bridge */
3326                         iommu_detach_dev(iommu, tmp->bus->number,
3327                                          tmp->devfn);
3328         }
3329 }
3330
3331 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3332                                           struct pci_dev *pdev)
3333 {
3334         struct device_domain_info *info;
3335         struct intel_iommu *iommu;
3336         unsigned long flags;
3337         int found = 0;
3338         struct list_head *entry, *tmp;
3339
3340         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3341                                 pdev->devfn);
3342         if (!iommu)
3343                 return;
3344
3345         spin_lock_irqsave(&device_domain_lock, flags);
3346         list_for_each_safe(entry, tmp, &domain->devices) {
3347                 info = list_entry(entry, struct device_domain_info, link);
3348                 /* No need to compare PCI domain; it has to be the same */
3349                 if (info->bus == pdev->bus->number &&
3350                     info->devfn == pdev->devfn) {
3351                         list_del(&info->link);
3352                         list_del(&info->global);
3353                         if (info->dev)
3354                                 info->dev->dev.archdata.iommu = NULL;
3355                         spin_unlock_irqrestore(&device_domain_lock, flags);
3356
3357                         iommu_disable_dev_iotlb(info);
3358                         iommu_detach_dev(iommu, info->bus, info->devfn);
3359                         iommu_detach_dependent_devices(iommu, pdev);
3360                         free_devinfo_mem(info);
3361
3362                         spin_lock_irqsave(&device_domain_lock, flags);
3363
3364                         if (found)
3365                                 break;
3366                         else
3367                                 continue;
3368                 }
3369
3370                 /* if there is no other devices under the same iommu
3371                  * owned by this domain, clear this iommu in iommu_bmp
3372                  * update iommu count and coherency
3373                  */
3374                 if (iommu == device_to_iommu(info->segment, info->bus,
3375                                             info->devfn))
3376                         found = 1;
3377         }
3378
3379         if (found == 0) {
3380                 unsigned long tmp_flags;
3381                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3382                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3383                 domain->iommu_count--;
3384                 domain_update_iommu_cap(domain);
3385                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3386         }
3387
3388         spin_unlock_irqrestore(&device_domain_lock, flags);
3389 }
3390
3391 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3392 {
3393         struct device_domain_info *info;
3394         struct intel_iommu *iommu;
3395         unsigned long flags1, flags2;
3396
3397         spin_lock_irqsave(&device_domain_lock, flags1);
3398         while (!list_empty(&domain->devices)) {
3399                 info = list_entry(domain->devices.next,
3400                         struct device_domain_info, link);
3401                 list_del(&info->link);
3402                 list_del(&info->global);
3403                 if (info->dev)
3404                         info->dev->dev.archdata.iommu = NULL;
3405
3406                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3407
3408                 iommu_disable_dev_iotlb(info);
3409                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3410                 iommu_detach_dev(iommu, info->bus, info->devfn);
3411                 iommu_detach_dependent_devices(iommu, info->dev);
3412
3413                 /* clear this iommu in iommu_bmp, update iommu count
3414                  * and capabilities
3415                  */
3416                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3417                 if (test_and_clear_bit(iommu->seq_id,
3418                                        &domain->iommu_bmp)) {
3419                         domain->iommu_count--;
3420                         domain_update_iommu_cap(domain);
3421                 }
3422                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3423
3424                 free_devinfo_mem(info);
3425                 spin_lock_irqsave(&device_domain_lock, flags1);
3426         }
3427         spin_unlock_irqrestore(&device_domain_lock, flags1);
3428 }
3429
/*
 * Source of ids for virtual-machine domains: iommu_alloc_vm_domain()
 * stores the current value in the new domain and then increments it.
 * This id won't be set in a context entry.
 */
static unsigned long vm_domid;
3432
3433 static int vm_domain_min_agaw(struct dmar_domain *domain)
3434 {
3435         int i;
3436         int min_agaw = domain->agaw;
3437
3438         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
3439         for (; i < g_num_of_iommus; ) {
3440                 if (min_agaw > g_iommus[i]->agaw)
3441                         min_agaw = g_iommus[i]->agaw;
3442
3443                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
3444         }
3445
3446         return min_agaw;
3447 }
3448
3449 static struct dmar_domain *iommu_alloc_vm_domain(void)
3450 {
3451         struct dmar_domain *domain;
3452
3453         domain = alloc_domain_mem();
3454         if (!domain)
3455                 return NULL;
3456
3457         domain->id = vm_domid++;
3458         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3459         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3460
3461         return domain;
3462 }
3463
3464 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3465 {
3466         int adjust_width;
3467
3468         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3469         spin_lock_init(&domain->iommu_lock);
3470
3471         domain_reserve_special_ranges(domain);
3472
3473         /* calculate AGAW */
3474         domain->gaw = guest_width;
3475         adjust_width = guestwidth_to_adjustwidth(guest_width);
3476         domain->agaw = width_to_agaw(adjust_width);
3477
3478         INIT_LIST_HEAD(&domain->devices);
3479
3480         domain->iommu_count = 0;
3481         domain->iommu_coherency = 0;
3482         domain->iommu_snooping = 0;
3483         domain->max_addr = 0;
3484
3485         /* always allocate the top pgd */
3486         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
3487         if (!domain->pgd)
3488                 return -ENOMEM;
3489         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3490         return 0;
3491 }
3492
3493 static void iommu_free_vm_domain(struct dmar_domain *domain)
3494 {
3495         unsigned long flags;
3496         struct dmar_drhd_unit *drhd;
3497         struct intel_iommu *iommu;
3498         unsigned long i;
3499         unsigned long ndomains;
3500
3501         for_each_drhd_unit(drhd) {
3502                 if (drhd->ignored)
3503                         continue;
3504                 iommu = drhd->iommu;
3505
3506                 ndomains = cap_ndoms(iommu->cap);
3507                 i = find_first_bit(iommu->domain_ids, ndomains);
3508                 for (; i < ndomains; ) {
3509                         if (iommu->domains[i] == domain) {
3510                                 spin_lock_irqsave(&iommu->lock, flags);
3511                                 clear_bit(i, iommu->domain_ids);
3512                                 iommu->domains[i] = NULL;
3513                                 spin_unlock_irqrestore(&iommu->lock, flags);
3514                                 break;
3515                         }
3516                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3517                 }
3518         }
3519 }
3520
3521 static void vm_domain_exit(struct dmar_domain *domain)
3522 {
3523         /* Domain 0 is reserved, so dont process it */
3524         if (!domain)
3525                 return;
3526
3527         vm_domain_remove_all_dev_info(domain);
3528         /* destroy iovas */
3529         put_iova_domain(&domain->iovad);
3530
3531         /* clear ptes */
3532         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3533
3534         /* free page tables */
3535         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3536
3537         iommu_free_vm_domain(domain);
3538         free_domain_mem(domain);
3539 }
3540
3541 static int intel_iommu_domain_init(struct iommu_domain *domain)
3542 {
3543         struct dmar_domain *dmar_domain;
3544
3545         dmar_domain = iommu_alloc_vm_domain();
3546         if (!dmar_domain) {
3547                 printk(KERN_ERR
3548                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3549                 return -ENOMEM;
3550         }
3551         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3552                 printk(KERN_ERR
3553                         "intel_iommu_domain_init() failed\n");
3554                 vm_domain_exit(dmar_domain);
3555                 return -ENOMEM;
3556         }
3557         domain->priv = dmar_domain;
3558
3559         return 0;
3560 }
3561
3562 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3563 {
3564         struct dmar_domain *dmar_domain = domain->priv;
3565
3566         domain->priv = NULL;
3567         vm_domain_exit(dmar_domain);
3568 }
3569
3570 static int intel_iommu_attach_device(struct iommu_domain *domain,
3571                                      struct device *dev)
3572 {
3573         struct dmar_domain *dmar_domain = domain->priv;
3574         struct pci_dev *pdev = to_pci_dev(dev);
3575         struct intel_iommu *iommu;
3576         int addr_width;
3577         u64 end;
3578
3579         /* normally pdev is not mapped */
3580         if (unlikely(domain_context_mapped(pdev))) {
3581                 struct dmar_domain *old_domain;
3582
3583                 old_domain = find_domain(pdev);
3584                 if (old_domain) {
3585                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3586                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3587                                 domain_remove_one_dev_info(old_domain, pdev);
3588                         else
3589                                 domain_remove_dev_info(old_domain);
3590                 }
3591         }
3592
3593         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3594                                 pdev->devfn);
3595         if (!iommu)
3596                 return -ENODEV;
3597
3598         /* check if this iommu agaw is sufficient for max mapped address */
3599         addr_width = agaw_to_width(iommu->agaw);
3600         end = DOMAIN_MAX_ADDR(addr_width);
3601         end = end & VTD_PAGE_MASK;
3602         if (end < dmar_domain->max_addr) {
3603                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3604                        "sufficient for the mapped address (%llx)\n",
3605                        __func__, iommu->agaw, dmar_domain->max_addr);
3606                 return -EFAULT;
3607         }
3608
3609         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3610 }
3611
3612 static void intel_iommu_detach_device(struct iommu_domain *domain,
3613                                       struct device *dev)
3614 {
3615         struct dmar_domain *dmar_domain = domain->priv;
3616         struct pci_dev *pdev = to_pci_dev(dev);
3617
3618         domain_remove_one_dev_info(dmar_domain, pdev);
3619 }
3620
3621 static int intel_iommu_map_range(struct iommu_domain *domain,
3622                                  unsigned long iova, phys_addr_t hpa,
3623                                  size_t size, int iommu_prot)
3624 {
3625         struct dmar_domain *dmar_domain = domain->priv;
3626         u64 max_addr;
3627         int addr_width;
3628         int prot = 0;
3629         int ret;
3630
3631         if (iommu_prot & IOMMU_READ)
3632                 prot |= DMA_PTE_READ;
3633         if (iommu_prot & IOMMU_WRITE)
3634                 prot |= DMA_PTE_WRITE;
3635         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3636                 prot |= DMA_PTE_SNP;
3637
3638         max_addr = iova + size;
3639         if (dmar_domain->max_addr < max_addr) {
3640                 int min_agaw;
3641                 u64 end;
3642
3643                 /* check if minimum agaw is sufficient for mapped address */
3644                 min_agaw = vm_domain_min_agaw(dmar_domain);
3645                 addr_width = agaw_to_width(min_agaw);
3646                 end = DOMAIN_MAX_ADDR(addr_width);
3647                 end = end & VTD_PAGE_MASK;
3648                 if (end < max_addr) {
3649                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3650                                "sufficient for the mapped address (%llx)\n",
3651                                __func__, min_agaw, max_addr);
3652                         return -EFAULT;
3653                 }
3654                 dmar_domain->max_addr = max_addr;
3655         }
3656         /* Round up size to next multiple of PAGE_SIZE, if it and
3657            the low bits of hpa would take us onto the next page */
3658         size = aligned_nrpages(hpa, size);
3659         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3660                                  hpa >> VTD_PAGE_SHIFT, size, prot);
3661         return ret;
3662 }
3663
3664 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3665                                     unsigned long iova, size_t size)
3666 {
3667         struct dmar_domain *dmar_domain = domain->priv;
3668
3669         if (!size)
3670                 return;
3671
3672         dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3673                             (iova + size - 1) >> VTD_PAGE_SHIFT);
3674
3675         if (dmar_domain->max_addr == iova + size)
3676                 dmar_domain->max_addr = iova;
3677 }
3678
3679 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3680                                             unsigned long iova)
3681 {
3682         struct dmar_domain *dmar_domain = domain->priv;
3683         struct dma_pte *pte;
3684         u64 phys = 0;
3685
3686         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3687         if (pte)
3688                 phys = dma_pte_addr(pte);
3689
3690         return phys;
3691 }
3692
3693 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3694                                       unsigned long cap)
3695 {
3696         struct dmar_domain *dmar_domain = domain->priv;
3697
3698         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3699                 return dmar_domain->iommu_snooping;
3700
3701         return 0;
3702 }
3703
3704 static struct iommu_ops intel_iommu_ops = {
3705         .domain_init    = intel_iommu_domain_init,
3706         .domain_destroy = intel_iommu_domain_destroy,
3707         .attach_dev     = intel_iommu_attach_device,
3708         .detach_dev     = intel_iommu_detach_device,
3709         .map            = intel_iommu_map_range,
3710         .unmap          = intel_iommu_unmap_range,
3711         .iova_to_phys   = intel_iommu_iova_to_phys,
3712         .domain_has_cap = intel_iommu_domain_has_cap,
3713 };
3714
3715 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3716 {
3717         /*
3718          * Mobile 4 Series Chipset neglects to set RWBF capability,
3719          * but needs it:
3720          */
3721         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3722         rwbf_quirk = 1;
3723 }
3724
3725 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3726
3727 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3728    ISOCH DMAR unit for the Azalia sound device, but not give it any
3729    TLB entries, which causes it to deadlock. Check for that.  We do
3730    this in a function called from init_dmars(), instead of in a PCI
3731    quirk, because we don't want to print the obnoxious "BIOS broken"
3732    message if VT-d is actually disabled.
3733 */
3734 static void __init check_tylersburg_isoch(void)
3735 {
3736         struct pci_dev *pdev;
3737         uint32_t vtisochctrl;
3738
3739         /* If there's no Azalia in the system anyway, forget it. */
3740         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3741         if (!pdev)
3742                 return;
3743         pci_dev_put(pdev);
3744
3745         /* System Management Registers. Might be hidden, in which case
3746            we can't do the sanity check. But that's OK, because the
3747            known-broken BIOSes _don't_ actually hide it, so far. */
3748         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3749         if (!pdev)
3750                 return;
3751
3752         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3753                 pci_dev_put(pdev);
3754                 return;
3755         }
3756
3757         pci_dev_put(pdev);
3758
3759         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3760         if (vtisochctrl & 1)
3761                 return;
3762
3763         /* Drop all bits other than the number of TLB entries */
3764         vtisochctrl &= 0x1c;
3765
3766         /* If we have the recommended number of TLB entries (16), fine. */
3767         if (vtisochctrl == 0x10)
3768                 return;
3769
3770         /* Zero TLB entries? You get to ride the short bus to school. */
3771         if (!vtisochctrl) {
3772                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3773                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3774                      dmi_get_system_info(DMI_BIOS_VENDOR),
3775                      dmi_get_system_info(DMI_BIOS_VERSION),
3776                      dmi_get_system_info(DMI_PRODUCT_VERSION));
3777                 iommu_identity_mapping |= IDENTMAP_AZALIA;
3778                 return;
3779         }
3780         
3781         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3782                vtisochctrl);
3783 }