[linux-2.6.git] drivers/pci/intel-iommu.c @ commit c831136462230405ea1a24b8c929c4eceff2a57a
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
45
46 #define ROOT_SIZE               VTD_PAGE_SIZE
47 #define CONTEXT_SIZE            VTD_PAGE_SIZE
48
49 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
50 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
51 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
52
53 #define IOAPIC_RANGE_START      (0xfee00000)
54 #define IOAPIC_RANGE_END        (0xfeefffff)
55 #define IOVA_START_ADDR         (0x1000)
56
57 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
58
59 #define MAX_AGAW_WIDTH 64
60
61 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
62 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
63
64 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
65    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
66 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
67                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
68 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
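/*
 * Worked example (illustrative), assuming the default 48-bit guest address
 * width and VTD_PAGE_SHIFT == 12: __DOMAIN_MAX_PFN(48) = (1 << 36) - 1,
 * which fits in an unsigned long on 64-bit; on 32-bit DOMAIN_MAX_PFN()
 * clamps it to ~0UL. DOMAIN_MAX_ADDR(48) is then ((1 << 36) - 1) << 12,
 * i.e. 2^48 - 4096.
 */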
69
70 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
71 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
72 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
73
74
75 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
76    are never going to work. */
77 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
78 {
79         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
80 }
81
82 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
83 {
84         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
85 }
86 static inline unsigned long page_to_dma_pfn(struct page *pg)
87 {
88         return mm_to_dma_pfn(page_to_pfn(pg));
89 }
90 static inline unsigned long virt_to_dma_pfn(void *p)
91 {
92         return page_to_dma_pfn(virt_to_page(p));
93 }
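/*
 * Example for the pfn conversions above: with 4KiB MM pages
 * (PAGE_SHIFT == VTD_PAGE_SHIFT == 12) the shift is zero and both
 * conversions are the identity. If MM pages were larger, e.g. a
 * hypothetical 16KiB page size, one mm pfn would span four 4KiB VT-d
 * pfns (mm_to_dma_pfn(1) == 1 << 2 == 4).
 */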
94
95 /* global iommu list, set NULL for ignored DMAR units */
96 static struct intel_iommu **g_iommus;
97
98 static void __init check_tylersburg_isoch(void);
99 static int rwbf_quirk;
100
101 /*
102  * 0: Present
103  * 1-11: Reserved
104  * 12-63: Context Ptr (12 - (haw-1))
105  * 64-127: Reserved
106  */
107 struct root_entry {
108         u64     val;
109         u64     rsvd1;
110 };
111 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
112 static inline bool root_present(struct root_entry *root)
113 {
114         return (root->val & 1);
115 }
116 static inline void set_root_present(struct root_entry *root)
117 {
118         root->val |= 1;
119 }
120 static inline void set_root_value(struct root_entry *root, unsigned long value)
121 {
122         root->val |= value & VTD_PAGE_MASK;
123 }
124
125 static inline struct context_entry *
126 get_context_addr_from_root(struct root_entry *root)
127 {
128         return (struct context_entry *)
129                 (root_present(root)?phys_to_virt(
130                 root->val & VTD_PAGE_MASK) :
131                 NULL);
132 }
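/*
 * Example of decoding a root entry (value is illustrative):
 * val == 0x3fe000001 has the present bit set and points to a context
 * table at physical address 0x3fe000000; get_context_addr_from_root()
 * returns phys_to_virt() of that address.
 */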
133
134 /*
135  * low 64 bits:
136  * 0: present
137  * 1: fault processing disable
138  * 2-3: translation type
139  * 12-63: address space root
140  * high 64 bits:
141  * 0-2: address width
142  * 3-6: aval
143  * 8-23: domain id
144  */
145 struct context_entry {
146         u64 lo;
147         u64 hi;
148 };
149
150 static inline bool context_present(struct context_entry *context)
151 {
152         return (context->lo & 1);
153 }
154 static inline void context_set_present(struct context_entry *context)
155 {
156         context->lo |= 1;
157 }
158
159 static inline void context_set_fault_enable(struct context_entry *context)
160 {
161         context->lo &= (((u64)-1) << 2) | 1;
162 }
163
164 static inline void context_set_translation_type(struct context_entry *context,
165                                                 unsigned long value)
166 {
167         context->lo &= (((u64)-1) << 4) | 3;
168         context->lo |= (value & 3) << 2;
169 }
170
171 static inline void context_set_address_root(struct context_entry *context,
172                                             unsigned long value)
173 {
174         context->lo |= value & VTD_PAGE_MASK;
175 }
176
177 static inline void context_set_address_width(struct context_entry *context,
178                                              unsigned long value)
179 {
180         context->hi |= value & 7;
181 }
182
183 static inline void context_set_domain_id(struct context_entry *context,
184                                          unsigned long value)
185 {
186         context->hi |= (value & ((1 << 16) - 1)) << 8;
187 }
188
189 static inline void context_clear_entry(struct context_entry *context)
190 {
191         context->lo = 0;
192         context->hi = 0;
193 }
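/*
 * Example of how the setters above pack a context entry (values are
 * illustrative): for domain id 42 and a 4-level page table (agaw 2),
 * context_set_domain_id() and context_set_address_width() leave
 * hi == (42 << 8) | 2 == 0x2a02, while the low word holds the page-table
 * root (bits 12-63), the translation type (bits 2-3) and the present bit.
 */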
194
195 /*
196  * 0: readable
197  * 1: writable
198  * 2-6: reserved
199  * 7: super page
200  * 8-10: available
201  * 11: snoop behavior
202  * 12-63: Host physical address
203  */
204 struct dma_pte {
205         u64 val;
206 };
207
208 static inline void dma_clear_pte(struct dma_pte *pte)
209 {
210         pte->val = 0;
211 }
212
213 static inline void dma_set_pte_readable(struct dma_pte *pte)
214 {
215         pte->val |= DMA_PTE_READ;
216 }
217
218 static inline void dma_set_pte_writable(struct dma_pte *pte)
219 {
220         pte->val |= DMA_PTE_WRITE;
221 }
222
223 static inline void dma_set_pte_snp(struct dma_pte *pte)
224 {
225         pte->val |= DMA_PTE_SNP;
226 }
227
228 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
229 {
230         pte->val = (pte->val & ~3) | (prot & 3);
231 }
232
233 static inline u64 dma_pte_addr(struct dma_pte *pte)
234 {
235 #ifdef CONFIG_64BIT
236         return pte->val & VTD_PAGE_MASK;
237 #else
238         /* Must have a full atomic 64-bit read */
239         return  __cmpxchg64(pte, 0ULL, 0ULL) & VTD_PAGE_MASK;
240 #endif
241 }
242
243 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
244 {
245         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
246 }
247
248 static inline bool dma_pte_present(struct dma_pte *pte)
249 {
250         return (pte->val & 3) != 0;
251 }
252
253 static inline int first_pte_in_page(struct dma_pte *pte)
254 {
255         return !((unsigned long)pte & ~VTD_PAGE_MASK);
256 }
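/*
 * Example PTE (illustrative value): val == 0x12340803 maps host physical
 * page 0x12340000 with DMA_PTE_READ | DMA_PTE_WRITE and DMA_PTE_SNP set;
 * dma_pte_addr() returns 0x12340000 and dma_pte_present() is true because
 * at least one of the R/W bits is set.
 */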
257
258 /*
259  * This domain is a static identity mapping domain.
260  *      1. This domain creates a static 1:1 mapping to all usable memory.
261  *      2. It maps to each iommu if successful.
262  *      3. Each iommu maps to this domain if successful.
263  */
264 static struct dmar_domain *si_domain;
265 static int hw_pass_through = 1;
266
267 /* devices under the same p2p bridge are owned in one domain */
268 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
269
270 /* domain represents a virtual machine; more than one device
271  * across iommus may be owned by one domain, e.g. a kvm guest.
272  */
273 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
274
275 /* si_domain contains multiple devices */
276 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
277
278 struct dmar_domain {
279         int     id;                     /* domain id */
280         int     nid;                    /* node id */
281         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses */
282
283         struct list_head devices;       /* all devices' list */
284         struct iova_domain iovad;       /* iova's that belong to this domain */
285
286         struct dma_pte  *pgd;           /* virtual address */
287         int             gaw;            /* max guest address width */
288
289         /* adjusted guest address width, 0 is level 2 30-bit */
290         int             agaw;
291
292         int             flags;          /* flags to find out type of domain */
293
294         int             iommu_coherency; /* indicate coherency of iommu access */
295         int             iommu_snooping; /* indicate snooping control feature */
296         int             iommu_count;    /* reference count of iommu */
297         spinlock_t      iommu_lock;     /* protect iommu set in domain */
298         u64             max_addr;       /* maximum mapped address */
299 };
300
301 /* PCI domain-device relationship */
302 struct device_domain_info {
303         struct list_head link;  /* link to domain siblings */
304         struct list_head global; /* link to global list */
305         int segment;            /* PCI domain */
306         u8 bus;                 /* PCI bus number */
307         u8 devfn;               /* PCI devfn number */
308         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
309         struct intel_iommu *iommu; /* IOMMU used by this device */
310         struct dmar_domain *domain; /* pointer to domain */
311 };
312
313 static void flush_unmaps_timeout(unsigned long data);
314
315 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
316
317 #define HIGH_WATER_MARK 250
318 struct deferred_flush_tables {
319         int next;
320         struct iova *iova[HIGH_WATER_MARK];
321         struct dmar_domain *domain[HIGH_WATER_MARK];
322 };
323
324 static struct deferred_flush_tables *deferred_flush;
325
326 /* bitmap for indexing intel_iommus */
327 static int g_num_of_iommus;
328
329 static DEFINE_SPINLOCK(async_umap_flush_lock);
330 static LIST_HEAD(unmaps_to_do);
331
332 static int timer_on;
333 static long list_size;
334
335 static void domain_remove_dev_info(struct dmar_domain *domain);
336
337 #ifdef CONFIG_DMAR_DEFAULT_ON
338 int dmar_disabled = 0;
339 #else
340 int dmar_disabled = 1;
341 #endif /*CONFIG_DMAR_DEFAULT_ON*/
342
343 static int __initdata dmar_map_gfx = 1;
344 static int dmar_forcedac;
345 static int intel_iommu_strict;
346
347 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
348 static DEFINE_SPINLOCK(device_domain_lock);
349 static LIST_HEAD(device_domain_list);
350
351 static struct iommu_ops intel_iommu_ops;
352
353 static int __init intel_iommu_setup(char *str)
354 {
355         if (!str)
356                 return -EINVAL;
357         while (*str) {
358                 if (!strncmp(str, "on", 2)) {
359                         dmar_disabled = 0;
360                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
361                 } else if (!strncmp(str, "off", 3)) {
362                         dmar_disabled = 1;
363                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
364                 } else if (!strncmp(str, "igfx_off", 8)) {
365                         dmar_map_gfx = 0;
366                         printk(KERN_INFO
367                                 "Intel-IOMMU: disable GFX device mapping\n");
368                 } else if (!strncmp(str, "forcedac", 8)) {
369                         printk(KERN_INFO
370                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
371                         dmar_forcedac = 1;
372                 } else if (!strncmp(str, "strict", 6)) {
373                         printk(KERN_INFO
374                                 "Intel-IOMMU: disable batched IOTLB flush\n");
375                         intel_iommu_strict = 1;
376                 }
377
378                 str += strcspn(str, ",");
379                 while (*str == ',')
380                         str++;
381         }
382         return 0;
383 }
384 __setup("intel_iommu=", intel_iommu_setup);
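/*
 * Example usage of the boot parameter parsed above (options are
 * comma-separated and handled in order), e.g.:
 *
 *     intel_iommu=on,strict    - enable the IOMMU, unbatched IOTLB flush
 *     intel_iommu=igfx_off     - leave graphics devices untranslated
 *     intel_iommu=forcedac     - force DAC (64-bit DMA addresses) for PCI devices
 */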
385
386 static struct kmem_cache *iommu_domain_cache;
387 static struct kmem_cache *iommu_devinfo_cache;
388 static struct kmem_cache *iommu_iova_cache;
389
390 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
391 {
392         unsigned int flags;
393         void *vaddr;
394
395         /* trying to avoid low memory issues */
396         flags = current->flags & PF_MEMALLOC;
397         current->flags |= PF_MEMALLOC;
398         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
399         current->flags &= (~PF_MEMALLOC | flags);
400         return vaddr;
401 }
402
403
404 static inline void *alloc_pgtable_page(int node)
405 {
406         unsigned int flags;
407         struct page *page;
408         void *vaddr = NULL;
409
410         /* trying to avoid low memory issues */
411         flags = current->flags & PF_MEMALLOC;
412         current->flags |= PF_MEMALLOC;
413         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
414         if (page)
415                 vaddr = page_address(page);
416         current->flags &= (~PF_MEMALLOC | flags);
417         return vaddr;
418 }
419
420 static inline void free_pgtable_page(void *vaddr)
421 {
422         free_page((unsigned long)vaddr);
423 }
424
425 static inline void *alloc_domain_mem(void)
426 {
427         return iommu_kmem_cache_alloc(iommu_domain_cache);
428 }
429
430 static void free_domain_mem(void *vaddr)
431 {
432         kmem_cache_free(iommu_domain_cache, vaddr);
433 }
434
435 static inline void * alloc_devinfo_mem(void)
436 {
437         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
438 }
439
440 static inline void free_devinfo_mem(void *vaddr)
441 {
442         kmem_cache_free(iommu_devinfo_cache, vaddr);
443 }
444
445 struct iova *alloc_iova_mem(void)
446 {
447         return iommu_kmem_cache_alloc(iommu_iova_cache);
448 }
449
450 void free_iova_mem(struct iova *iova)
451 {
452         kmem_cache_free(iommu_iova_cache, iova);
453 }
454
455
456 static inline int width_to_agaw(int width);
457
458 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
459 {
460         unsigned long sagaw;
461         int agaw = -1;
462
463         sagaw = cap_sagaw(iommu->cap);
464         for (agaw = width_to_agaw(max_gaw);
465              agaw >= 0; agaw--) {
466                 if (test_bit(agaw, &sagaw))
467                         break;
468         }
469
470         return agaw;
471 }
472
473 /*
474  * Calculate max SAGAW for each iommu.
475  */
476 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
477 {
478         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
479 }
480
481 /*
482  * calculate agaw for each iommu.
483  * "SAGAW" may be different across iommus, use a default agaw, and
484  * get a supported less agaw for iommus that don't support the default agaw.
485  */
486 int iommu_calculate_agaw(struct intel_iommu *iommu)
487 {
488         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
489 }
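/*
 * Example: with DEFAULT_DOMAIN_ADDRESS_WIDTH == 48, width_to_agaw(48) is
 * (48 - 30) / 9 == 2. If bit 2 of cap_sagaw() is set, agaw 2 (a 4-level,
 * 48-bit page table) is used; otherwise the loop above falls back to the
 * next smaller supported agaw (1 for 39-bit, 0 for 30-bit).
 */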
490
491 /* This function only returns the single iommu in a domain */
492 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
493 {
494         int iommu_id;
495
496         /* si_domain and vm domain should not get here. */
497         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
498         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
499
500         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
501         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
502                 return NULL;
503
504         return g_iommus[iommu_id];
505 }
506
507 static void domain_update_iommu_coherency(struct dmar_domain *domain)
508 {
509         int i;
510
511         domain->iommu_coherency = 1;
512
513         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
514         for (; i < g_num_of_iommus; ) {
515                 if (!ecap_coherent(g_iommus[i]->ecap)) {
516                         domain->iommu_coherency = 0;
517                         break;
518                 }
519                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
520         }
521 }
522
523 static void domain_update_iommu_snooping(struct dmar_domain *domain)
524 {
525         int i;
526
527         domain->iommu_snooping = 1;
528
529         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
530         for (; i < g_num_of_iommus; ) {
531                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
532                         domain->iommu_snooping = 0;
533                         break;
534                 }
535                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
536         }
537 }
538
539 /* Some capabilities may be different across iommus */
540 static void domain_update_iommu_cap(struct dmar_domain *domain)
541 {
542         domain_update_iommu_coherency(domain);
543         domain_update_iommu_snooping(domain);
544 }
545
546 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
547 {
548         struct dmar_drhd_unit *drhd = NULL;
549         int i;
550
551         for_each_drhd_unit(drhd) {
552                 if (drhd->ignored)
553                         continue;
554                 if (segment != drhd->segment)
555                         continue;
556
557                 for (i = 0; i < drhd->devices_cnt; i++) {
558                         if (drhd->devices[i] &&
559                             drhd->devices[i]->bus->number == bus &&
560                             drhd->devices[i]->devfn == devfn)
561                                 return drhd->iommu;
562                         if (drhd->devices[i] &&
563                             drhd->devices[i]->subordinate &&
564                             drhd->devices[i]->subordinate->number <= bus &&
565                             drhd->devices[i]->subordinate->subordinate >= bus)
566                                 return drhd->iommu;
567                 }
568
569                 if (drhd->include_all)
570                         return drhd->iommu;
571         }
572
573         return NULL;
574 }
575
576 static void domain_flush_cache(struct dmar_domain *domain,
577                                void *addr, int size)
578 {
579         if (!domain->iommu_coherency)
580                 clflush_cache_range(addr, size);
581 }
582
583 /* Gets context entry for a given bus and devfn */
584 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
585                 u8 bus, u8 devfn)
586 {
587         struct root_entry *root;
588         struct context_entry *context;
589         unsigned long phy_addr;
590         unsigned long flags;
591
592         spin_lock_irqsave(&iommu->lock, flags);
593         root = &iommu->root_entry[bus];
594         context = get_context_addr_from_root(root);
595         if (!context) {
596                 context = (struct context_entry *)
597                                 alloc_pgtable_page(iommu->node);
598                 if (!context) {
599                         spin_unlock_irqrestore(&iommu->lock, flags);
600                         return NULL;
601                 }
602                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
603                 phy_addr = virt_to_phys((void *)context);
604                 set_root_value(root, phy_addr);
605                 set_root_present(root);
606                 __iommu_flush_cache(iommu, root, sizeof(*root));
607         }
608         spin_unlock_irqrestore(&iommu->lock, flags);
609         return &context[devfn];
610 }
611
612 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
613 {
614         struct root_entry *root;
615         struct context_entry *context;
616         int ret;
617         unsigned long flags;
618
619         spin_lock_irqsave(&iommu->lock, flags);
620         root = &iommu->root_entry[bus];
621         context = get_context_addr_from_root(root);
622         if (!context) {
623                 ret = 0;
624                 goto out;
625         }
626         ret = context_present(&context[devfn]);
627 out:
628         spin_unlock_irqrestore(&iommu->lock, flags);
629         return ret;
630 }
631
632 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
633 {
634         struct root_entry *root;
635         struct context_entry *context;
636         unsigned long flags;
637
638         spin_lock_irqsave(&iommu->lock, flags);
639         root = &iommu->root_entry[bus];
640         context = get_context_addr_from_root(root);
641         if (context) {
642                 context_clear_entry(&context[devfn]);
643                 __iommu_flush_cache(iommu, &context[devfn], \
644                         sizeof(*context));
645         }
646         spin_unlock_irqrestore(&iommu->lock, flags);
647 }
648
649 static void free_context_table(struct intel_iommu *iommu)
650 {
651         struct root_entry *root;
652         int i;
653         unsigned long flags;
654         struct context_entry *context;
655
656         spin_lock_irqsave(&iommu->lock, flags);
657         if (!iommu->root_entry) {
658                 goto out;
659         }
660         for (i = 0; i < ROOT_ENTRY_NR; i++) {
661                 root = &iommu->root_entry[i];
662                 context = get_context_addr_from_root(root);
663                 if (context)
664                         free_pgtable_page(context);
665         }
666         free_pgtable_page(iommu->root_entry);
667         iommu->root_entry = NULL;
668 out:
669         spin_unlock_irqrestore(&iommu->lock, flags);
670 }
671
672 /* page table handling */
673 #define LEVEL_STRIDE            (9)
674 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
675
676 static inline int agaw_to_level(int agaw)
677 {
678         return agaw + 2;
679 }
680
681 static inline int agaw_to_width(int agaw)
682 {
683         return 30 + agaw * LEVEL_STRIDE;
684
685 }
686
687 static inline int width_to_agaw(int width)
688 {
689         return (width - 30) / LEVEL_STRIDE;
690 }
691
692 static inline unsigned int level_to_offset_bits(int level)
693 {
694         return (level - 1) * LEVEL_STRIDE;
695 }
696
697 static inline int pfn_level_offset(unsigned long pfn, int level)
698 {
699         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
700 }
701
702 static inline unsigned long level_mask(int level)
703 {
704         return -1UL << level_to_offset_bits(level);
705 }
706
707 static inline unsigned long level_size(int level)
708 {
709         return 1UL << level_to_offset_bits(level);
710 }
711
712 static inline unsigned long align_to_level(unsigned long pfn, int level)
713 {
714         return (pfn + level_size(level) - 1) & level_mask(level);
715 }
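/*
 * Worked example of the level arithmetic above for a 48-bit (agaw 2,
 * 4-level) domain: each level indexes LEVEL_STRIDE == 9 bits of the dma
 * pfn, so pfn_level_offset() uses bits 27-35 at level 4, 18-26 at level 3,
 * 9-17 at level 2 and 0-8 at level 1. level_size(2) is 512 pfns (2MiB of
 * 4KiB pages) and align_to_level(0x201, 2) rounds up to 0x400.
 */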
716
717 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
718                                       unsigned long pfn)
719 {
720         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
721         struct dma_pte *parent, *pte = NULL;
722         int level = agaw_to_level(domain->agaw);
723         int offset;
724
725         BUG_ON(!domain->pgd);
726         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
727         parent = domain->pgd;
728
729         while (level > 0) {
730                 void *tmp_page;
731
732                 offset = pfn_level_offset(pfn, level);
733                 pte = &parent[offset];
734                 if (level == 1)
735                         break;
736
737                 if (!dma_pte_present(pte)) {
738                         uint64_t pteval;
739
740                         tmp_page = alloc_pgtable_page(domain->nid);
741
742                         if (!tmp_page)
743                                 return NULL;
744
745                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
746                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
747                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
748                                 /* Someone else set it while we were thinking; use theirs. */
749                                 free_pgtable_page(tmp_page);
750                         } else {
751                                 dma_pte_addr(pte);
752                                 domain_flush_cache(domain, pte, sizeof(*pte));
753                         }
754                 }
755                 parent = phys_to_virt(dma_pte_addr(pte));
756                 level--;
757         }
758
759         return pte;
760 }
761
762 /* return address's pte at specific level */
763 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
764                                          unsigned long pfn,
765                                          int level)
766 {
767         struct dma_pte *parent, *pte = NULL;
768         int total = agaw_to_level(domain->agaw);
769         int offset;
770
771         parent = domain->pgd;
772         while (level <= total) {
773                 offset = pfn_level_offset(pfn, total);
774                 pte = &parent[offset];
775                 if (level == total)
776                         return pte;
777
778                 if (!dma_pte_present(pte))
779                         break;
780                 parent = phys_to_virt(dma_pte_addr(pte));
781                 total--;
782         }
783         return NULL;
784 }
785
786 /* clear last level pte; a tlb flush should follow */
787 static void dma_pte_clear_range(struct dmar_domain *domain,
788                                 unsigned long start_pfn,
789                                 unsigned long last_pfn)
790 {
791         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
792         struct dma_pte *first_pte, *pte;
793
794         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
795         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
796         BUG_ON(start_pfn > last_pfn);
797
798         /* we don't need lock here; nobody else touches the iova range */
799         do {
800                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
801                 if (!pte) {
802                         start_pfn = align_to_level(start_pfn + 1, 2);
803                         continue;
804                 }
805                 do { 
806                         dma_clear_pte(pte);
807                         start_pfn++;
808                         pte++;
809                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
810
811                 domain_flush_cache(domain, first_pte,
812                                    (void *)pte - (void *)first_pte);
813
814         } while (start_pfn && start_pfn <= last_pfn);
815 }
816
817 /* free page table pages. last level pte should already be cleared */
818 static void dma_pte_free_pagetable(struct dmar_domain *domain,
819                                    unsigned long start_pfn,
820                                    unsigned long last_pfn)
821 {
822         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
823         struct dma_pte *first_pte, *pte;
824         int total = agaw_to_level(domain->agaw);
825         int level;
826         unsigned long tmp;
827
828         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
829         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
830         BUG_ON(start_pfn > last_pfn);
831
832         /* We don't need lock here; nobody else touches the iova range */
833         level = 2;
834         while (level <= total) {
835                 tmp = align_to_level(start_pfn, level);
836
837                 /* If we can't even clear one PTE at this level, we're done */
838                 if (tmp + level_size(level) - 1 > last_pfn)
839                         return;
840
841                 do {
842                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
843                         if (!pte) {
844                                 tmp = align_to_level(tmp + 1, level + 1);
845                                 continue;
846                         }
847                         do {
848                                 if (dma_pte_present(pte)) {
849                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
850                                         dma_clear_pte(pte);
851                                 }
852                                 pte++;
853                                 tmp += level_size(level);
854                         } while (!first_pte_in_page(pte) &&
855                                  tmp + level_size(level) - 1 <= last_pfn);
856
857                         domain_flush_cache(domain, first_pte,
858                                            (void *)pte - (void *)first_pte);
859                         
860                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
861                 level++;
862         }
863         /* free pgd */
864         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
865                 free_pgtable_page(domain->pgd);
866                 domain->pgd = NULL;
867         }
868 }
869
870 /* iommu handling */
871 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
872 {
873         struct root_entry *root;
874         unsigned long flags;
875
876         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
877         if (!root)
878                 return -ENOMEM;
879
880         __iommu_flush_cache(iommu, root, ROOT_SIZE);
881
882         spin_lock_irqsave(&iommu->lock, flags);
883         iommu->root_entry = root;
884         spin_unlock_irqrestore(&iommu->lock, flags);
885
886         return 0;
887 }
888
889 static void iommu_set_root_entry(struct intel_iommu *iommu)
890 {
891         void *addr;
892         u32 sts;
893         unsigned long flag;
894
895         addr = iommu->root_entry;
896
897         spin_lock_irqsave(&iommu->register_lock, flag);
898         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
899
900         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
901
902         /* Make sure hardware completes it */
903         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
904                       readl, (sts & DMA_GSTS_RTPS), sts);
905
906         spin_unlock_irqrestore(&iommu->register_lock, flag);
907 }
908
909 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
910 {
911         u32 val;
912         unsigned long flag;
913
914         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
915                 return;
916
917         spin_lock_irqsave(&iommu->register_lock, flag);
918         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
919
920         /* Make sure hardware completes it */
921         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
922                       readl, (!(val & DMA_GSTS_WBFS)), val);
923
924         spin_unlock_irqrestore(&iommu->register_lock, flag);
925 }
926
927 /* return value determines if we need a write buffer flush */
928 static void __iommu_flush_context(struct intel_iommu *iommu,
929                                   u16 did, u16 source_id, u8 function_mask,
930                                   u64 type)
931 {
932         u64 val = 0;
933         unsigned long flag;
934
935         switch (type) {
936         case DMA_CCMD_GLOBAL_INVL:
937                 val = DMA_CCMD_GLOBAL_INVL;
938                 break;
939         case DMA_CCMD_DOMAIN_INVL:
940                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
941                 break;
942         case DMA_CCMD_DEVICE_INVL:
943                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
944                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
945                 break;
946         default:
947                 BUG();
948         }
949         val |= DMA_CCMD_ICC;
950
951         spin_lock_irqsave(&iommu->register_lock, flag);
952         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
953
954         /* Make sure hardware completes it */
955         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
956                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
957
958         spin_unlock_irqrestore(&iommu->register_lock, flag);
959 }
960
961 /* return value determines if we need a write buffer flush */
962 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
963                                 u64 addr, unsigned int size_order, u64 type)
964 {
965         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
966         u64 val = 0, val_iva = 0;
967         unsigned long flag;
968
969         switch (type) {
970         case DMA_TLB_GLOBAL_FLUSH:
971                 /* global flush doesn't need to set IVA_REG */
972                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
973                 break;
974         case DMA_TLB_DSI_FLUSH:
975                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
976                 break;
977         case DMA_TLB_PSI_FLUSH:
978                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
979                 /* Note: always flush non-leaf currently */
980                 val_iva = size_order | addr;
981                 break;
982         default:
983                 BUG();
984         }
985         /* Note: set drain read/write */
986 #if 0
987         /*
988          * This is probably just to be extra safe. Looks like we can
989          * ignore it without any impact.
990          */
991         if (cap_read_drain(iommu->cap))
992                 val |= DMA_TLB_READ_DRAIN;
993 #endif
994         if (cap_write_drain(iommu->cap))
995                 val |= DMA_TLB_WRITE_DRAIN;
996
997         spin_lock_irqsave(&iommu->register_lock, flag);
998         /* Note: Only uses first TLB reg currently */
999         if (val_iva)
1000                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1001         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1002
1003         /* Make sure hardware completes it */
1004         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1005                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1006
1007         spin_unlock_irqrestore(&iommu->register_lock, flag);
1008
1009         /* check IOTLB invalidation granularity */
1010         if (DMA_TLB_IAIG(val) == 0)
1011                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1012         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1013                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1014                         (unsigned long long)DMA_TLB_IIRG(type),
1015                         (unsigned long long)DMA_TLB_IAIG(val));
1016 }
1017
1018 static struct device_domain_info *iommu_support_dev_iotlb(
1019         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1020 {
1021         int found = 0;
1022         unsigned long flags;
1023         struct device_domain_info *info;
1024         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1025
1026         if (!ecap_dev_iotlb_support(iommu->ecap))
1027                 return NULL;
1028
1029         if (!iommu->qi)
1030                 return NULL;
1031
1032         spin_lock_irqsave(&device_domain_lock, flags);
1033         list_for_each_entry(info, &domain->devices, link)
1034                 if (info->bus == bus && info->devfn == devfn) {
1035                         found = 1;
1036                         break;
1037                 }
1038         spin_unlock_irqrestore(&device_domain_lock, flags);
1039
1040         if (!found || !info->dev)
1041                 return NULL;
1042
1043         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1044                 return NULL;
1045
1046         if (!dmar_find_matched_atsr_unit(info->dev))
1047                 return NULL;
1048
1049         info->iommu = iommu;
1050
1051         return info;
1052 }
1053
1054 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1055 {
1056         if (!info)
1057                 return;
1058
1059         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1060 }
1061
1062 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1063 {
1064         if (!info->dev || !pci_ats_enabled(info->dev))
1065                 return;
1066
1067         pci_disable_ats(info->dev);
1068 }
1069
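/*
 * The device-IOTLB (ATS) invalidations below are addressed by PCI
 * requester id: sid = bus << 8 | devfn, e.g. device 1a:02.0 gives
 * sid 0x1a10 (devfn 0x10), and qdep is the invalidate queue depth read
 * from the device's ATS capability.
 */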
1070 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1071                                   u64 addr, unsigned mask)
1072 {
1073         u16 sid, qdep;
1074         unsigned long flags;
1075         struct device_domain_info *info;
1076
1077         spin_lock_irqsave(&device_domain_lock, flags);
1078         list_for_each_entry(info, &domain->devices, link) {
1079                 if (!info->dev || !pci_ats_enabled(info->dev))
1080                         continue;
1081
1082                 sid = info->bus << 8 | info->devfn;
1083                 qdep = pci_ats_queue_depth(info->dev);
1084                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1085         }
1086         spin_unlock_irqrestore(&device_domain_lock, flags);
1087 }
1088
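/*
 * Page-selective invalidation takes a power-of-two page count encoded as
 * a mask: e.g. flushing 3 pages rounds up to 4, giving mask == 2, so the
 * hardware invalidates 2^2 naturally aligned pages; a single page gives
 * mask == 0.
 */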
1089 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1090                                   unsigned long pfn, unsigned int pages)
1091 {
1092         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1093         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1094
1095         BUG_ON(pages == 0);
1096
1097         /*
1098          * Fallback to domain selective flush if no PSI support or the size is
1099          * too big.
1100          * PSI requires page size to be 2 ^ x, and the base address is naturally
1101          * aligned to the size
1102          */
1103         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1104                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1105                                                 DMA_TLB_DSI_FLUSH);
1106         else
1107                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1108                                                 DMA_TLB_PSI_FLUSH);
1109
1110         /*
1111          * In caching mode, domain ID 0 is reserved for non-present to present
1112          * mapping flush. Device IOTLB doesn't need to be flushed in this case.
1113          */
1114         if (!cap_caching_mode(iommu->cap) || did)
1115                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1116 }
1117
1118 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1119 {
1120         u32 pmen;
1121         unsigned long flags;
1122
1123         spin_lock_irqsave(&iommu->register_lock, flags);
1124         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1125         pmen &= ~DMA_PMEN_EPM;
1126         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1127
1128         /* wait for the protected region status bit to clear */
1129         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1130                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1131
1132         spin_unlock_irqrestore(&iommu->register_lock, flags);
1133 }
1134
1135 static int iommu_enable_translation(struct intel_iommu *iommu)
1136 {
1137         u32 sts;
1138         unsigned long flags;
1139
1140         spin_lock_irqsave(&iommu->register_lock, flags);
1141         iommu->gcmd |= DMA_GCMD_TE;
1142         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1143
1144         /* Make sure hardware completes it */
1145         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1146                       readl, (sts & DMA_GSTS_TES), sts);
1147
1148         spin_unlock_irqrestore(&iommu->register_lock, flags);
1149         return 0;
1150 }
1151
1152 static int iommu_disable_translation(struct intel_iommu *iommu)
1153 {
1154         u32 sts;
1155         unsigned long flag;
1156
1157         spin_lock_irqsave(&iommu->register_lock, flag);
1158         iommu->gcmd &= ~DMA_GCMD_TE;
1159         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1160
1161         /* Make sure hardware completes it */
1162         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1163                       readl, (!(sts & DMA_GSTS_TES)), sts);
1164
1165         spin_unlock_irqrestore(&iommu->register_lock, flag);
1166         return 0;
1167 }
1168
1169
1170 static int iommu_init_domains(struct intel_iommu *iommu)
1171 {
1172         unsigned long ndomains;
1173         unsigned long nlongs;
1174
1175         ndomains = cap_ndoms(iommu->cap);
1176         pr_debug("Number of Domains supportd <%ld>\n", ndomains);
1177         nlongs = BITS_TO_LONGS(ndomains);
1178
1179         spin_lock_init(&iommu->lock);
1180
1181         /* TBD: there might be 64K domains,
1182          * consider other allocation for future chip
1183          */
1184         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1185         if (!iommu->domain_ids) {
1186                 printk(KERN_ERR "Allocating domain id array failed\n");
1187                 return -ENOMEM;
1188         }
1189         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1190                         GFP_KERNEL);
1191         if (!iommu->domains) {
1192                 printk(KERN_ERR "Allocating domain array failed\n");
1193                 return -ENOMEM;
1194         }
1195
1196         /*
1197          * if Caching mode is set, then invalid translations are tagged
1198          * with domain id 0. Hence we need to pre-allocate it.
1199          */
1200         if (cap_caching_mode(iommu->cap))
1201                 set_bit(0, iommu->domain_ids);
1202         return 0;
1203 }
1204
1205
1206 static void domain_exit(struct dmar_domain *domain);
1207 static void vm_domain_exit(struct dmar_domain *domain);
1208
1209 void free_dmar_iommu(struct intel_iommu *iommu)
1210 {
1211         struct dmar_domain *domain;
1212         int i;
1213         unsigned long flags;
1214
1215         if ((iommu->domains) && (iommu->domain_ids)) {
1216                 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1217                 for (; i < cap_ndoms(iommu->cap); ) {
1218                         domain = iommu->domains[i];
1219                         clear_bit(i, iommu->domain_ids);
1220
1221                         spin_lock_irqsave(&domain->iommu_lock, flags);
1222                         if (--domain->iommu_count == 0) {
1223                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1224                                         vm_domain_exit(domain);
1225                                 else
1226                                         domain_exit(domain);
1227                         }
1228                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1229
1230                         i = find_next_bit(iommu->domain_ids,
1231                                 cap_ndoms(iommu->cap), i+1);
1232                 }
1233         }
1234
1235         if (iommu->gcmd & DMA_GCMD_TE)
1236                 iommu_disable_translation(iommu);
1237
1238         if (iommu->irq) {
1239                 set_irq_data(iommu->irq, NULL);
1240                 /* This will mask the irq */
1241                 free_irq(iommu->irq, iommu);
1242                 destroy_irq(iommu->irq);
1243         }
1244
1245         kfree(iommu->domains);
1246         kfree(iommu->domain_ids);
1247
1248         g_iommus[iommu->seq_id] = NULL;
1249
1250         /* if all iommus are freed, free g_iommus */
1251         for (i = 0; i < g_num_of_iommus; i++) {
1252                 if (g_iommus[i])
1253                         break;
1254         }
1255
1256         if (i == g_num_of_iommus)
1257                 kfree(g_iommus);
1258
1259         /* free context mapping */
1260         free_context_table(iommu);
1261 }
1262
1263 static struct dmar_domain *alloc_domain(void)
1264 {
1265         struct dmar_domain *domain;
1266
1267         domain = alloc_domain_mem();
1268         if (!domain)
1269                 return NULL;
1270
1271         domain->nid = -1;
1272         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1273         domain->flags = 0;
1274
1275         return domain;
1276 }
1277
1278 static int iommu_attach_domain(struct dmar_domain *domain,
1279                                struct intel_iommu *iommu)
1280 {
1281         int num;
1282         unsigned long ndomains;
1283         unsigned long flags;
1284
1285         ndomains = cap_ndoms(iommu->cap);
1286
1287         spin_lock_irqsave(&iommu->lock, flags);
1288
1289         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1290         if (num >= ndomains) {
1291                 spin_unlock_irqrestore(&iommu->lock, flags);
1292                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1293                 return -ENOMEM;
1294         }
1295
1296         domain->id = num;
1297         set_bit(num, iommu->domain_ids);
1298         set_bit(iommu->seq_id, &domain->iommu_bmp);
1299         iommu->domains[num] = domain;
1300         spin_unlock_irqrestore(&iommu->lock, flags);
1301
1302         return 0;
1303 }
1304
1305 static void iommu_detach_domain(struct dmar_domain *domain,
1306                                 struct intel_iommu *iommu)
1307 {
1308         unsigned long flags;
1309         int num, ndomains;
1310         int found = 0;
1311
1312         spin_lock_irqsave(&iommu->lock, flags);
1313         ndomains = cap_ndoms(iommu->cap);
1314         num = find_first_bit(iommu->domain_ids, ndomains);
1315         for (; num < ndomains; ) {
1316                 if (iommu->domains[num] == domain) {
1317                         found = 1;
1318                         break;
1319                 }
1320                 num = find_next_bit(iommu->domain_ids,
1321                                     cap_ndoms(iommu->cap), num+1);
1322         }
1323
1324         if (found) {
1325                 clear_bit(num, iommu->domain_ids);
1326                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1327                 iommu->domains[num] = NULL;
1328         }
1329         spin_unlock_irqrestore(&iommu->lock, flags);
1330 }
1331
1332 static struct iova_domain reserved_iova_list;
1333 static struct lock_class_key reserved_rbtree_key;
1334
1335 static void dmar_init_reserved_ranges(void)
1336 {
1337         struct pci_dev *pdev = NULL;
1338         struct iova *iova;
1339         int i;
1340
1341         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1342
1343         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1344                 &reserved_rbtree_key);
1345
1346         /* IOAPIC ranges shouldn't be accessed by DMA */
1347         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1348                 IOVA_PFN(IOAPIC_RANGE_END));
1349         if (!iova)
1350                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1351
1352         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1353         for_each_pci_dev(pdev) {
1354                 struct resource *r;
1355
1356                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1357                         r = &pdev->resource[i];
1358                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1359                                 continue;
1360                         iova = reserve_iova(&reserved_iova_list,
1361                                             IOVA_PFN(r->start),
1362                                             IOVA_PFN(r->end));
1363                         if (!iova)
1364                                 printk(KERN_ERR "Reserve iova failed\n");
1365                 }
1366         }
1367
1368 }
1369
1370 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1371 {
1372         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1373 }
1374
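/*
 * Round a guest address width up to a whole number of page-table levels,
 * i.e. so that (width - 12) is a multiple of 9, capped at 64. For example
 * a 40-bit guest width is adjusted to 48 ((40 - 12) % 9 == 1, so
 * 40 + 9 - 1), while 39 and 48 are already aligned and stay unchanged.
 */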
1375 static inline int guestwidth_to_adjustwidth(int gaw)
1376 {
1377         int agaw;
1378         int r = (gaw - 12) % 9;
1379
1380         if (r == 0)
1381                 agaw = gaw;
1382         else
1383                 agaw = gaw + 9 - r;
1384         if (agaw > 64)
1385                 agaw = 64;
1386         return agaw;
1387 }
1388
1389 static int domain_init(struct dmar_domain *domain, int guest_width)
1390 {
1391         struct intel_iommu *iommu;
1392         int adjust_width, agaw;
1393         unsigned long sagaw;
1394
1395         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1396         spin_lock_init(&domain->iommu_lock);
1397
1398         domain_reserve_special_ranges(domain);
1399
1400         /* calculate AGAW */
1401         iommu = domain_get_iommu(domain);
1402         if (guest_width > cap_mgaw(iommu->cap))
1403                 guest_width = cap_mgaw(iommu->cap);
1404         domain->gaw = guest_width;
1405         adjust_width = guestwidth_to_adjustwidth(guest_width);
1406         agaw = width_to_agaw(adjust_width);
1407         sagaw = cap_sagaw(iommu->cap);
1408         if (!test_bit(agaw, &sagaw)) {
1409                 /* hardware doesn't support it, choose a bigger one */
1410                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1411                 agaw = find_next_bit(&sagaw, 5, agaw);
1412                 if (agaw >= 5)
1413                         return -ENODEV;
1414         }
1415         domain->agaw = agaw;
1416         INIT_LIST_HEAD(&domain->devices);
1417
1418         if (ecap_coherent(iommu->ecap))
1419                 domain->iommu_coherency = 1;
1420         else
1421                 domain->iommu_coherency = 0;
1422
1423         if (ecap_sc_support(iommu->ecap))
1424                 domain->iommu_snooping = 1;
1425         else
1426                 domain->iommu_snooping = 0;
1427
1428         domain->iommu_count = 1;
1429         domain->nid = iommu->node;
1430
1431         /* always allocate the top pgd */
1432         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1433         if (!domain->pgd)
1434                 return -ENOMEM;
1435         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1436         return 0;
1437 }
1438
1439 static void domain_exit(struct dmar_domain *domain)
1440 {
1441         struct dmar_drhd_unit *drhd;
1442         struct intel_iommu *iommu;
1443
1444         /* Domain 0 is reserved, so don't process it */
1445         if (!domain)
1446                 return;
1447
1448         domain_remove_dev_info(domain);
1449         /* destroy iovas */
1450         put_iova_domain(&domain->iovad);
1451
1452         /* clear ptes */
1453         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1454
1455         /* free page tables */
1456         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1457
1458         for_each_active_iommu(iommu, drhd)
1459                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1460                         iommu_detach_domain(domain, iommu);
1461
1462         free_domain_mem(domain);
1463 }
1464
1465 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1466                                  u8 bus, u8 devfn, int translation)
1467 {
1468         struct context_entry *context;
1469         unsigned long flags;
1470         struct intel_iommu *iommu;
1471         struct dma_pte *pgd;
1472         unsigned long num;
1473         unsigned long ndomains;
1474         int id;
1475         int agaw;
1476         struct device_domain_info *info = NULL;
1477
1478         pr_debug("Set context mapping for %02x:%02x.%d\n",
1479                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1480
1481         BUG_ON(!domain->pgd);
1482         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1483                translation != CONTEXT_TT_MULTI_LEVEL);
1484
1485         iommu = device_to_iommu(segment, bus, devfn);
1486         if (!iommu)
1487                 return -ENODEV;
1488
1489         context = device_to_context_entry(iommu, bus, devfn);
1490         if (!context)
1491                 return -ENOMEM;
1492         spin_lock_irqsave(&iommu->lock, flags);
1493         if (context_present(context)) {
1494                 spin_unlock_irqrestore(&iommu->lock, flags);
1495                 return 0;
1496         }
1497
1498         id = domain->id;
1499         pgd = domain->pgd;
1500
1501         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1502             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1503                 int found = 0;
1504
1505                 /* find an available domain id for this device in iommu */
1506                 ndomains = cap_ndoms(iommu->cap);
1507                 num = find_first_bit(iommu->domain_ids, ndomains);
1508                 for (; num < ndomains; ) {
1509                         if (iommu->domains[num] == domain) {
1510                                 id = num;
1511                                 found = 1;
1512                                 break;
1513                         }
1514                         num = find_next_bit(iommu->domain_ids,
1515                                             cap_ndoms(iommu->cap), num+1);
1516                 }
1517
1518                 if (found == 0) {
1519                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1520                         if (num >= ndomains) {
1521                                 spin_unlock_irqrestore(&iommu->lock, flags);
1522                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1523                                 return -EFAULT;
1524                         }
1525
1526                         set_bit(num, iommu->domain_ids);
1527                         iommu->domains[num] = domain;
1528                         id = num;
1529                 }
1530
1531                 /* Skip top levels of page tables for
1532                  * an iommu which has a smaller agaw than the default.
1533                  */
1534                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1535                         pgd = phys_to_virt(dma_pte_addr(pgd));
1536                         if (!dma_pte_present(pgd)) {
1537                                 spin_unlock_irqrestore(&iommu->lock, flags);
1538                                 return -ENOMEM;
1539                         }
1540                 }
1541         }
1542
1543         context_set_domain_id(context, id);
1544
1545         if (translation != CONTEXT_TT_PASS_THROUGH) {
1546                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1547                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1548                                      CONTEXT_TT_MULTI_LEVEL;
1549         }
1550         /*
1551          * In pass-through mode, AW must be programmed to indicate the largest
1552          * AGAW value supported by hardware, and ASR is ignored by hardware.
1553          */
1554         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1555                 context_set_address_width(context, iommu->msagaw);
1556         else {
1557                 context_set_address_root(context, virt_to_phys(pgd));
1558                 context_set_address_width(context, iommu->agaw);
1559         }
1560
1561         context_set_translation_type(context, translation);
1562         context_set_fault_enable(context);
1563         context_set_present(context);
1564         domain_flush_cache(domain, context, sizeof(*context));
1565
1566         /*
1567          * It's a non-present to present mapping. If hardware doesn't cache
1568          * non-present entries we only need to flush the write-buffer. If it
1569          * _does_ cache non-present entries, then it does so in the special
1570          * domain #0, which we have to flush:
1571          */
1572         if (cap_caching_mode(iommu->cap)) {
1573                 iommu->flush.flush_context(iommu, 0,
1574                                            (((u16)bus) << 8) | devfn,
1575                                            DMA_CCMD_MASK_NOBIT,
1576                                            DMA_CCMD_DEVICE_INVL);
1577                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH);
1578         } else {
1579                 iommu_flush_write_buffer(iommu);
1580         }
1581         iommu_enable_dev_iotlb(info);
1582         spin_unlock_irqrestore(&iommu->lock, flags);
1583
1584         spin_lock_irqsave(&domain->iommu_lock, flags);
1585         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1586                 domain->iommu_count++;
1587                 if (domain->iommu_count == 1)
1588                         domain->nid = iommu->node;
1589                 domain_update_iommu_cap(domain);
1590         }
1591         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1592         return 0;
1593 }
1594
1595 static int
1596 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1597                         int translation)
1598 {
1599         int ret;
1600         struct pci_dev *tmp, *parent;
1601
1602         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1603                                          pdev->bus->number, pdev->devfn,
1604                                          translation);
1605         if (ret)
1606                 return ret;
1607
1608         /* dependent device mapping */
1609         tmp = pci_find_upstream_pcie_bridge(pdev);
1610         if (!tmp)
1611                 return 0;
1612         /* Secondary interface's bus number and devfn 0 */
1613         parent = pdev->bus->self;
1614         while (parent != tmp) {
1615                 ret = domain_context_mapping_one(domain,
1616                                                  pci_domain_nr(parent->bus),
1617                                                  parent->bus->number,
1618                                                  parent->devfn, translation);
1619                 if (ret)
1620                         return ret;
1621                 parent = parent->bus->self;
1622         }
1623         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1624                 return domain_context_mapping_one(domain,
1625                                         pci_domain_nr(tmp->subordinate),
1626                                         tmp->subordinate->number, 0,
1627                                         translation);
1628         else /* this is a legacy PCI bridge */
1629                 return domain_context_mapping_one(domain,
1630                                                   pci_domain_nr(tmp->bus),
1631                                                   tmp->bus->number,
1632                                                   tmp->devfn,
1633                                                   translation);
1634 }
1635
1636 static int domain_context_mapped(struct pci_dev *pdev)
1637 {
1638         int ret;
1639         struct pci_dev *tmp, *parent;
1640         struct intel_iommu *iommu;
1641
1642         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1643                                 pdev->devfn);
1644         if (!iommu)
1645                 return -ENODEV;
1646
1647         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1648         if (!ret)
1649                 return ret;
1650         /* dependent device mapping */
1651         tmp = pci_find_upstream_pcie_bridge(pdev);
1652         if (!tmp)
1653                 return ret;
1654         /* Secondary interface's bus number and devfn 0 */
1655         parent = pdev->bus->self;
1656         while (parent != tmp) {
1657                 ret = device_context_mapped(iommu, parent->bus->number,
1658                                             parent->devfn);
1659                 if (!ret)
1660                         return ret;
1661                 parent = parent->bus->self;
1662         }
1663         if (tmp->is_pcie)
1664                 return device_context_mapped(iommu, tmp->subordinate->number,
1665                                              0);
1666         else
1667                 return device_context_mapped(iommu, tmp->bus->number,
1668                                              tmp->devfn);
1669 }
1670
1671 /* Returns a number of VTD pages, but aligned to MM page size */
1672 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1673                                             size_t size)
1674 {
1675         host_addr &= ~PAGE_MASK;
1676         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1677 }
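/*
 * A worked example of the rounding above (illustrative, assuming 4KiB MM
 * pages and 4KiB VT-d pages): host_addr 0x1234 with size 0x2000 leaves an
 * in-page offset of 0x234, PAGE_ALIGN(0x234 + 0x2000) = 0x3000, so the
 * helper returns 3 VT-d pages.
 */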
1678
1679 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1680                             struct scatterlist *sg, unsigned long phys_pfn,
1681                             unsigned long nr_pages, int prot)
1682 {
1683         struct dma_pte *first_pte = NULL, *pte = NULL;
1684         phys_addr_t uninitialized_var(pteval);
1685         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1686         unsigned long sg_res;
1687
1688         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1689
1690         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1691                 return -EINVAL;
1692
1693         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1694
1695         if (sg)
1696                 sg_res = 0;
1697         else {
1698                 sg_res = nr_pages + 1;
1699                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1700         }
1701
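         /*
          * sg_res counts the VT-d pages still to be mapped out of the current
          * scatterlist entry; 0 means "fetch the next entry".  For the
          * physically contiguous (phys_pfn) case it is primed to nr_pages + 1
          * above, so it never reaches zero inside the loop.
          */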
1702         while (nr_pages--) {
1703                 uint64_t tmp;
1704
1705                 if (!sg_res) {
1706                         sg_res = aligned_nrpages(sg->offset, sg->length);
1707                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1708                         sg->dma_length = sg->length;
1709                         pteval = page_to_phys(sg_page(sg)) | prot;
1710                 }
1711                 if (!pte) {
1712                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1713                         if (!pte)
1714                                 return -ENOMEM;
1715                 }
1716                 /* We don't need lock here, nobody else
1717                  * touches the iova range
1718                  */
1719                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1720                 if (tmp) {
1721                         static int dumps = 5;
1722                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1723                                iov_pfn, tmp, (unsigned long long)pteval);
1724                         if (dumps) {
1725                                 dumps--;
1726                                 debug_dma_dump_mappings(NULL);
1727                         }
1728                         WARN_ON(1);
1729                 }
1730                 pte++;
1731                 if (!nr_pages || first_pte_in_page(pte)) {
1732                         domain_flush_cache(domain, first_pte,
1733                                            (void *)pte - (void *)first_pte);
1734                         pte = NULL;
1735                 }
1736                 iov_pfn++;
1737                 pteval += VTD_PAGE_SIZE;
1738                 sg_res--;
1739                 if (!sg_res)
1740                         sg = sg_next(sg);
1741         }
1742         return 0;
1743 }
1744
1745 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1746                                     struct scatterlist *sg, unsigned long nr_pages,
1747                                     int prot)
1748 {
1749         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1750 }
1751
1752 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1753                                      unsigned long phys_pfn, unsigned long nr_pages,
1754                                      int prot)
1755 {
1756         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1757 }
1758
1759 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1760 {
1761         if (!iommu)
1762                 return;
1763
1764         clear_context_table(iommu, bus, devfn);
1765         iommu->flush.flush_context(iommu, 0, 0, 0,
1766                                            DMA_CCMD_GLOBAL_INVL);
1767         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1768 }
1769
1770 static void domain_remove_dev_info(struct dmar_domain *domain)
1771 {
1772         struct device_domain_info *info;
1773         unsigned long flags;
1774         struct intel_iommu *iommu;
1775
1776         spin_lock_irqsave(&device_domain_lock, flags);
1777         while (!list_empty(&domain->devices)) {
1778                 info = list_entry(domain->devices.next,
1779                         struct device_domain_info, link);
1780                 list_del(&info->link);
1781                 list_del(&info->global);
1782                 if (info->dev)
1783                         info->dev->dev.archdata.iommu = NULL;
1784                 spin_unlock_irqrestore(&device_domain_lock, flags);
1785
1786                 iommu_disable_dev_iotlb(info);
1787                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1788                 iommu_detach_dev(iommu, info->bus, info->devfn);
1789                 free_devinfo_mem(info);
1790
1791                 spin_lock_irqsave(&device_domain_lock, flags);
1792         }
1793         spin_unlock_irqrestore(&device_domain_lock, flags);
1794 }
1795
1796 /*
1797  * find_domain
1798  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1799  */
1800 static struct dmar_domain *
1801 find_domain(struct pci_dev *pdev)
1802 {
1803         struct device_domain_info *info;
1804
1805         /* No lock here, assumes no domain exit in normal case */
1806         info = pdev->dev.archdata.iommu;
1807         if (info)
1808                 return info->domain;
1809         return NULL;
1810 }
1811
1812 /* domain is initialized */
1813 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1814 {
1815         struct dmar_domain *domain, *found = NULL;
1816         struct intel_iommu *iommu;
1817         struct dmar_drhd_unit *drhd;
1818         struct device_domain_info *info, *tmp;
1819         struct pci_dev *dev_tmp;
1820         unsigned long flags;
1821         int bus = 0, devfn = 0;
1822         int segment;
1823         int ret;
1824
1825         domain = find_domain(pdev);
1826         if (domain)
1827                 return domain;
1828
1829         segment = pci_domain_nr(pdev->bus);
1830
1831         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1832         if (dev_tmp) {
1833                 if (dev_tmp->is_pcie) {
1834                         bus = dev_tmp->subordinate->number;
1835                         devfn = 0;
1836                 } else {
1837                         bus = dev_tmp->bus->number;
1838                         devfn = dev_tmp->devfn;
1839                 }
1840                 spin_lock_irqsave(&device_domain_lock, flags);
1841                 list_for_each_entry(info, &device_domain_list, global) {
1842                         if (info->segment == segment &&
1843                             info->bus == bus && info->devfn == devfn) {
1844                                 found = info->domain;
1845                                 break;
1846                         }
1847                 }
1848                 spin_unlock_irqrestore(&device_domain_lock, flags);
1849                 /* the pcie-pci bridge already has a domain, use it */
1850                 if (found) {
1851                         domain = found;
1852                         goto found_domain;
1853                 }
1854         }
1855
1856         domain = alloc_domain();
1857         if (!domain)
1858                 goto error;
1859
1860         /* Allocate new domain for the device */
1861         drhd = dmar_find_matched_drhd_unit(pdev);
1862         if (!drhd) {
1863                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1864                         pci_name(pdev));
1865                 return NULL;
1866         }
1867         iommu = drhd->iommu;
1868
1869         ret = iommu_attach_domain(domain, iommu);
1870         if (ret) {
1871                 domain_exit(domain);
1872                 goto error;
1873         }
1874
1875         if (domain_init(domain, gaw)) {
1876                 domain_exit(domain);
1877                 goto error;
1878         }
1879
1880         /* register pcie-to-pci device */
1881         if (dev_tmp) {
1882                 info = alloc_devinfo_mem();
1883                 if (!info) {
1884                         domain_exit(domain);
1885                         goto error;
1886                 }
1887                 info->segment = segment;
1888                 info->bus = bus;
1889                 info->devfn = devfn;
1890                 info->dev = NULL;
1891                 info->domain = domain;
1892                 /* This domain is shared by devices under p2p bridge */
1893                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1894
1895                 /* the pcie-to-pci bridge already has a domain, use it */
1896                 found = NULL;
1897                 spin_lock_irqsave(&device_domain_lock, flags);
1898                 list_for_each_entry(tmp, &device_domain_list, global) {
1899                         if (tmp->segment == segment &&
1900                             tmp->bus == bus && tmp->devfn == devfn) {
1901                                 found = tmp->domain;
1902                                 break;
1903                         }
1904                 }
1905                 if (found) {
1906                         free_devinfo_mem(info);
1907                         domain_exit(domain);
1908                         domain = found;
1909                 } else {
1910                         list_add(&info->link, &domain->devices);
1911                         list_add(&info->global, &device_domain_list);
1912                 }
1913                 spin_unlock_irqrestore(&device_domain_lock, flags);
1914         }
1915
1916 found_domain:
1917         info = alloc_devinfo_mem();
1918         if (!info)
1919                 goto error;
1920         info->segment = segment;
1921         info->bus = pdev->bus->number;
1922         info->devfn = pdev->devfn;
1923         info->dev = pdev;
1924         info->domain = domain;
1925         spin_lock_irqsave(&device_domain_lock, flags);
1926         /* somebody else was faster and already set it up */
1927         found = find_domain(pdev);
1928         if (found != NULL) {
1929                 spin_unlock_irqrestore(&device_domain_lock, flags);
1930                 if (found != domain) {
1931                         domain_exit(domain);
1932                         domain = found;
1933                 }
1934                 free_devinfo_mem(info);
1935                 return domain;
1936         }
1937         list_add(&info->link, &domain->devices);
1938         list_add(&info->global, &device_domain_list);
1939         pdev->dev.archdata.iommu = info;
1940         spin_unlock_irqrestore(&device_domain_lock, flags);
1941         return domain;
1942 error:
1943         /* recheck it here; somebody else may have set it in the meantime */
1944         return find_domain(pdev);
1945 }
1946
1947 static int iommu_identity_mapping;
1948 #define IDENTMAP_ALL            1
1949 #define IDENTMAP_GFX            2
1950 #define IDENTMAP_AZALIA         4
1951
1952 static int iommu_domain_identity_map(struct dmar_domain *domain,
1953                                      unsigned long long start,
1954                                      unsigned long long end)
1955 {
1956         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1957         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1958
1959         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1960                           dma_to_mm_pfn(last_vpfn))) {
1961                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1962                 return -ENOMEM;
1963         }
1964
1965         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1966                  start, end, domain->id);
1967         /*
1968          * RMRR range might have overlap with physical memory range,
1969          * clear it first
1970          */
1971         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1972
1973         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1974                                   last_vpfn - first_vpfn + 1,
1975                                   DMA_PTE_READ|DMA_PTE_WRITE);
1976 }
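/*
 * Note that this is a true 1:1 map: the IOVA page frame equals the physical
 * page frame, so (to pick a hypothetical RMRR at 0xe8000000) DMA issued to
 * bus address 0xe8000000 lands on physical address 0xe8000000.
 */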
1977
1978 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1979                                       unsigned long long start,
1980                                       unsigned long long end)
1981 {
1982         struct dmar_domain *domain;
1983         int ret;
1984
1985         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1986         if (!domain)
1987                 return -ENOMEM;
1988
1989         /* For _hardware_ passthrough, don't bother. But for software
1990            passthrough, we do it anyway -- it may indicate a memory
1991            range which is reserved in E820 and so didn't get set up
1992            in si_domain to start with */
1993         if (domain == si_domain && hw_pass_through) {
1994                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1995                        pci_name(pdev), start, end);
1996                 return 0;
1997         }
1998
1999         printk(KERN_INFO
2000                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2001                pci_name(pdev), start, end);
2002
2003         if (end >> agaw_to_width(domain->agaw)) {
2004                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2005                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2006                      agaw_to_width(domain->agaw),
2007                      dmi_get_system_info(DMI_BIOS_VENDOR),
2008                      dmi_get_system_info(DMI_BIOS_VERSION),
2009                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2010                 ret = -EIO;
2011                 goto error;
2012         }
2013
2014         ret = iommu_domain_identity_map(domain, start, end);
2015         if (ret)
2016                 goto error;
2017
2018         /* context entry init */
2019         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2020         if (ret)
2021                 goto error;
2022
2023         return 0;
2024
2025  error:
2026         domain_exit(domain);
2027         return ret;
2028 }
2029
2030 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2031         struct pci_dev *pdev)
2032 {
2033         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2034                 return 0;
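         /* rmrr->end_address is the last byte of the region (inclusive), so
            pass end_address + 1 as the exclusive end of the identity map. */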
2035         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2036                 rmrr->end_address + 1);
2037 }
2038
2039 #ifdef CONFIG_DMAR_FLOPPY_WA
2040 static inline void iommu_prepare_isa(void)
2041 {
2042         struct pci_dev *pdev;
2043         int ret;
2044
2045         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2046         if (!pdev)
2047                 return;
2048
2049         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2050         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2051
2052         if (ret)
2053                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2054                        "floppy might not work\n");
2055
2056 }
2057 #else
2058 static inline void iommu_prepare_isa(void)
2059 {
2060         return;
2061 }
2062 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2063
2064 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2065
2066 static int __init si_domain_work_fn(unsigned long start_pfn,
2067                                     unsigned long end_pfn, void *datax)
2068 {
2069         int *ret = datax;
2070
2071         *ret = iommu_domain_identity_map(si_domain,
2072                                          (uint64_t)start_pfn << PAGE_SHIFT,
2073                                          (uint64_t)end_pfn << PAGE_SHIFT);
2074         return *ret;
2076 }
2077
2078 static int __init si_domain_init(int hw)
2079 {
2080         struct dmar_drhd_unit *drhd;
2081         struct intel_iommu *iommu;
2082         int nid, ret = 0;
2083
2084         si_domain = alloc_domain();
2085         if (!si_domain)
2086                 return -EFAULT;
2087
2088         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2089
2090         for_each_active_iommu(iommu, drhd) {
2091                 ret = iommu_attach_domain(si_domain, iommu);
2092                 if (ret) {
2093                         domain_exit(si_domain);
2094                         return -EFAULT;
2095                 }
2096         }
2097
2098         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2099                 domain_exit(si_domain);
2100                 return -EFAULT;
2101         }
2102
2103         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2104
2105         if (hw)
2106                 return 0;
2107
2108         for_each_online_node(nid) {
2109                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2110                 if (ret)
2111                         return ret;
2112         }
2113
2114         return 0;
2115 }
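/*
 * In short: si_domain is the single, statically allocated identity domain.
 * It is attached to every active IOMMU and, unless hardware pass-through is
 * in use, si_domain_work_fn() populates it with a 1:1 mapping of every
 * online node's memory.
 */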
2116
2117 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2118                                           struct pci_dev *pdev);
2119 static int identity_mapping(struct pci_dev *pdev)
2120 {
2121         struct device_domain_info *info;
2122
2123         if (likely(!iommu_identity_mapping))
2124                 return 0;
2125
2127         list_for_each_entry(info, &si_domain->devices, link)
2128                 if (info->dev == pdev)
2129                         return 1;
2130         return 0;
2131 }
2132
2133 static int domain_add_dev_info(struct dmar_domain *domain,
2134                                struct pci_dev *pdev,
2135                                int translation)
2136 {
2137         struct device_domain_info *info;
2138         unsigned long flags;
2139         int ret;
2140
2141         info = alloc_devinfo_mem();
2142         if (!info)
2143                 return -ENOMEM;
2144
2145         ret = domain_context_mapping(domain, pdev, translation);
2146         if (ret) {
2147                 free_devinfo_mem(info);
2148                 return ret;
2149         }
2150
2151         info->segment = pci_domain_nr(pdev->bus);
2152         info->bus = pdev->bus->number;
2153         info->devfn = pdev->devfn;
2154         info->dev = pdev;
2155         info->domain = domain;
2156
2157         spin_lock_irqsave(&device_domain_lock, flags);
2158         list_add(&info->link, &domain->devices);
2159         list_add(&info->global, &device_domain_list);
2160         pdev->dev.archdata.iommu = info;
2161         spin_unlock_irqrestore(&device_domain_lock, flags);
2162
2163         return 0;
2164 }
2165
2166 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2167 {
2168         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2169                 return 1;
2170
2171         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2172                 return 1;
2173
2174         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2175                 return 0;
2176
2177         /*
2178          * We want to start off with all devices in the 1:1 domain, and
2179          * take them out later if we find they can't access all of memory.
2180          *
2181          * However, we can't do this for PCI devices behind bridges,
2182          * because all PCI devices behind the same bridge will end up
2183          * with the same source-id on their transactions.
2184          *
2185          * Practically speaking, we can't change things around for these
2186          * devices at run-time, because we can't be sure there'll be no
2187          * DMA transactions in flight for any of their siblings.
2188          * 
2189          * So PCI devices (unless they're on the root bus) as well as
2190          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2191          * the 1:1 domain, just in _case_ one of their siblings turns out
2192          * not to be able to map all of memory.
2193          */
2194         if (!pdev->is_pcie) {
2195                 if (!pci_is_root_bus(pdev->bus))
2196                         return 0;
2197                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2198                         return 0;
2199         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2200                 return 0;
2201
2202         /* 
2203          * At boot time, we don't yet know if devices will be 64-bit capable.
2204          * Assume that they will -- if they turn out not to be, then we can 
2205          * take them out of the 1:1 domain later.
2206          */
2207         if (!startup)
2208                 return pdev->dma_mask > DMA_BIT_MASK(32);
2209
2210         return 1;
2211 }
2212
2213 static int __init iommu_prepare_static_identity_mapping(int hw)
2214 {
2215         struct pci_dev *pdev = NULL;
2216         int ret;
2217
2218         ret = si_domain_init(hw);
2219         if (ret)
2220                 return -EFAULT;
2221
2222         for_each_pci_dev(pdev) {
2223                 if (iommu_should_identity_map(pdev, 1)) {
2224                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2225                                hw ? "hardware" : "software", pci_name(pdev));
2226
2227                         ret = domain_add_dev_info(si_domain, pdev,
2228                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2229                                                      CONTEXT_TT_MULTI_LEVEL);
2230                         if (ret)
2231                                 return ret;
2232                 }
2233         }
2234
2235         return 0;
2236 }
2237
2238 int __init init_dmars(void)
2239 {
2240         struct dmar_drhd_unit *drhd;
2241         struct dmar_rmrr_unit *rmrr;
2242         struct pci_dev *pdev;
2243         struct intel_iommu *iommu;
2244         int i, ret;
2245
2246         /*
2247          * for each drhd
2248          *    allocate root
2249          *    initialize and program root entry to not present
2250          * endfor
2251          */
2252         for_each_drhd_unit(drhd) {
2253                 g_num_of_iommus++;
2254                 /*
2255                  * No lock needed: this is only incremented in the
2256                  * single-threaded kernel __init code path; all other
2257                  * accesses are read-only.
2258                  */
2259         }
2260
2261         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2262                         GFP_KERNEL);
2263         if (!g_iommus) {
2264                 printk(KERN_ERR "Allocating global iommu array failed\n");
2265                 ret = -ENOMEM;
2266                 goto error;
2267         }
2268
2269         deferred_flush = kzalloc(g_num_of_iommus *
2270                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2271         if (!deferred_flush) {
2272                 ret = -ENOMEM;
2273                 goto error;
2274         }
2275
2276         for_each_drhd_unit(drhd) {
2277                 if (drhd->ignored)
2278                         continue;
2279
2280                 iommu = drhd->iommu;
2281                 g_iommus[iommu->seq_id] = iommu;
2282
2283                 ret = iommu_init_domains(iommu);
2284                 if (ret)
2285                         goto error;
2286
2287                 /*
2288                  * TBD:
2289                  * we could share the same root & context tables
2290                  * among all IOMMUs. Need to split it later.
2291                  */
2292                 ret = iommu_alloc_root_entry(iommu);
2293                 if (ret) {
2294                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2295                         goto error;
2296                 }
2297                 if (!ecap_pass_through(iommu->ecap))
2298                         hw_pass_through = 0;
2299         }
2300
2301         /*
2302          * Start from a sane iommu hardware state.
2303          */
2304         for_each_drhd_unit(drhd) {
2305                 if (drhd->ignored)
2306                         continue;
2307
2308                 iommu = drhd->iommu;
2309
2310                 /*
2311                  * If the queued invalidation was already initialized by us
2312                  * (for example, while enabling interrupt-remapping), then
2313                  * things are already rolling from a sane state.
2314                  */
2315                 if (iommu->qi)
2316                         continue;
2317
2318                 /*
2319                  * Clear any previous faults.
2320                  */
2321                 dmar_fault(-1, iommu);
2322                 /*
2323                  * Disable queued invalidation if supported and already enabled
2324                  * before OS handover.
2325                  */
2326                 dmar_disable_qi(iommu);
2327         }
2328
2329         for_each_drhd_unit(drhd) {
2330                 if (drhd->ignored)
2331                         continue;
2332
2333                 iommu = drhd->iommu;
2334
2335                 if (dmar_enable_qi(iommu)) {
2336                         /*
2337                          * Queued Invalidate not enabled, use Register Based
2338                          * Invalidate
2339                          */
2340                         iommu->flush.flush_context = __iommu_flush_context;
2341                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2342                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2343                                "invalidation\n",
2344                                (unsigned long long)drhd->reg_base_addr);
2345                 } else {
2346                         iommu->flush.flush_context = qi_flush_context;
2347                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2348                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2349                                "invalidation\n",
2350                                (unsigned long long)drhd->reg_base_addr);
2351                 }
2352         }
2353
2354         if (iommu_pass_through)
2355                 iommu_identity_mapping |= IDENTMAP_ALL;
2356
2357 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2358         iommu_identity_mapping |= IDENTMAP_GFX;
2359 #endif
2360
2361         check_tylersburg_isoch();
2362
2363         /*
2364          * If pass-through is not set or not enabled, set up context entries
2365          * for identity mappings for rmrr, gfx and isa, and possibly fall back
2366          * to static identity mapping if iommu_identity_mapping is set.
2367          */
2368         if (iommu_identity_mapping) {
2369                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2370                 if (ret) {
2371                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2372                         goto error;
2373                 }
2374         }
2375         /*
2376          * For each rmrr
2377          *   for each dev attached to rmrr
2378          *   do
2379          *     locate drhd for dev, alloc domain for dev
2380          *     allocate free domain
2381          *     allocate page table entries for rmrr
2382          *     if context not allocated for bus
2383          *           allocate and init context
2384          *           set present in root table for this bus
2385          *     init context with domain, translation etc
2386          *    endfor
2387          * endfor
2388          */
2389         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2390         for_each_rmrr_units(rmrr) {
2391                 for (i = 0; i < rmrr->devices_cnt; i++) {
2392                         pdev = rmrr->devices[i];
2393                         /*
2394                          * Some BIOSes list non-existent devices in the
2395                          * DMAR table.
2396                          */
2397                         if (!pdev)
2398                                 continue;
2399                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2400                         if (ret)
2401                                 printk(KERN_ERR
2402                                        "IOMMU: mapping reserved region failed\n");
2403                 }
2404         }
2405
2406         iommu_prepare_isa();
2407
2408         /*
2409          * for each drhd
2410          *   enable fault log
2411          *   global invalidate context cache
2412          *   global invalidate iotlb
2413          *   enable translation
2414          */
2415         for_each_drhd_unit(drhd) {
2416                 if (drhd->ignored)
2417                         continue;
2418                 iommu = drhd->iommu;
2419
2420                 iommu_flush_write_buffer(iommu);
2421
2422                 ret = dmar_set_interrupt(iommu);
2423                 if (ret)
2424                         goto error;
2425
2426                 iommu_set_root_entry(iommu);
2427
2428                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2429                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2430
2431                 ret = iommu_enable_translation(iommu);
2432                 if (ret)
2433                         goto error;
2434
2435                 iommu_disable_protect_mem_regions(iommu);
2436         }
2437
2438         return 0;
2439 error:
2440         for_each_drhd_unit(drhd) {
2441                 if (drhd->ignored)
2442                         continue;
2443                 iommu = drhd->iommu;
2444                 free_iommu(iommu);
2445         }
2446         kfree(g_iommus);
2447         return ret;
2448 }
2449
2450 /* This takes a number of _MM_ pages, not VTD pages */
2451 static struct iova *intel_alloc_iova(struct device *dev,
2452                                      struct dmar_domain *domain,
2453                                      unsigned long nrpages, uint64_t dma_mask)
2454 {
2455         struct pci_dev *pdev = to_pci_dev(dev);
2456         struct iova *iova = NULL;
2457
2458         /* Restrict dma_mask to the width that the iommu can handle */
2459         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2460
2461         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2462                 /*
2463                  * First try to allocate an io virtual address in
2464                  * DMA_BIT_MASK(32) and if that fails then try allocating
2465                  * from higher range
2466                  */
2467                 iova = alloc_iova(&domain->iovad, nrpages,
2468                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2469                 if (iova)
2470                         return iova;
2471         }
2472         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2473         if (unlikely(!iova)) {
2474                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2475                        nrpages, pci_name(pdev));
2476                 return NULL;
2477         }
2478
2479         return iova;
2480 }
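/*
 * Illustrative example (hypothetical device): with a 64-bit dma_mask on a
 * domain whose gaw is 48, the mask is first clamped to DOMAIN_MAX_ADDR(48);
 * unless forcedac is set, the allocator then tries to place the nrpages-long
 * range below 4GiB and only falls back to the full mask if that fails.
 */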
2481
2482 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2483 {
2484         struct dmar_domain *domain;
2485         int ret;
2486
2487         domain = get_domain_for_dev(pdev,
2488                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2489         if (!domain) {
2490                 printk(KERN_ERR
2491                         "Allocating domain for %s failed\n", pci_name(pdev));
2492                 return NULL;
2493         }
2494
2495         /* make sure context mapping is ok */
2496         if (unlikely(!domain_context_mapped(pdev))) {
2497                 ret = domain_context_mapping(domain, pdev,
2498                                              CONTEXT_TT_MULTI_LEVEL);
2499                 if (ret) {
2500                         printk(KERN_ERR
2501                                 "Domain context map for %s failed\n",
2502                                 pci_name(pdev));
2503                         return NULL;
2504                 }
2505         }
2506
2507         return domain;
2508 }
2509
2510 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2511 {
2512         struct device_domain_info *info;
2513
2514         /* No lock here, assumes no domain exit in normal case */
2515         info = dev->dev.archdata.iommu;
2516         if (likely(info))
2517                 return info->domain;
2518
2519         return __get_valid_domain_for_dev(dev);
2520 }
2521
2522 static int iommu_dummy(struct pci_dev *pdev)
2523 {
2524         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2525 }
2526
2527 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2528 static int iommu_no_mapping(struct device *dev)
2529 {
2530         struct pci_dev *pdev;
2531         int found;
2532
2533         if (unlikely(dev->bus != &pci_bus_type))
2534                 return 1;
2535
2536         pdev = to_pci_dev(dev);
2537         if (iommu_dummy(pdev))
2538                 return 1;
2539
2540         if (!iommu_identity_mapping)
2541                 return 0;
2542
2543         found = identity_mapping(pdev);
2544         if (found) {
2545                 if (iommu_should_identity_map(pdev, 0))
2546                         return 1;
2547                 else {
2548                         /*
2549                          * The 32 bit DMA device is removed from si_domain
2550                          * and falls back to non-identity mapping.
2551                          */
2552                         domain_remove_one_dev_info(si_domain, pdev);
2553                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2554                                pci_name(pdev));
2555                         return 0;
2556                 }
2557         } else {
2558                 /*
2559                  * In case a 64 bit DMA device was detached from a VM, the
2560                  * device is put back into si_domain for identity mapping.
2561                  */
2562                 if (iommu_should_identity_map(pdev, 0)) {
2563                         int ret;
2564                         ret = domain_add_dev_info(si_domain, pdev,
2565                                                   hw_pass_through ?
2566                                                   CONTEXT_TT_PASS_THROUGH :
2567                                                   CONTEXT_TT_MULTI_LEVEL);
2568                         if (!ret) {
2569                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2570                                        pci_name(pdev));
2571                                 return 1;
2572                         }
2573                 }
2574         }
2575
2576         return 0;
2577 }
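/*
 * To summarise the return value: 1 means the device bypasses DMA remapping
 * (non-PCI, dummy, or identity mapped) and callers hand back the physical
 * address unchanged; 0 means the normal map/unmap path must be used.
 */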
2578
2579 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2580                                      size_t size, int dir, u64 dma_mask)
2581 {
2582         struct pci_dev *pdev = to_pci_dev(hwdev);
2583         struct dmar_domain *domain;
2584         phys_addr_t start_paddr;
2585         struct iova *iova;
2586         int prot = 0;
2587         int ret;
2588         struct intel_iommu *iommu;
2589         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2590
2591         BUG_ON(dir == DMA_NONE);
2592
2593         if (iommu_no_mapping(hwdev))
2594                 return paddr;
2595
2596         domain = get_valid_domain_for_dev(pdev);
2597         if (!domain)
2598                 return 0;
2599
2600         iommu = domain_get_iommu(domain);
2601         size = aligned_nrpages(paddr, size);
2602
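         /*
          * 'size' is now a count of VT-d pages, while intel_alloc_iova() works
          * in MM pages -- hence the dma_to_mm_pfn() conversion below.
          */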
2603         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2604                                 pdev->dma_mask);
2605         if (!iova)
2606                 goto error;
2607
2608         /*
2609          * Check if DMAR supports zero-length reads on write-only
2610          * mappings.
2611          */
2612         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2613                         !cap_zlr(iommu->cap))
2614                 prot |= DMA_PTE_READ;
2615         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2616                 prot |= DMA_PTE_WRITE;
2617         /*
2618          * paddr ~ paddr + size might cover only part of a page; we should
2619          * map the whole page.  Note: if two parts of one page are mapped
2620          * separately, we might end up with two guest addresses mapping to
2621          * the same host paddr, but this is not a big problem.
2622          */
2623         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2624                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2625         if (ret)
2626                 goto error;
2627
2628         /* it's a non-present to present mapping. Only flush if caching mode */
2629         if (cap_caching_mode(iommu->cap))
2630                 iommu_flush_iotlb_psi(iommu, 0, mm_to_dma_pfn(iova->pfn_lo), size);
2631         else
2632                 iommu_flush_write_buffer(iommu);
2633
2634         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2635         start_paddr += paddr & ~PAGE_MASK;
2636         return start_paddr;
2637
2638 error:
2639         if (iova)
2640                 __free_iova(&domain->iovad, iova);
2641         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2642                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2643         return 0;
2644 }
2645
2646 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2647                                  unsigned long offset, size_t size,
2648                                  enum dma_data_direction dir,
2649                                  struct dma_attrs *attrs)
2650 {
2651         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2652                                   dir, to_pci_dev(dev)->dma_mask);
2653 }
2654
2655 static void flush_unmaps(void)
2656 {
2657         int i, j;
2658
2659         timer_on = 0;
2660
2661         /* just flush them all */
2662         for (i = 0; i < g_num_of_iommus; i++) {
2663                 struct intel_iommu *iommu = g_iommus[i];
2664                 if (!iommu)
2665                         continue;
2666
2667                 if (!deferred_flush[i].next)
2668                         continue;
2669
2670                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2671                                          DMA_TLB_GLOBAL_FLUSH);
2672                 for (j = 0; j < deferred_flush[i].next; j++) {
2673                         unsigned long mask;
2674                         struct iova *iova = deferred_flush[i].iova[j];
2675
2676                         mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2677                         iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2678                                         (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2679                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2680                 }
2681                 deferred_flush[i].next = 0;
2682         }
2683
2684         list_size = 0;
2685 }
2686
2687 static void flush_unmaps_timeout(unsigned long data)
2688 {
2689         unsigned long flags;
2690
2691         spin_lock_irqsave(&async_umap_flush_lock, flags);
2692         flush_unmaps();
2693         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2694 }
2695
2696 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2697 {
2698         unsigned long flags;
2699         int next, iommu_id;
2700         struct intel_iommu *iommu;
2701
2702         spin_lock_irqsave(&async_umap_flush_lock, flags);
2703         if (list_size == HIGH_WATER_MARK)
2704                 flush_unmaps();
2705
2706         iommu = domain_get_iommu(dom);
2707         iommu_id = iommu->seq_id;
2708
2709         next = deferred_flush[iommu_id].next;
2710         deferred_flush[iommu_id].domain[next] = dom;
2711         deferred_flush[iommu_id].iova[next] = iova;
2712         deferred_flush[iommu_id].next++;
2713
2714         if (!timer_on) {
2715                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2716                 timer_on = 1;
2717         }
2718         list_size++;
2719         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2720 }
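/*
 * The batching scheme in brief: unmapped IOVAs are queued per IOMMU in
 * deferred_flush[]; either the 10ms timer armed above or hitting
 * HIGH_WATER_MARK triggers flush_unmaps(), which issues one global IOTLB
 * flush per IOMMU and only then returns the queued IOVAs for reuse.
 */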
2721
2722 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2723                              size_t size, enum dma_data_direction dir,
2724                              struct dma_attrs *attrs)
2725 {
2726         struct pci_dev *pdev = to_pci_dev(dev);
2727         struct dmar_domain *domain;
2728         unsigned long start_pfn, last_pfn;
2729         struct iova *iova;
2730         struct intel_iommu *iommu;
2731
2732         if (iommu_no_mapping(dev))
2733                 return;
2734
2735         domain = find_domain(pdev);
2736         BUG_ON(!domain);
2737
2738         iommu = domain_get_iommu(domain);
2739
2740         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2741         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2742                       (unsigned long long)dev_addr))
2743                 return;
2744
2745         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2746         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2747
2748         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2749                  pci_name(pdev), start_pfn, last_pfn);
2750
2751         /*  clear the whole page */
2752         dma_pte_clear_range(domain, start_pfn, last_pfn);
2753
2754         /* free page tables */
2755         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2756
2757         if (intel_iommu_strict) {
2758                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2759                                       last_pfn - start_pfn + 1);
2760                 /* free iova */
2761                 __free_iova(&domain->iovad, iova);
2762         } else {
2763                 add_unmap(domain, iova);
2764                 /*
2765                  * queue up the release of the unmap to save roughly the 1/6th
2766                  * of the cpu time used up by the iotlb flush operation...
2767                  */
2768         }
2769 }
2770
2771 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2772                                   dma_addr_t *dma_handle, gfp_t flags)
2773 {
2774         void *vaddr;
2775         int order;
2776
2777         size = PAGE_ALIGN(size);
2778         order = get_order(size);
2779         flags &= ~(GFP_DMA | GFP_DMA32);
2780
2781         vaddr = (void *)__get_free_pages(flags, order);
2782         if (!vaddr)
2783                 return NULL;
2784         memset(vaddr, 0, size);
2785
2786         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2787                                          DMA_BIDIRECTIONAL,
2788                                          hwdev->coherent_dma_mask);
2789         if (*dma_handle)
2790                 return vaddr;
2791         free_pages((unsigned long)vaddr, order);
2792         return NULL;
2793 }
2794
2795 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2796                                 dma_addr_t dma_handle)
2797 {
2798         int order;
2799
2800         size = PAGE_ALIGN(size);
2801         order = get_order(size);
2802
2803         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2804         free_pages((unsigned long)vaddr, order);
2805 }
2806
2807 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2808                            int nelems, enum dma_data_direction dir,
2809                            struct dma_attrs *attrs)
2810 {
2811         struct pci_dev *pdev = to_pci_dev(hwdev);
2812         struct dmar_domain *domain;
2813         unsigned long start_pfn, last_pfn;
2814         struct iova *iova;
2815         struct intel_iommu *iommu;
2816
2817         if (iommu_no_mapping(hwdev))
2818                 return;
2819
2820         domain = find_domain(pdev);
2821         BUG_ON(!domain);
2822
2823         iommu = domain_get_iommu(domain);
2824
2825         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2826         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2827                       (unsigned long long)sglist[0].dma_address))
2828                 return;
2829
2830         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2831         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2832
2833         /*  clear the whole page */
2834         dma_pte_clear_range(domain, start_pfn, last_pfn);
2835
2836         /* free page tables */
2837         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2838
2839         if (intel_iommu_strict) {
2840                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2841                                       last_pfn - start_pfn + 1);
2842                 /* free iova */
2843                 __free_iova(&domain->iovad, iova);
2844         } else {
2845                 add_unmap(domain, iova);
2846                 /*
2847                  * queue up the release of the unmap to save roughly the 1/6th
2848                  * of the cpu time used up by the iotlb flush operation...
2849                  */
2850         }
2851 }
2852
2853 static int intel_nontranslate_map_sg(struct device *hwdev,
2854         struct scatterlist *sglist, int nelems, int dir)
2855 {
2856         int i;
2857         struct scatterlist *sg;
2858
2859         for_each_sg(sglist, sg, nelems, i) {
2860                 BUG_ON(!sg_page(sg));
2861                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2862                 sg->dma_length = sg->length;
2863         }
2864         return nelems;
2865 }
2866
2867 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2868                         enum dma_data_direction dir, struct dma_attrs *attrs)
2869 {
2870         int i;
2871         struct pci_dev *pdev = to_pci_dev(hwdev);
2872         struct dmar_domain *domain;
2873         size_t size = 0;
2874         int prot = 0;
2875         size_t offset_pfn = 0;
2876         struct iova *iova = NULL;
2877         int ret;
2878         struct scatterlist *sg;
2879         unsigned long start_vpfn;
2880         struct intel_iommu *iommu;
2881
2882         BUG_ON(dir == DMA_NONE);
2883         if (iommu_no_mapping(hwdev))
2884                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2885
2886         domain = get_valid_domain_for_dev(pdev);
2887         if (!domain)
2888                 return 0;
2889
2890         iommu = domain_get_iommu(domain);
2891
2892         for_each_sg(sglist, sg, nelems, i)
2893                 size += aligned_nrpages(sg->offset, sg->length);
2894
2895         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2896                                 pdev->dma_mask);
2897         if (!iova) {
2898                 sglist->dma_length = 0;
2899                 return 0;
2900         }
2901
2902         /*
2903          * Check if DMAR supports zero-length reads on write-only
2904          * mappings.
2905          */
2906         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2907                         !cap_zlr(iommu->cap))
2908                 prot |= DMA_PTE_READ;
2909         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2910                 prot |= DMA_PTE_WRITE;
2911
2912         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2913
2914         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2915         if (unlikely(ret)) {
2916                 /*  clear the page */
2917                 dma_pte_clear_range(domain, start_vpfn,
2918                                     start_vpfn + size - 1);
2919                 /* free page tables */
2920                 dma_pte_free_pagetable(domain, start_vpfn,
2921                                        start_vpfn + size - 1);
2922                 /* free iova */
2923                 __free_iova(&domain->iovad, iova);
2924                 return 0;
2925         }
2926
2927         /* it's a non-present to present mapping. Only flush if caching mode */
2928         if (cap_caching_mode(iommu->cap))
2929                 iommu_flush_iotlb_psi(iommu, 0, start_vpfn, offset_pfn);
2930         else
2931                 iommu_flush_write_buffer(iommu);
2932
2933         return nelems;
2934 }
2935
2936 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2937 {
2938         return !dma_addr;
2939 }
2940
2941 struct dma_map_ops intel_dma_ops = {
2942         .alloc_coherent = intel_alloc_coherent,
2943         .free_coherent = intel_free_coherent,
2944         .map_sg = intel_map_sg,
2945         .unmap_sg = intel_unmap_sg,
2946         .map_page = intel_map_page,
2947         .unmap_page = intel_unmap_page,
2948         .mapping_error = intel_mapping_error,
2949 };
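/*
 * Note that there is no .map_single callback: with the dma_map_ops
 * conversion, dma_map_single() is implemented on top of .map_page, so a
 * driver call such as (sketch)
 *
 *	dma_addr_t handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
 *
 * ends up in intel_map_page() above for a device behind a VT-d unit.
 */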
2950
2951 static inline int iommu_domain_cache_init(void)
2952 {
2953         int ret = 0;
2954
2955         iommu_domain_cache = kmem_cache_create("iommu_domain",
2956                                          sizeof(struct dmar_domain),
2957                                          0,
2958                                          SLAB_HWCACHE_ALIGN,
2960                                          NULL);
2961         if (!iommu_domain_cache) {
2962                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2963                 ret = -ENOMEM;
2964         }
2965
2966         return ret;
2967 }
2968
2969 static inline int iommu_devinfo_cache_init(void)
2970 {
2971         int ret = 0;
2972
2973         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2974                                          sizeof(struct device_domain_info),
2975                                          0,
2976                                          SLAB_HWCACHE_ALIGN,
2977                                          NULL);
2978         if (!iommu_devinfo_cache) {
2979                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2980                 ret = -ENOMEM;
2981         }
2982
2983         return ret;
2984 }
2985
2986 static inline int iommu_iova_cache_init(void)
2987 {
2988         int ret = 0;
2989
2990         iommu_iova_cache = kmem_cache_create("iommu_iova",
2991                                          sizeof(struct iova),
2992                                          0,
2993                                          SLAB_HWCACHE_ALIGN,
2994                                          NULL);
2995         if (!iommu_iova_cache) {
2996                 printk(KERN_ERR "Couldn't create iova cache\n");
2997                 ret = -ENOMEM;
2998         }
2999
3000         return ret;
3001 }
3002
3003 static int __init iommu_init_mempool(void)
3004 {
3005         int ret;
3006         ret = iommu_iova_cache_init();
3007         if (ret)
3008                 return ret;
3009
3010         ret = iommu_domain_cache_init();
3011         if (ret)
3012                 goto domain_error;
3013
3014         ret = iommu_devinfo_cache_init();
3015         if (!ret)
3016                 return ret;
3017
3018         kmem_cache_destroy(iommu_domain_cache);
3019 domain_error:
3020         kmem_cache_destroy(iommu_iova_cache);
3021
3022         return -ENOMEM;
3023 }
3024
3025 static void __init iommu_exit_mempool(void)
3026 {
3027         kmem_cache_destroy(iommu_devinfo_cache);
3028         kmem_cache_destroy(iommu_domain_cache);
3029         kmem_cache_destroy(iommu_iova_cache);
3031 }
3032
3033 static void __init init_no_remapping_devices(void)
3034 {
3035         struct dmar_drhd_unit *drhd;
3036
3037         for_each_drhd_unit(drhd) {
3038                 if (!drhd->include_all) {
3039                         int i;
3040                         for (i = 0; i < drhd->devices_cnt; i++)
3041                                 if (drhd->devices[i] != NULL)
3042                                         break;
3043                         /* ignore DMAR unit if no pci devices exist */
3044                         if (i == drhd->devices_cnt)
3045                                 drhd->ignored = 1;
3046                 }
3047         }
3048
3049         if (dmar_map_gfx)
3050                 return;
3051
3052         for_each_drhd_unit(drhd) {
3053                 int i;
3054                 if (drhd->ignored || drhd->include_all)
3055                         continue;
3056
3057                 for (i = 0; i < drhd->devices_cnt; i++)
3058                         if (drhd->devices[i] &&
3059                                 !IS_GFX_DEVICE(drhd->devices[i]))
3060                                 break;
3061
3062                 if (i < drhd->devices_cnt)
3063                         continue;
3064
3065                 /* bypass IOMMU if it is just for gfx devices */
3066                 drhd->ignored = 1;
3067                 for (i = 0; i < drhd->devices_cnt; i++) {
3068                         if (!drhd->devices[i])
3069                                 continue;
3070                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3071                 }
3072         }
3073 }
3074
3075 #ifdef CONFIG_SUSPEND
3076 static int init_iommu_hw(void)
3077 {
3078         struct dmar_drhd_unit *drhd;
3079         struct intel_iommu *iommu = NULL;
3080
3081         for_each_active_iommu(iommu, drhd)
3082                 if (iommu->qi)
3083                         dmar_reenable_qi(iommu);
3084
3085         for_each_active_iommu(iommu, drhd) {
3086                 iommu_flush_write_buffer(iommu);
3087
3088                 iommu_set_root_entry(iommu);
3089
3090                 iommu->flush.flush_context(iommu, 0, 0, 0,
3091                                            DMA_CCMD_GLOBAL_INVL);
3092                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3093                                          DMA_TLB_GLOBAL_FLUSH);
3094                 iommu_enable_translation(iommu);
3095                 iommu_disable_protect_mem_regions(iommu);
3096         }
3097
3098         return 0;
3099 }
3100
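     /* Globally invalidate the context cache and IOTLB on every active IOMMU. */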
3101 static void iommu_flush_all(void)
3102 {
3103         struct dmar_drhd_unit *drhd;
3104         struct intel_iommu *iommu;
3105
3106         for_each_active_iommu(iommu, drhd) {
3107                 iommu->flush.flush_context(iommu, 0, 0, 0,
3108                                            DMA_CCMD_GLOBAL_INVL);
3109                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3110                                          DMA_TLB_GLOBAL_FLUSH);
3111         }
3112 }
3113
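     /*
      * Flush all IOMMUs, disable translation and save the fault-event
      * registers before entering suspend.
      */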
3114 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
3115 {
3116         struct dmar_drhd_unit *drhd;
3117         struct intel_iommu *iommu = NULL;
3118         unsigned long flag;
3119
3120         for_each_active_iommu(iommu, drhd) {
3121                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3122                                                  GFP_ATOMIC);
3123                 if (!iommu->iommu_state)
3124                         goto nomem;
3125         }
3126
3127         iommu_flush_all();
3128
3129         for_each_active_iommu(iommu, drhd) {
3130                 iommu_disable_translation(iommu);
3131
3132                 spin_lock_irqsave(&iommu->register_lock, flag);
3133
3134                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3135                         readl(iommu->reg + DMAR_FECTL_REG);
3136                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3137                         readl(iommu->reg + DMAR_FEDATA_REG);
3138                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3139                         readl(iommu->reg + DMAR_FEADDR_REG);
3140                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3141                         readl(iommu->reg + DMAR_FEUADDR_REG);
3142
3143                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3144         }
3145         return 0;
3146
3147 nomem:
3148         for_each_active_iommu(iommu, drhd)
3149                 kfree(iommu->iommu_state);
3150
3151         return -ENOMEM;
3152 }
3153
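     /*
      * Re-initialize each IOMMU and restore the fault-event registers
      * saved by iommu_suspend().
      */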
3154 static int iommu_resume(struct sys_device *dev)
3155 {
3156         struct dmar_drhd_unit *drhd;
3157         struct intel_iommu *iommu = NULL;
3158         unsigned long flag;
3159
3160         if (init_iommu_hw()) {
3161                 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3162                 return -EIO;
3163         }
3164
3165         for_each_active_iommu(iommu, drhd) {
3167                 spin_lock_irqsave(&iommu->register_lock, flag);
3168
3169                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3170                         iommu->reg + DMAR_FECTL_REG);
3171                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3172                         iommu->reg + DMAR_FEDATA_REG);
3173                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3174                         iommu->reg + DMAR_FEADDR_REG);
3175                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3176                         iommu->reg + DMAR_FEUADDR_REG);
3177
3178                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3179         }
3180
3181         for_each_active_iommu(iommu, drhd)
3182                 kfree(iommu->iommu_state);
3183
3184         return 0;
3185 }
3186
3187 static struct sysdev_class iommu_sysclass = {
3188         .name           = "iommu",
3189         .resume         = iommu_resume,
3190         .suspend        = iommu_suspend,
3191 };
3192
3193 static struct sys_device device_iommu = {
3194         .cls    = &iommu_sysclass,
3195 };
3196
3197 static int __init init_iommu_sysfs(void)
3198 {
3199         int error;
3200
3201         error = sysdev_class_register(&iommu_sysclass);
3202         if (error)
3203                 return error;
3204
3205         error = sysdev_register(&device_iommu);
3206         if (error)
3207                 sysdev_class_unregister(&iommu_sysclass);
3208
3209         return error;
3210 }
3211
3212 #else
3213 static int __init init_iommu_sysfs(void)
3214 {
3215         return 0;
3216 }
3217 #endif  /* CONFIG_SUSPEND */
3218
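     /*
      * Main VT-d initialization: parse the DMAR table and device scopes,
      * set up domains on each IOMMU, install the Intel DMA ops and
      * register the IOMMU API operations.
      */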
3219 int __init intel_iommu_init(void)
3220 {
3221         int ret = 0;
3222         int force_on = 0;
3223
3224         /* VT-d is required for a TXT/tboot launch, so enforce that */
3225         force_on = tboot_force_iommu();
3226
3227         if (dmar_table_init()) {
3228                 if (force_on)
3229                         panic("tboot: Failed to initialize DMAR table\n");
3230                 return  -ENODEV;
3231         }
3232
3233         if (dmar_dev_scope_init()) {
3234                 if (force_on)
3235                         panic("tboot: Failed to initialize DMAR device scope\n");
3236                 return  -ENODEV;
3237         }
3238
3239         /*
3240          * Check the need for DMA-remapping initialization now.
3241          * Above initialization will also be used by Interrupt-remapping.
3242          */
3243         if (no_iommu || swiotlb || dmar_disabled)
3244                 return -ENODEV;
3245
3246         iommu_init_mempool();
3247         dmar_init_reserved_ranges();
3248
3249         init_no_remapping_devices();
3250
3251         ret = init_dmars();
3252         if (ret) {
3253                 if (force_on)
3254                         panic("tboot: Failed to initialize DMARs\n");
3255                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3256                 put_iova_domain(&reserved_iova_list);
3257                 iommu_exit_mempool();
3258                 return ret;
3259         }
3260         printk(KERN_INFO
3261                "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3262
3263         init_timer(&unmap_timer);
3264         force_iommu = 1;
3265         dma_ops = &intel_dma_ops;
3266
3267         init_iommu_sysfs();
3268
3269         register_iommu(&intel_iommu_ops);
3270
3271         return 0;
3272 }
3273
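     /*
      * Clear the context entries of the bridges sitting between pdev and
      * its upstream PCIe-to-PCI bridge, including that bridge itself.
      */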
3274 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3275                                            struct pci_dev *pdev)
3276 {
3277         struct pci_dev *tmp, *parent;
3278
3279         if (!iommu || !pdev)
3280                 return;
3281
3282         /* dependent device detach */
3283         tmp = pci_find_upstream_pcie_bridge(pdev);
3284         /* Secondary interface's bus number and devfn 0 */
3285         if (tmp) {
3286                 parent = pdev->bus->self;
3287                 while (parent != tmp) {
3288                         iommu_detach_dev(iommu, parent->bus->number,
3289                                          parent->devfn);
3290                         parent = parent->bus->self;
3291                 }
3292                 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
3293                         iommu_detach_dev(iommu,
3294                                 tmp->subordinate->number, 0);
3295                 else /* this is a legacy PCI bridge */
3296                         iommu_detach_dev(iommu, tmp->bus->number,
3297                                          tmp->devfn);
3298         }
3299 }
3300
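     /*
      * Detach pdev from the domain and, if no other device behind the same
      * IOMMU remains in it, release the domain's reference to that IOMMU.
      */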
3301 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3302                                           struct pci_dev *pdev)
3303 {
3304         struct device_domain_info *info;
3305         struct intel_iommu *iommu;
3306         unsigned long flags;
3307         int found = 0;
3308         struct list_head *entry, *tmp;
3309
3310         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3311                                 pdev->devfn);
3312         if (!iommu)
3313                 return;
3314
3315         spin_lock_irqsave(&device_domain_lock, flags);
3316         list_for_each_safe(entry, tmp, &domain->devices) {
3317                 info = list_entry(entry, struct device_domain_info, link);
3318                 /* No need to compare PCI domain; it has to be the same */
3319                 if (info->bus == pdev->bus->number &&
3320                     info->devfn == pdev->devfn) {
3321                         list_del(&info->link);
3322                         list_del(&info->global);
3323                         if (info->dev)
3324                                 info->dev->dev.archdata.iommu = NULL;
3325                         spin_unlock_irqrestore(&device_domain_lock, flags);
3326
3327                         iommu_disable_dev_iotlb(info);
3328                         iommu_detach_dev(iommu, info->bus, info->devfn);
3329                         iommu_detach_dependent_devices(iommu, pdev);
3330                         free_devinfo_mem(info);
3331
3332                         spin_lock_irqsave(&device_domain_lock, flags);
3333
3334                         if (found)
3335                                 break;
3336                         else
3337                                 continue;
3338                 }
3339
3340                 /* If there are no other devices under the same iommu
3341                  * owned by this domain, clear this iommu from iommu_bmp
3342                  * and update the iommu count and coherency.
3343                  */
3344                 if (iommu == device_to_iommu(info->segment, info->bus,
3345                                             info->devfn))
3346                         found = 1;
3347         }
3348
3349         if (found == 0) {
3350                 unsigned long tmp_flags;
3351                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3352                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3353                 domain->iommu_count--;
3354                 domain_update_iommu_cap(domain);
3355                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3356         }
3357
3358         spin_unlock_irqrestore(&device_domain_lock, flags);
3359 }
3360
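     /* Detach every device from the domain and drop its per-IOMMU references. */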
3361 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3362 {
3363         struct device_domain_info *info;
3364         struct intel_iommu *iommu;
3365         unsigned long flags1, flags2;
3366
3367         spin_lock_irqsave(&device_domain_lock, flags1);
3368         while (!list_empty(&domain->devices)) {
3369                 info = list_entry(domain->devices.next,
3370                         struct device_domain_info, link);
3371                 list_del(&info->link);
3372                 list_del(&info->global);
3373                 if (info->dev)
3374                         info->dev->dev.archdata.iommu = NULL;
3375
3376                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3377
3378                 iommu_disable_dev_iotlb(info);
3379                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3380                 iommu_detach_dev(iommu, info->bus, info->devfn);
3381                 iommu_detach_dependent_devices(iommu, info->dev);
3382
3383                 /* clear this iommu in iommu_bmp, update iommu count
3384                  * and capabilities
3385                  */
3386                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3387                 if (test_and_clear_bit(iommu->seq_id,
3388                                        &domain->iommu_bmp)) {
3389                         domain->iommu_count--;
3390                         domain_update_iommu_cap(domain);
3391                 }
3392                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3393
3394                 free_devinfo_mem(info);
3395                 spin_lock_irqsave(&device_domain_lock, flags1);
3396         }
3397         spin_unlock_irqrestore(&device_domain_lock, flags1);
3398 }
3399
3400 /* domain id for virtual machine, it won't be set in context */
3401 static unsigned long vm_domid;
3402
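     /* Return the smallest AGAW supported by the IOMMUs this domain spans. */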
3403 static int vm_domain_min_agaw(struct dmar_domain *domain)
3404 {
3405         int i;
3406         int min_agaw = domain->agaw;
3407
3408         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
3409         for (; i < g_num_of_iommus; ) {
3410                 if (min_agaw > g_iommus[i]->agaw)
3411                         min_agaw = g_iommus[i]->agaw;
3412
3413                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
3414         }
3415
3416         return min_agaw;
3417 }
3418
3419 static struct dmar_domain *iommu_alloc_vm_domain(void)
3420 {
3421         struct dmar_domain *domain;
3422
3423         domain = alloc_domain_mem();
3424         if (!domain)
3425                 return NULL;
3426
3427         domain->id = vm_domid++;
3428         domain->nid = -1;
3429         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3430         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3431
3432         return domain;
3433 }
3434
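     /*
      * Initialize the IOVA allocator, address widths and top-level page
      * table of a freshly allocated VM domain.
      */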
3435 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3436 {
3437         int adjust_width;
3438
3439         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3440         spin_lock_init(&domain->iommu_lock);
3441
3442         domain_reserve_special_ranges(domain);
3443
3444         /* calculate AGAW */
3445         domain->gaw = guest_width;
3446         adjust_width = guestwidth_to_adjustwidth(guest_width);
3447         domain->agaw = width_to_agaw(adjust_width);
3448
3449         INIT_LIST_HEAD(&domain->devices);
3450
3451         domain->iommu_count = 0;
3452         domain->iommu_coherency = 0;
3453         domain->iommu_snooping = 0;
3454         domain->max_addr = 0;
3455         domain->nid = -1;
3456
3457         /* always allocate the top pgd */
3458         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3459         if (!domain->pgd)
3460                 return -ENOMEM;
3461         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3462         return 0;
3463 }
3464
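     /* Release the domain IDs this VM domain holds on each hardware IOMMU. */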
3465 static void iommu_free_vm_domain(struct dmar_domain *domain)
3466 {
3467         unsigned long flags;
3468         struct dmar_drhd_unit *drhd;
3469         struct intel_iommu *iommu;
3470         unsigned long i;
3471         unsigned long ndomains;
3472
3473         for_each_drhd_unit(drhd) {
3474                 if (drhd->ignored)
3475                         continue;
3476                 iommu = drhd->iommu;
3477
3478                 ndomains = cap_ndoms(iommu->cap);
3479                 i = find_first_bit(iommu->domain_ids, ndomains);
3480                 for (; i < ndomains; ) {
3481                         if (iommu->domains[i] == domain) {
3482                                 spin_lock_irqsave(&iommu->lock, flags);
3483                                 clear_bit(i, iommu->domain_ids);
3484                                 iommu->domains[i] = NULL;
3485                                 spin_unlock_irqrestore(&iommu->lock, flags);
3486                                 break;
3487                         }
3488                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3489                 }
3490         }
3491 }
3492
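     /* Tear down a VM domain: detach its devices, then free its IOVAs and page tables. */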
3493 static void vm_domain_exit(struct dmar_domain *domain)
3494 {
3495         /* Domain 0 is reserved, so don't process it */
3496         if (!domain)
3497                 return;
3498
3499         vm_domain_remove_all_dev_info(domain);
3500         /* destroy iovas */
3501         put_iova_domain(&domain->iovad);
3502
3503         /* clear ptes */
3504         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3505
3506         /* free page tables */
3507         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3508
3509         iommu_free_vm_domain(domain);
3510         free_domain_mem(domain);
3511 }
3512
3513 static int intel_iommu_domain_init(struct iommu_domain *domain)
3514 {
3515         struct dmar_domain *dmar_domain;
3516
3517         dmar_domain = iommu_alloc_vm_domain();
3518         if (!dmar_domain) {
3519                 printk(KERN_ERR
3520                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3521                 return -ENOMEM;
3522         }
3523         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3524                 printk(KERN_ERR
3525                         "intel_iommu_domain_init() failed\n");
3526                 vm_domain_exit(dmar_domain);
3527                 return -ENOMEM;
3528         }
3529         domain->priv = dmar_domain;
3530
3531         return 0;
3532 }
3533
3534 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3535 {
3536         struct dmar_domain *dmar_domain = domain->priv;
3537
3538         domain->priv = NULL;
3539         vm_domain_exit(dmar_domain);
3540 }
3541
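     /*
      * iommu_ops callback: detach dev from any previous domain, then attach
      * it after checking that the IOMMU's address width covers the domain's
      * highest mapped address.
      */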
3542 static int intel_iommu_attach_device(struct iommu_domain *domain,
3543                                      struct device *dev)
3544 {
3545         struct dmar_domain *dmar_domain = domain->priv;
3546         struct pci_dev *pdev = to_pci_dev(dev);
3547         struct intel_iommu *iommu;
3548         int addr_width;
3549         u64 end;
3550
3551         /* normally pdev is not mapped */
3552         if (unlikely(domain_context_mapped(pdev))) {
3553                 struct dmar_domain *old_domain;
3554
3555                 old_domain = find_domain(pdev);
3556                 if (old_domain) {
3557                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3558                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3559                                 domain_remove_one_dev_info(old_domain, pdev);
3560                         else
3561                                 domain_remove_dev_info(old_domain);
3562                 }
3563         }
3564
3565         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3566                                 pdev->devfn);
3567         if (!iommu)
3568                 return -ENODEV;
3569
3570         /* check if this iommu agaw is sufficient for max mapped address */
3571         addr_width = agaw_to_width(iommu->agaw);
3572         end = DOMAIN_MAX_ADDR(addr_width);
3573         end = end & VTD_PAGE_MASK;
3574         if (end < dmar_domain->max_addr) {
3575                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3576                        "sufficient for the mapped address (%llx)\n",
3577                        __func__, iommu->agaw, dmar_domain->max_addr);
3578                 return -EFAULT;
3579         }
3580
3581         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3582 }
3583
3584 static void intel_iommu_detach_device(struct iommu_domain *domain,
3585                                       struct device *dev)
3586 {
3587         struct dmar_domain *dmar_domain = domain->priv;
3588         struct pci_dev *pdev = to_pci_dev(dev);
3589
3590         domain_remove_one_dev_info(dmar_domain, pdev);
3591 }
3592
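     /*
      * iommu_ops callback: map [iova, iova + size) to hpa, growing max_addr
      * only when the range still fits the smallest AGAW among the domain's
      * IOMMUs.
      */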
3593 static int intel_iommu_map_range(struct iommu_domain *domain,
3594                                  unsigned long iova, phys_addr_t hpa,
3595                                  size_t size, int iommu_prot)
3596 {
3597         struct dmar_domain *dmar_domain = domain->priv;
3598         u64 max_addr;
3599         int addr_width;
3600         int prot = 0;
3601         int ret;
3602
3603         if (iommu_prot & IOMMU_READ)
3604                 prot |= DMA_PTE_READ;
3605         if (iommu_prot & IOMMU_WRITE)
3606                 prot |= DMA_PTE_WRITE;
3607         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3608                 prot |= DMA_PTE_SNP;
3609
3610         max_addr = iova + size;
3611         if (dmar_domain->max_addr < max_addr) {
3612                 int min_agaw;
3613                 u64 end;
3614
3615                 /* check if minimum agaw is sufficient for mapped address */
3616                 min_agaw = vm_domain_min_agaw(dmar_domain);
3617                 addr_width = agaw_to_width(min_agaw);
3618                 end = DOMAIN_MAX_ADDR(addr_width);
3619                 end = end & VTD_PAGE_MASK;
3620                 if (end < max_addr) {
3621                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3622                                "sufficient for the mapped address (%llx)\n",
3623                                __func__, min_agaw, max_addr);
3624                         return -EFAULT;
3625                 }
3626                 dmar_domain->max_addr = max_addr;
3627         }
3628         /* Round up size to next multiple of PAGE_SIZE, if it and
3629            the low bits of hpa would take us onto the next page */
3630         size = aligned_nrpages(hpa, size);
3631         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3632                                  hpa >> VTD_PAGE_SHIFT, size, prot);
3633         return ret;
3634 }
3635
3636 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3637                                     unsigned long iova, size_t size)
3638 {
3639         struct dmar_domain *dmar_domain = domain->priv;
3640
3641         if (!size)
3642                 return;
3643
3644         dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3645                             (iova + size - 1) >> VTD_PAGE_SHIFT);
3646
3647         if (dmar_domain->max_addr == iova + size)
3648                 dmar_domain->max_addr = iova;
3649 }
3650
3651 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3652                                             unsigned long iova)
3653 {
3654         struct dmar_domain *dmar_domain = domain->priv;
3655         struct dma_pte *pte;
3656         u64 phys = 0;
3657
3658         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3659         if (pte)
3660                 phys = dma_pte_addr(pte);
3661
3662         return phys;
3663 }
3664
3665 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3666                                       unsigned long cap)
3667 {
3668         struct dmar_domain *dmar_domain = domain->priv;
3669
3670         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3671                 return dmar_domain->iommu_snooping;
3672
3673         return 0;
3674 }
3675
3676 static struct iommu_ops intel_iommu_ops = {
3677         .domain_init    = intel_iommu_domain_init,
3678         .domain_destroy = intel_iommu_domain_destroy,
3679         .attach_dev     = intel_iommu_attach_device,
3680         .detach_dev     = intel_iommu_detach_device,
3681         .map            = intel_iommu_map_range,
3682         .unmap          = intel_iommu_unmap_range,
3683         .iova_to_phys   = intel_iommu_iova_to_phys,
3684         .domain_has_cap = intel_iommu_domain_has_cap,
3685 };
3686
3687 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3688 {
3689         /*
3690          * Mobile 4 Series Chipset neglects to set RWBF capability,
3691          * but needs it:
3692          */
3693         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3694         rwbf_quirk = 1;
3695 }
3696
3697 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3698
3699 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3700    ISOCH DMAR unit for the Azalia sound device, but not give it any
3701    TLB entries, which causes it to deadlock. Check for that.  We do
3702    this in a function called from init_dmars(), instead of in a PCI
3703    quirk, because we don't want to print the obnoxious "BIOS broken"
3704    message if VT-d is actually disabled.
3705 */
3706 static void __init check_tylersburg_isoch(void)
3707 {
3708         struct pci_dev *pdev;
3709         uint32_t vtisochctrl;
3710
3711         /* If there's no Azalia in the system anyway, forget it. */
3712         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3713         if (!pdev)
3714                 return;
3715         pci_dev_put(pdev);
3716
3717         /* System Management Registers. Might be hidden, in which case
3718            we can't do the sanity check. But that's OK, because the
3719            known-broken BIOSes _don't_ actually hide it, so far. */
3720         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3721         if (!pdev)
3722                 return;
3723
3724         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3725                 pci_dev_put(pdev);
3726                 return;
3727         }
3728
3729         pci_dev_put(pdev);
3730
3731         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3732         if (vtisochctrl & 1)
3733                 return;
3734
3735         /* Drop all bits other than the number of TLB entries */
3736         vtisochctrl &= 0x1c;
3737
3738         /* If we have the recommended number of TLB entries (16), fine. */
3739         if (vtisochctrl == 0x10)
3740                 return;
3741
3742         /* Zero TLB entries? You get to ride the short bus to school. */
3743         if (!vtisochctrl) {
3744                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3745                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3746                      dmi_get_system_info(DMI_BIOS_VENDOR),
3747                      dmi_get_system_info(DMI_BIOS_VERSION),
3748                      dmi_get_system_info(DMI_PRODUCT_VERSION));
3749                 iommu_identity_mapping |= IDENTMAP_AZALIA;
3750                 return;
3751         }
3752
3753         printk(KERN_WARNING "DMAR: Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3754                vtisochctrl);
3755 }