Merge branch 'release' of git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux...
[linux-2.6.git] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/syscore_ops.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <linux/pci-ats.h>
43 #include <asm/cacheflush.h>
44 #include <asm/iommu.h>
45 #include "pci.h"
46
/* Byte size of the root-entry table: one VT-d page (see ROOT_ENTRY_NR). */
#define ROOT_SIZE               VTD_PAGE_SIZE
/* Byte size of one context-entry table; a new table is cache-flushed over this length. */
#define CONTEXT_SIZE            VTD_PAGE_SIZE
49
/*
 * PCI device classification helpers.  Each takes a pointer expression to a
 * device structure with 'class', 'vendor' and 'device' members.  The argument
 * is fully parenthesized so that compound expressions such as "p + 1" or
 * "&dev" expand correctly.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
/* Matches vendor 0x8086, device 0x3a3e. */
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
53
/*
 * NOTE(review): presumably the interrupt/IOAPIC address window that must be
 * kept out of the IOVA space -- the users are outside this part of the file.
 */
#define IOAPIC_RANGE_START      (0xfee00000)
#define IOAPIC_RANGE_END        (0xfeefffff)
/* NOTE(review): presumably the lowest IOVA handed out; confirm at the user. */
#define IOVA_START_ADDR         (0x1000)

/* Address width passed to __iommu_calculate_agaw() by iommu_calculate_agaw(). */
#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

/* Address width passed to __iommu_calculate_agaw() by iommu_calculate_max_sagaw(). */
#define MAX_AGAW_WIDTH 64
61
/*
 * Highest VT-d page frame number / highest byte address representable with a
 * guest address width of 'gaw' bits, as 64-bit values.  'gaw' is
 * parenthesized so that any expression may be passed.  The shift count must
 * stay below 64.
 */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
64
/*
 * DOMAIN_MAX_PFN is clamped so that it fits in an unsigned long; that way
 * 'unsigned long' can be used for PFNs with impunity.
 * NOTE(review): DOMAIN_MAX_ADDR is built from the unclamped __DOMAIN_MAX_PFN,
 * so the two only agree where unsigned long can hold the full PFN -- confirm
 * this is intended for 32-bit builds.
 */
#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
70
/* Convert a byte address to a page frame number in units of PAGE_SIZE. */
#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
/* Page frame numbers of the highest 32-bit and 64-bit DMA addresses. */
#define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
#define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))

/* page table handling */
/* Number of PFN bits consumed by each page-table level. */
#define LEVEL_STRIDE            (9)
/* Mask selecting one level's index bits after shifting (low LEVEL_STRIDE bits). */
#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
78
/* Number of page-table levels for an AGAW index: index 0 means 2 levels. */
static inline int agaw_to_level(int agaw)
{
        int levels = 2 + agaw;

        return levels;
}
83
84 static inline int agaw_to_width(int agaw)
85 {
86         return 30 + agaw * LEVEL_STRIDE;
87 }
88
89 static inline int width_to_agaw(int width)
90 {
91         return (width - 30) / LEVEL_STRIDE;
92 }
93
94 static inline unsigned int level_to_offset_bits(int level)
95 {
96         return (level - 1) * LEVEL_STRIDE;
97 }
98
99 static inline int pfn_level_offset(unsigned long pfn, int level)
100 {
101         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
102 }
103
/* Mask that clears the PFN bits below the index field of 'level'. */
static inline unsigned long level_mask(int level)
{
        unsigned int shift = level_to_offset_bits(level);

        return ~0UL << shift;
}
108
/* Number of PFNs covered by a single entry at 'level'. */
static inline unsigned long level_size(int level)
{
        unsigned int shift = level_to_offset_bits(level);

        return 1UL << shift;
}
113
/* Round 'pfn' up to the next boundary of an entry at 'level'. */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
        unsigned long span = level_size(level);
        unsigned long bumped = pfn + span - 1;

        return bumped & level_mask(level);
}
118
119 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
120    are never going to work. */
121 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
122 {
123         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
124 }
125
126 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
127 {
128         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
129 }
/* VT-d PFN of the start of the given struct page. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
        unsigned long mm_pfn = page_to_pfn(pg);

        return mm_to_dma_pfn(mm_pfn);
}
/* VT-d PFN of the page that contains kernel virtual address 'p'. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
        struct page *backing = virt_to_page(p);

        return page_to_dma_pfn(backing);
}
138
/*
 * global iommu list, set NULL for ignored DMAR units.
 * Indexed by the bit numbers stored in dmar_domain.iommu_bmp.
 */
static struct intel_iommu **g_iommus;

/* Forward declaration; the definition is outside this part of the file. */
static void __init check_tylersburg_isoch(void);
/*
 * When non-zero, iommu_flush_write_buffer() flushes even if the unit's
 * capability register does not report cap_rwbf.
 */
static int rwbf_quirk;
144
/*
 * Root table entry layout:
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
        u64     val;    /* bits 0-63: present bit and context-table pointer */
        u64     rsvd1;  /* bits 64-127: reserved */
};
/* Number of root entries that fit in one VT-d page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
156 static inline bool root_present(struct root_entry *root)
157 {
158         return (root->val & 1);
159 }
160 static inline void set_root_present(struct root_entry *root)
161 {
162         root->val |= 1;
163 }
164 static inline void set_root_value(struct root_entry *root, unsigned long value)
165 {
166         root->val |= value & VTD_PAGE_MASK;
167 }
168
169 static inline struct context_entry *
170 get_context_addr_from_root(struct root_entry *root)
171 {
172         return (struct context_entry *)
173                 (root_present(root)?phys_to_virt(
174                 root->val & VTD_PAGE_MASK) :
175                 NULL);
176 }
177
/*
 * Context entry layout.
 * low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 3-6: aval
 * 8-23: domain id
 */
struct context_entry {
        u64 lo; /* low 64 bits, see layout above */
        u64 hi; /* high 64 bits, see layout above */
};
193
194 static inline bool context_present(struct context_entry *context)
195 {
196         return (context->lo & 1);
197 }
198 static inline void context_set_present(struct context_entry *context)
199 {
200         context->lo |= 1;
201 }
202
203 static inline void context_set_fault_enable(struct context_entry *context)
204 {
205         context->lo &= (((u64)-1) << 2) | 1;
206 }
207
208 static inline void context_set_translation_type(struct context_entry *context,
209                                                 unsigned long value)
210 {
211         context->lo &= (((u64)-1) << 4) | 3;
212         context->lo |= (value & 3) << 2;
213 }
214
215 static inline void context_set_address_root(struct context_entry *context,
216                                             unsigned long value)
217 {
218         context->lo |= value & VTD_PAGE_MASK;
219 }
220
221 static inline void context_set_address_width(struct context_entry *context,
222                                              unsigned long value)
223 {
224         context->hi |= value & 7;
225 }
226
227 static inline void context_set_domain_id(struct context_entry *context,
228                                          unsigned long value)
229 {
230         context->hi |= (value & ((1 << 16) - 1)) << 8;
231 }
232
233 static inline void context_clear_entry(struct context_entry *context)
234 {
235         context->lo = 0;
236         context->hi = 0;
237 }
238
/*
 * Page-table entry layout:
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
struct dma_pte {
        u64 val;
};
251
252 static inline void dma_clear_pte(struct dma_pte *pte)
253 {
254         pte->val = 0;
255 }
256
257 static inline void dma_set_pte_readable(struct dma_pte *pte)
258 {
259         pte->val |= DMA_PTE_READ;
260 }
261
262 static inline void dma_set_pte_writable(struct dma_pte *pte)
263 {
264         pte->val |= DMA_PTE_WRITE;
265 }
266
267 static inline void dma_set_pte_snp(struct dma_pte *pte)
268 {
269         pte->val |= DMA_PTE_SNP;
270 }
271
272 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
273 {
274         pte->val = (pte->val & ~3) | (prot & 3);
275 }
276
277 static inline u64 dma_pte_addr(struct dma_pte *pte)
278 {
279 #ifdef CONFIG_64BIT
280         return pte->val & VTD_PAGE_MASK;
281 #else
282         /* Must have a full atomic 64-bit read */
283         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
284 #endif
285 }
286
287 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
288 {
289         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
290 }
291
292 static inline bool dma_pte_present(struct dma_pte *pte)
293 {
294         return (pte->val & 3) != 0;
295 }
296
297 static inline int first_pte_in_page(struct dma_pte *pte)
298 {
299         return !((unsigned long)pte & ~VTD_PAGE_MASK);
300 }
301
/*
 * This domain is a static identity-mapping domain.
 *      1. This domain creates a static 1:1 mapping to all usable memory.
 *      2. It maps to each iommu if successful.
 *      3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
/* NOTE(review): presumably cleared when hardware pass-through is unusable; writers are outside this chunk. */
static int hw_pass_through = 1;
310
/* Bit values for dmar_domain.flags. */

/* devices under the same p2p bridge are owned in one domain */
#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)

/* domain represents a virtual machine; more than one device
 * across iommus may be owned in one domain, e.g. kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
321
/* Per-domain state: page table root, owning iommus and attached devices. */
struct dmar_domain {
        int     id;                     /* domain id */
        int     nid;                    /* node id; passed to alloc_pgtable_page() */
        unsigned long iommu_bmp;        /* bitmap of iommus this domain uses;
                                         * one bit per g_iommus[] index */

        struct list_head devices;       /* all devices' list */
        struct iova_domain iovad;       /* iova's that belong to this domain */

        struct dma_pte  *pgd;           /* virtual address */
        int             gaw;            /* max guest address width */

        /* adjusted guest address width, 0 is level 2 30-bit */
        int             agaw;

        int             flags;          /* DOMAIN_FLAG_* bits: type of domain */

        int             iommu_coherency;/* indicate coherency of iommu access */
        int             iommu_snooping; /* indicate snooping control feature*/
        int             iommu_count;    /* reference count of iommu */
        spinlock_t      iommu_lock;     /* protect iommu set in domain */
        u64             max_addr;       /* maximum mapped address */
};
344
/* PCI domain-device relationship: one record per device attached to a domain */
struct device_domain_info {
        struct list_head link;  /* link to domain siblings */
        struct list_head global; /* link to global list */
        int segment;            /* PCI domain */
        u8 bus;                 /* PCI bus number */
        u8 devfn;               /* PCI devfn number */
        struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
        struct intel_iommu *iommu; /* IOMMU used by this device */
        struct dmar_domain *domain; /* pointer to domain */
};
356
/* Timer callback, defined outside this part of the file. */
static void flush_unmaps_timeout(unsigned long data);

/* Timer that runs flush_unmaps_timeout(); declared with expiry 0 and data 0. */
DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);

/* Capacity of each deferred_flush_tables array. */
#define HIGH_WATER_MARK 250
/*
 * Parallel arrays of (iova, domain) pairs, with 'next' as a fill index.
 * NOTE(review): presumably entries queued for a batched IOTLB flush -- the
 * users are outside this chunk.
 */
struct deferred_flush_tables {
        int next;
        struct iova *iova[HIGH_WATER_MARK];
        struct dmar_domain *domain[HIGH_WATER_MARK];
};

static struct deferred_flush_tables *deferred_flush;
369
/*
 * Number of iommus; used as the bit-count bound when scanning a domain's
 * iommu_bmp bitmap and indexing g_iommus[].
 */
static int g_num_of_iommus;

/* NOTE(review): presumably serializes the deferred-unmap state below; users are outside this chunk. */
static DEFINE_SPINLOCK(async_umap_flush_lock);
static LIST_HEAD(unmaps_to_do);

static int timer_on;
static long list_size;

/* Forward declaration; the definition is outside this part of the file. */
static void domain_remove_dev_info(struct dmar_domain *domain);
380
/*
 * Non-zero when DMA remapping is disabled.  The build-time default comes
 * from CONFIG_DMAR_DEFAULT_ON; "intel_iommu=on|off" overrides it.
 */
#ifdef CONFIG_DMAR_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /*CONFIG_DMAR_DEFAULT_ON*/

/* Tunables changed by intel_iommu_setup(): */
static int dmar_map_gfx = 1;            /* cleared by "igfx_off" */
static int dmar_forcedac;               /* set by "forcedac" */
static int intel_iommu_strict;          /* set by "strict" */

/* Sentinel pointer value (all ones), distinct from any real info record and from NULL. */
#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

static struct iommu_ops intel_iommu_ops;
396
397 static int __init intel_iommu_setup(char *str)
398 {
399         if (!str)
400                 return -EINVAL;
401         while (*str) {
402                 if (!strncmp(str, "on", 2)) {
403                         dmar_disabled = 0;
404                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
405                 } else if (!strncmp(str, "off", 3)) {
406                         dmar_disabled = 1;
407                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
408                 } else if (!strncmp(str, "igfx_off", 8)) {
409                         dmar_map_gfx = 0;
410                         printk(KERN_INFO
411                                 "Intel-IOMMU: disable GFX device mapping\n");
412                 } else if (!strncmp(str, "forcedac", 8)) {
413                         printk(KERN_INFO
414                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
415                         dmar_forcedac = 1;
416                 } else if (!strncmp(str, "strict", 6)) {
417                         printk(KERN_INFO
418                                 "Intel-IOMMU: disable batched IOTLB flush\n");
419                         intel_iommu_strict = 1;
420                 }
421
422                 str += strcspn(str, ",");
423                 while (*str == ',')
424                         str++;
425         }
426         return 0;
427 }
428 __setup("intel_iommu=", intel_iommu_setup);
429
/* Slab caches behind alloc_domain_mem(), alloc_devinfo_mem() and alloc_iova_mem(). */
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;
433
434 static inline void *alloc_pgtable_page(int node)
435 {
436         struct page *page;
437         void *vaddr = NULL;
438
439         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
440         if (page)
441                 vaddr = page_address(page);
442         return vaddr;
443 }
444
/* Release a page obtained from alloc_pgtable_page(). */
static inline void free_pgtable_page(void *vaddr)
{
        unsigned long addr = (unsigned long)vaddr;

        free_page(addr);
}
449
450 static inline void *alloc_domain_mem(void)
451 {
452         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
453 }
454
/* Return an object obtained from alloc_domain_mem() to iommu_domain_cache. */
static void free_domain_mem(void *vaddr)
{
        kmem_cache_free(iommu_domain_cache, vaddr);
}
459
460 static inline void * alloc_devinfo_mem(void)
461 {
462         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
463 }
464
/* Return an object obtained from alloc_devinfo_mem() to iommu_devinfo_cache. */
static inline void free_devinfo_mem(void *vaddr)
{
        kmem_cache_free(iommu_devinfo_cache, vaddr);
}
469
470 struct iova *alloc_iova_mem(void)
471 {
472         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
473 }
474
/* Return a struct iova obtained from alloc_iova_mem() to iommu_iova_cache. */
void free_iova_mem(struct iova *iova)
{
        kmem_cache_free(iommu_iova_cache, iova);
}
479
480
481 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
482 {
483         unsigned long sagaw;
484         int agaw = -1;
485
486         sagaw = cap_sagaw(iommu->cap);
487         for (agaw = width_to_agaw(max_gaw);
488              agaw >= 0; agaw--) {
489                 if (test_bit(agaw, &sagaw))
490                         break;
491         }
492
493         return agaw;
494 }
495
496 /*
497  * Calculate max SAGAW for each iommu.
498  */
499 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
500 {
501         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
502 }
503
504 /*
505  * calculate agaw for each iommu.
506  * "SAGAW" may be different across iommus, use a default agaw, and
507  * get a supported less agaw for iommus that don't support the default agaw.
508  */
509 int iommu_calculate_agaw(struct intel_iommu *iommu)
510 {
511         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
512 }
513
514 /* This functionin only returns single iommu in a domain */
515 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
516 {
517         int iommu_id;
518
519         /* si_domain and vm domain should not get here. */
520         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
521         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
522
523         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
524         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
525                 return NULL;
526
527         return g_iommus[iommu_id];
528 }
529
530 static void domain_update_iommu_coherency(struct dmar_domain *domain)
531 {
532         int i;
533
534         domain->iommu_coherency = 1;
535
536         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
537                 if (!ecap_coherent(g_iommus[i]->ecap)) {
538                         domain->iommu_coherency = 0;
539                         break;
540                 }
541         }
542 }
543
544 static void domain_update_iommu_snooping(struct dmar_domain *domain)
545 {
546         int i;
547
548         domain->iommu_snooping = 1;
549
550         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
551                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
552                         domain->iommu_snooping = 0;
553                         break;
554                 }
555         }
556 }
557
/*
 * Some capabilities may be different across iommus.  Refresh the domain's
 * cached coherency and snooping flags from the iommus it currently uses.
 */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
        domain_update_iommu_coherency(domain);
        domain_update_iommu_snooping(domain);
}
564
565 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
566 {
567         struct dmar_drhd_unit *drhd = NULL;
568         int i;
569
570         for_each_drhd_unit(drhd) {
571                 if (drhd->ignored)
572                         continue;
573                 if (segment != drhd->segment)
574                         continue;
575
576                 for (i = 0; i < drhd->devices_cnt; i++) {
577                         if (drhd->devices[i] &&
578                             drhd->devices[i]->bus->number == bus &&
579                             drhd->devices[i]->devfn == devfn)
580                                 return drhd->iommu;
581                         if (drhd->devices[i] &&
582                             drhd->devices[i]->subordinate &&
583                             drhd->devices[i]->subordinate->number <= bus &&
584                             drhd->devices[i]->subordinate->subordinate >= bus)
585                                 return drhd->iommu;
586                 }
587
588                 if (drhd->include_all)
589                         return drhd->iommu;
590         }
591
592         return NULL;
593 }
594
595 static void domain_flush_cache(struct dmar_domain *domain,
596                                void *addr, int size)
597 {
598         if (!domain->iommu_coherency)
599                 clflush_cache_range(addr, size);
600 }
601
602 /* Gets context entry for a given bus and devfn */
603 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
604                 u8 bus, u8 devfn)
605 {
606         struct root_entry *root;
607         struct context_entry *context;
608         unsigned long phy_addr;
609         unsigned long flags;
610
611         spin_lock_irqsave(&iommu->lock, flags);
612         root = &iommu->root_entry[bus];
613         context = get_context_addr_from_root(root);
614         if (!context) {
615                 context = (struct context_entry *)
616                                 alloc_pgtable_page(iommu->node);
617                 if (!context) {
618                         spin_unlock_irqrestore(&iommu->lock, flags);
619                         return NULL;
620                 }
621                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
622                 phy_addr = virt_to_phys((void *)context);
623                 set_root_value(root, phy_addr);
624                 set_root_present(root);
625                 __iommu_flush_cache(iommu, root, sizeof(*root));
626         }
627         spin_unlock_irqrestore(&iommu->lock, flags);
628         return &context[devfn];
629 }
630
631 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
632 {
633         struct root_entry *root;
634         struct context_entry *context;
635         int ret;
636         unsigned long flags;
637
638         spin_lock_irqsave(&iommu->lock, flags);
639         root = &iommu->root_entry[bus];
640         context = get_context_addr_from_root(root);
641         if (!context) {
642                 ret = 0;
643                 goto out;
644         }
645         ret = context_present(&context[devfn]);
646 out:
647         spin_unlock_irqrestore(&iommu->lock, flags);
648         return ret;
649 }
650
651 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
652 {
653         struct root_entry *root;
654         struct context_entry *context;
655         unsigned long flags;
656
657         spin_lock_irqsave(&iommu->lock, flags);
658         root = &iommu->root_entry[bus];
659         context = get_context_addr_from_root(root);
660         if (context) {
661                 context_clear_entry(&context[devfn]);
662                 __iommu_flush_cache(iommu, &context[devfn], \
663                         sizeof(*context));
664         }
665         spin_unlock_irqrestore(&iommu->lock, flags);
666 }
667
668 static void free_context_table(struct intel_iommu *iommu)
669 {
670         struct root_entry *root;
671         int i;
672         unsigned long flags;
673         struct context_entry *context;
674
675         spin_lock_irqsave(&iommu->lock, flags);
676         if (!iommu->root_entry) {
677                 goto out;
678         }
679         for (i = 0; i < ROOT_ENTRY_NR; i++) {
680                 root = &iommu->root_entry[i];
681                 context = get_context_addr_from_root(root);
682                 if (context)
683                         free_pgtable_page(context);
684         }
685         free_pgtable_page(iommu->root_entry);
686         iommu->root_entry = NULL;
687 out:
688         spin_unlock_irqrestore(&iommu->lock, flags);
689 }
690
/*
 * Walk @domain's page table from the top level down to the level-1 (leaf)
 * entry covering @pfn, allocating any missing intermediate page-table pages
 * on the way.
 *
 * Returns the leaf entry (its presence is not checked), or NULL when a
 * page-table page could not be allocated.  BUGs when the domain has no pgd
 * or @pfn does not fit the domain's address width.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
                                      unsigned long pfn)
{
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
        struct dma_pte *parent, *pte = NULL;
        int level = agaw_to_level(domain->agaw);
        int offset;

        BUG_ON(!domain->pgd);
        /* the shift is only well-defined when addr_width < BITS_PER_LONG */
        BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
        parent = domain->pgd;

        while (level > 0) {
                void *tmp_page;

                offset = pfn_level_offset(pfn, level);
                pte = &parent[offset];
                /* reached the leaf table: hand back the slot as-is */
                if (level == 1)
                        break;

                if (!dma_pte_present(pte)) {
                        uint64_t pteval;

                        tmp_page = alloc_pgtable_page(domain->nid);

                        if (!tmp_page)
                                return NULL;

                        /* flush the zeroed table before publishing it in the parent entry */
                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
                        /* install only if the slot is still empty; non-zero return means we lost the race */
                        if (cmpxchg64(&pte->val, 0ULL, pteval)) {
                                /* Someone else set it while we were thinking; use theirs. */
                                free_pgtable_page(tmp_page);
                        } else {
                                /* NOTE(review): result is discarded here -- purpose unclear, confirm before removing */
                                dma_pte_addr(pte);
                                domain_flush_cache(domain, pte, sizeof(*pte));
                        }
                }
                /* descend into the next-level table */
                parent = phys_to_virt(dma_pte_addr(pte));
                level--;
        }

        return pte;
}
735
736 /* return address's pte at specific level */
737 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
738                                          unsigned long pfn,
739                                          int level)
740 {
741         struct dma_pte *parent, *pte = NULL;
742         int total = agaw_to_level(domain->agaw);
743         int offset;
744
745         parent = domain->pgd;
746         while (level <= total) {
747                 offset = pfn_level_offset(pfn, total);
748                 pte = &parent[offset];
749                 if (level == total)
750                         return pte;
751
752                 if (!dma_pte_present(pte))
753                         break;
754                 parent = phys_to_virt(dma_pte_addr(pte));
755                 total--;
756         }
757         return NULL;
758 }
759
/*
 * Clear the last-level (leaf) PTEs for the inclusive PFN range
 * [@start_pfn, @last_pfn]; a TLB flush should follow.
 *
 * Page-table pages themselves are not freed here.  BUGs when either PFN does
 * not fit the domain's address width or when @start_pfn > @last_pfn.
 */
static void dma_pte_clear_range(struct dmar_domain *domain,
                                unsigned long start_pfn,
                                unsigned long last_pfn)
{
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
        struct dma_pte *first_pte, *pte;

        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
        BUG_ON(start_pfn > last_pfn);

        /* we don't need lock here; nobody else touches the iova range */
        do {
                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
                if (!pte) {
                        /* no leaf table here: skip ahead to the next level-2 boundary */
                        start_pfn = align_to_level(start_pfn + 1, 2);
                        continue;
                }
                /* clear entries until the range ends or the leaf table page ends */
                do { 
                        dma_clear_pte(pte);
                        start_pfn++;
                        pte++;
                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));

                /* one flush covers the whole run of entries just cleared */
                domain_flush_cache(domain, first_pte,
                                   (void *)pte - (void *)first_pte);

        /* start_pfn == 0 here presumably means it wrapped around; stop then */
        } while (start_pfn && start_pfn <= last_pfn);
}
790
/*
 * Free page-table pages for the inclusive PFN range [@start_pfn, @last_pfn];
 * the last-level PTEs should already be cleared.
 *
 * Works upward from level 2.  At each level only entries whose whole span
 * lies inside the range are processed: the table page each one points to is
 * freed and the entry is cleared.  The pgd itself is freed only when the
 * range is exactly [0, DOMAIN_MAX_PFN(gaw)].
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
                                   unsigned long start_pfn,
                                   unsigned long last_pfn)
{
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
        struct dma_pte *first_pte, *pte;
        int total = agaw_to_level(domain->agaw);
        int level;
        unsigned long tmp;

        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
        BUG_ON(start_pfn > last_pfn);

        /* We don't need lock here; nobody else touches the iova range */
        level = 2;
        while (level <= total) {
                /* first PFN at which an entry of this level starts inside the range */
                tmp = align_to_level(start_pfn, level);

                /* If we can't even clear one PTE at this level, we're done */
                if (tmp + level_size(level) - 1 > last_pfn)
                        return;

                do {
                        first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
                        if (!pte) {
                                /* nothing at this level here: jump to the next boundary one level up */
                                tmp = align_to_level(tmp + 1, level + 1);
                                continue;
                        }
                        do {
                                if (dma_pte_present(pte)) {
                                        /* free the lower-level table, then drop the reference to it */
                                        free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
                                        dma_clear_pte(pte);
                                }
                                pte++;
                                tmp += level_size(level);
                        } while (!first_pte_in_page(pte) &&
                                 tmp + level_size(level) - 1 <= last_pfn);

                        /* one flush covers the run of entries visited above */
                        domain_flush_cache(domain, first_pte,
                                           (void *)pte - (void *)first_pte);
                        
                /* tmp == 0 here presumably means it wrapped around; stop then */
                } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
                level++;
        }
        /* free pgd */
        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
                free_pgtable_page(domain->pgd);
                domain->pgd = NULL;
        }
}
843
844 /* iommu handling */
845 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
846 {
847         struct root_entry *root;
848         unsigned long flags;
849
850         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
851         if (!root)
852                 return -ENOMEM;
853
854         __iommu_flush_cache(iommu, root, ROOT_SIZE);
855
856         spin_lock_irqsave(&iommu->lock, flags);
857         iommu->root_entry = root;
858         spin_unlock_irqrestore(&iommu->lock, flags);
859
860         return 0;
861 }
862
863 static void iommu_set_root_entry(struct intel_iommu *iommu)
864 {
865         void *addr;
866         u32 sts;
867         unsigned long flag;
868
869         addr = iommu->root_entry;
870
871         spin_lock_irqsave(&iommu->register_lock, flag);
872         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
873
874         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
875
876         /* Make sure hardware complete it */
877         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
878                       readl, (sts & DMA_GSTS_RTPS), sts);
879
880         spin_unlock_irqrestore(&iommu->register_lock, flag);
881 }
882
883 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
884 {
885         u32 val;
886         unsigned long flag;
887
888         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
889                 return;
890
891         spin_lock_irqsave(&iommu->register_lock, flag);
892         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
893
894         /* Make sure hardware complete it */
895         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
896                       readl, (!(val & DMA_GSTS_WBFS)), val);
897
898         spin_unlock_irqrestore(&iommu->register_lock, flag);
899 }
900
901 /* return value determine if we need a write buffer flush */
902 static void __iommu_flush_context(struct intel_iommu *iommu,
903                                   u16 did, u16 source_id, u8 function_mask,
904                                   u64 type)
905 {
906         u64 val = 0;
907         unsigned long flag;
908
909         switch (type) {
910         case DMA_CCMD_GLOBAL_INVL:
911                 val = DMA_CCMD_GLOBAL_INVL;
912                 break;
913         case DMA_CCMD_DOMAIN_INVL:
914                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
915                 break;
916         case DMA_CCMD_DEVICE_INVL:
917                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
918                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
919                 break;
920         default:
921                 BUG();
922         }
923         val |= DMA_CCMD_ICC;
924
925         spin_lock_irqsave(&iommu->register_lock, flag);
926         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
927
928         /* Make sure hardware complete it */
929         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
930                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
931
932         spin_unlock_irqrestore(&iommu->register_lock, flag);
933 }
934
935 /* return value determine if we need a write buffer flush */
936 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
937                                 u64 addr, unsigned int size_order, u64 type)
938 {
939         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
940         u64 val = 0, val_iva = 0;
941         unsigned long flag;
942
943         switch (type) {
944         case DMA_TLB_GLOBAL_FLUSH:
945                 /* global flush doesn't need set IVA_REG */
946                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
947                 break;
948         case DMA_TLB_DSI_FLUSH:
949                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
950                 break;
951         case DMA_TLB_PSI_FLUSH:
952                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
953                 /* Note: always flush non-leaf currently */
954                 val_iva = size_order | addr;
955                 break;
956         default:
957                 BUG();
958         }
959         /* Note: set drain read/write */
960 #if 0
961         /*
962          * This is probably to be super secure.. Looks like we can
963          * ignore it without any impact.
964          */
965         if (cap_read_drain(iommu->cap))
966                 val |= DMA_TLB_READ_DRAIN;
967 #endif
968         if (cap_write_drain(iommu->cap))
969                 val |= DMA_TLB_WRITE_DRAIN;
970
971         spin_lock_irqsave(&iommu->register_lock, flag);
972         /* Note: Only uses first TLB reg currently */
973         if (val_iva)
974                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
975         dmar_writeq(iommu->reg + tlb_offset + 8, val);
976
977         /* Make sure hardware complete it */
978         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
979                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
980
981         spin_unlock_irqrestore(&iommu->register_lock, flag);
982
983         /* check IOTLB invalidation granularity */
984         if (DMA_TLB_IAIG(val) == 0)
985                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
986         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
987                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
988                         (unsigned long long)DMA_TLB_IIRG(type),
989                         (unsigned long long)DMA_TLB_IAIG(val));
990 }
991
992 static struct device_domain_info *iommu_support_dev_iotlb(
993         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
994 {
995         int found = 0;
996         unsigned long flags;
997         struct device_domain_info *info;
998         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
999
1000         if (!ecap_dev_iotlb_support(iommu->ecap))
1001                 return NULL;
1002
1003         if (!iommu->qi)
1004                 return NULL;
1005
1006         spin_lock_irqsave(&device_domain_lock, flags);
1007         list_for_each_entry(info, &domain->devices, link)
1008                 if (info->bus == bus && info->devfn == devfn) {
1009                         found = 1;
1010                         break;
1011                 }
1012         spin_unlock_irqrestore(&device_domain_lock, flags);
1013
1014         if (!found || !info->dev)
1015                 return NULL;
1016
1017         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1018                 return NULL;
1019
1020         if (!dmar_find_matched_atsr_unit(info->dev))
1021                 return NULL;
1022
1023         info->iommu = iommu;
1024
1025         return info;
1026 }
1027
1028 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1029 {
1030         if (!info)
1031                 return;
1032
1033         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1034 }
1035
1036 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1037 {
1038         if (!info->dev || !pci_ats_enabled(info->dev))
1039                 return;
1040
1041         pci_disable_ats(info->dev);
1042 }
1043
1044 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1045                                   u64 addr, unsigned mask)
1046 {
1047         u16 sid, qdep;
1048         unsigned long flags;
1049         struct device_domain_info *info;
1050
1051         spin_lock_irqsave(&device_domain_lock, flags);
1052         list_for_each_entry(info, &domain->devices, link) {
1053                 if (!info->dev || !pci_ats_enabled(info->dev))
1054                         continue;
1055
1056                 sid = info->bus << 8 | info->devfn;
1057                 qdep = pci_ats_queue_depth(info->dev);
1058                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1059         }
1060         spin_unlock_irqrestore(&device_domain_lock, flags);
1061 }
1062
1063 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1064                                   unsigned long pfn, unsigned int pages, int map)
1065 {
1066         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1067         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1068
1069         BUG_ON(pages == 0);
1070
1071         /*
1072          * Fallback to domain selective flush if no PSI support or the size is
1073          * too big.
1074          * PSI requires page size to be 2 ^ x, and the base address is naturally
1075          * aligned to the size
1076          */
1077         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1078                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1079                                                 DMA_TLB_DSI_FLUSH);
1080         else
1081                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1082                                                 DMA_TLB_PSI_FLUSH);
1083
1084         /*
1085          * In caching mode, changes of pages from non-present to present require
1086          * flush. However, device IOTLB doesn't need to be flushed in this case.
1087          */
1088         if (!cap_caching_mode(iommu->cap) || !map)
1089                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1090 }
1091
1092 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1093 {
1094         u32 pmen;
1095         unsigned long flags;
1096
1097         spin_lock_irqsave(&iommu->register_lock, flags);
1098         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1099         pmen &= ~DMA_PMEN_EPM;
1100         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1101
1102         /* wait for the protected region status bit to clear */
1103         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1104                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1105
1106         spin_unlock_irqrestore(&iommu->register_lock, flags);
1107 }
1108
1109 static int iommu_enable_translation(struct intel_iommu *iommu)
1110 {
1111         u32 sts;
1112         unsigned long flags;
1113
1114         spin_lock_irqsave(&iommu->register_lock, flags);
1115         iommu->gcmd |= DMA_GCMD_TE;
1116         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1117
1118         /* Make sure hardware complete it */
1119         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1120                       readl, (sts & DMA_GSTS_TES), sts);
1121
1122         spin_unlock_irqrestore(&iommu->register_lock, flags);
1123         return 0;
1124 }
1125
1126 static int iommu_disable_translation(struct intel_iommu *iommu)
1127 {
1128         u32 sts;
1129         unsigned long flag;
1130
1131         spin_lock_irqsave(&iommu->register_lock, flag);
1132         iommu->gcmd &= ~DMA_GCMD_TE;
1133         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1134
1135         /* Make sure hardware complete it */
1136         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1137                       readl, (!(sts & DMA_GSTS_TES)), sts);
1138
1139         spin_unlock_irqrestore(&iommu->register_lock, flag);
1140         return 0;
1141 }
1142
1143
1144 static int iommu_init_domains(struct intel_iommu *iommu)
1145 {
1146         unsigned long ndomains;
1147         unsigned long nlongs;
1148
1149         ndomains = cap_ndoms(iommu->cap);
1150         pr_debug("IOMMU %d: Number of Domains supportd <%ld>\n", iommu->seq_id,
1151                         ndomains);
1152         nlongs = BITS_TO_LONGS(ndomains);
1153
1154         spin_lock_init(&iommu->lock);
1155
1156         /* TBD: there might be 64K domains,
1157          * consider other allocation for future chip
1158          */
1159         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1160         if (!iommu->domain_ids) {
1161                 printk(KERN_ERR "Allocating domain id array failed\n");
1162                 return -ENOMEM;
1163         }
1164         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1165                         GFP_KERNEL);
1166         if (!iommu->domains) {
1167                 printk(KERN_ERR "Allocating domain array failed\n");
1168                 return -ENOMEM;
1169         }
1170
1171         /*
1172          * if Caching mode is set, then invalid translations are tagged
1173          * with domainid 0. Hence we need to pre-allocate it.
1174          */
1175         if (cap_caching_mode(iommu->cap))
1176                 set_bit(0, iommu->domain_ids);
1177         return 0;
1178 }
1179
1180
1181 static void domain_exit(struct dmar_domain *domain);
1182 static void vm_domain_exit(struct dmar_domain *domain);
1183
1184 void free_dmar_iommu(struct intel_iommu *iommu)
1185 {
1186         struct dmar_domain *domain;
1187         int i;
1188         unsigned long flags;
1189
1190         if ((iommu->domains) && (iommu->domain_ids)) {
1191                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1192                         domain = iommu->domains[i];
1193                         clear_bit(i, iommu->domain_ids);
1194
1195                         spin_lock_irqsave(&domain->iommu_lock, flags);
1196                         if (--domain->iommu_count == 0) {
1197                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1198                                         vm_domain_exit(domain);
1199                                 else
1200                                         domain_exit(domain);
1201                         }
1202                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1203                 }
1204         }
1205
1206         if (iommu->gcmd & DMA_GCMD_TE)
1207                 iommu_disable_translation(iommu);
1208
1209         if (iommu->irq) {
1210                 irq_set_handler_data(iommu->irq, NULL);
1211                 /* This will mask the irq */
1212                 free_irq(iommu->irq, iommu);
1213                 destroy_irq(iommu->irq);
1214         }
1215
1216         kfree(iommu->domains);
1217         kfree(iommu->domain_ids);
1218
1219         g_iommus[iommu->seq_id] = NULL;
1220
1221         /* if all iommus are freed, free g_iommus */
1222         for (i = 0; i < g_num_of_iommus; i++) {
1223                 if (g_iommus[i])
1224                         break;
1225         }
1226
1227         if (i == g_num_of_iommus)
1228                 kfree(g_iommus);
1229
1230         /* free context mapping */
1231         free_context_table(iommu);
1232 }
1233
1234 static struct dmar_domain *alloc_domain(void)
1235 {
1236         struct dmar_domain *domain;
1237
1238         domain = alloc_domain_mem();
1239         if (!domain)
1240                 return NULL;
1241
1242         domain->nid = -1;
1243         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1244         domain->flags = 0;
1245
1246         return domain;
1247 }
1248
1249 static int iommu_attach_domain(struct dmar_domain *domain,
1250                                struct intel_iommu *iommu)
1251 {
1252         int num;
1253         unsigned long ndomains;
1254         unsigned long flags;
1255
1256         ndomains = cap_ndoms(iommu->cap);
1257
1258         spin_lock_irqsave(&iommu->lock, flags);
1259
1260         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1261         if (num >= ndomains) {
1262                 spin_unlock_irqrestore(&iommu->lock, flags);
1263                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1264                 return -ENOMEM;
1265         }
1266
1267         domain->id = num;
1268         set_bit(num, iommu->domain_ids);
1269         set_bit(iommu->seq_id, &domain->iommu_bmp);
1270         iommu->domains[num] = domain;
1271         spin_unlock_irqrestore(&iommu->lock, flags);
1272
1273         return 0;
1274 }
1275
1276 static void iommu_detach_domain(struct dmar_domain *domain,
1277                                 struct intel_iommu *iommu)
1278 {
1279         unsigned long flags;
1280         int num, ndomains;
1281         int found = 0;
1282
1283         spin_lock_irqsave(&iommu->lock, flags);
1284         ndomains = cap_ndoms(iommu->cap);
1285         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1286                 if (iommu->domains[num] == domain) {
1287                         found = 1;
1288                         break;
1289                 }
1290         }
1291
1292         if (found) {
1293                 clear_bit(num, iommu->domain_ids);
1294                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1295                 iommu->domains[num] = NULL;
1296         }
1297         spin_unlock_irqrestore(&iommu->lock, flags);
1298 }
1299
1300 static struct iova_domain reserved_iova_list;
1301 static struct lock_class_key reserved_rbtree_key;
1302
1303 static int dmar_init_reserved_ranges(void)
1304 {
1305         struct pci_dev *pdev = NULL;
1306         struct iova *iova;
1307         int i;
1308
1309         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1310
1311         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1312                 &reserved_rbtree_key);
1313
1314         /* IOAPIC ranges shouldn't be accessed by DMA */
1315         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1316                 IOVA_PFN(IOAPIC_RANGE_END));
1317         if (!iova) {
1318                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1319                 return -ENODEV;
1320         }
1321
1322         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1323         for_each_pci_dev(pdev) {
1324                 struct resource *r;
1325
1326                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1327                         r = &pdev->resource[i];
1328                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1329                                 continue;
1330                         iova = reserve_iova(&reserved_iova_list,
1331                                             IOVA_PFN(r->start),
1332                                             IOVA_PFN(r->end));
1333                         if (!iova) {
1334                                 printk(KERN_ERR "Reserve iova failed\n");
1335                                 return -ENODEV;
1336                         }
1337                 }
1338         }
1339         return 0;
1340 }
1341
1342 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1343 {
1344         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1345 }
1346
/*
 * Round a guest address width up to the next value of the form 12 + 9*k
 * and cap the result at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
        int rem = (gaw - 12) % 9;
        int agaw = rem ? gaw + 9 - rem : gaw;

        return agaw > 64 ? 64 : agaw;
}
1360
1361 static int domain_init(struct dmar_domain *domain, int guest_width)
1362 {
1363         struct intel_iommu *iommu;
1364         int adjust_width, agaw;
1365         unsigned long sagaw;
1366
1367         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1368         spin_lock_init(&domain->iommu_lock);
1369
1370         domain_reserve_special_ranges(domain);
1371
1372         /* calculate AGAW */
1373         iommu = domain_get_iommu(domain);
1374         if (guest_width > cap_mgaw(iommu->cap))
1375                 guest_width = cap_mgaw(iommu->cap);
1376         domain->gaw = guest_width;
1377         adjust_width = guestwidth_to_adjustwidth(guest_width);
1378         agaw = width_to_agaw(adjust_width);
1379         sagaw = cap_sagaw(iommu->cap);
1380         if (!test_bit(agaw, &sagaw)) {
1381                 /* hardware doesn't support it, choose a bigger one */
1382                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1383                 agaw = find_next_bit(&sagaw, 5, agaw);
1384                 if (agaw >= 5)
1385                         return -ENODEV;
1386         }
1387         domain->agaw = agaw;
1388         INIT_LIST_HEAD(&domain->devices);
1389
1390         if (ecap_coherent(iommu->ecap))
1391                 domain->iommu_coherency = 1;
1392         else
1393                 domain->iommu_coherency = 0;
1394
1395         if (ecap_sc_support(iommu->ecap))
1396                 domain->iommu_snooping = 1;
1397         else
1398                 domain->iommu_snooping = 0;
1399
1400         domain->iommu_count = 1;
1401         domain->nid = iommu->node;
1402
1403         /* always allocate the top pgd */
1404         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1405         if (!domain->pgd)
1406                 return -ENOMEM;
1407         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1408         return 0;
1409 }
1410
1411 static void domain_exit(struct dmar_domain *domain)
1412 {
1413         struct dmar_drhd_unit *drhd;
1414         struct intel_iommu *iommu;
1415
1416         /* Domain 0 is reserved, so dont process it */
1417         if (!domain)
1418                 return;
1419
1420         domain_remove_dev_info(domain);
1421         /* destroy iovas */
1422         put_iova_domain(&domain->iovad);
1423
1424         /* clear ptes */
1425         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1426
1427         /* free page tables */
1428         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1429
1430         for_each_active_iommu(iommu, drhd)
1431                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1432                         iommu_detach_domain(domain, iommu);
1433
1434         free_domain_mem(domain);
1435 }
1436
1437 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1438                                  u8 bus, u8 devfn, int translation)
1439 {
1440         struct context_entry *context;
1441         unsigned long flags;
1442         struct intel_iommu *iommu;
1443         struct dma_pte *pgd;
1444         unsigned long num;
1445         unsigned long ndomains;
1446         int id;
1447         int agaw;
1448         struct device_domain_info *info = NULL;
1449
1450         pr_debug("Set context mapping for %02x:%02x.%d\n",
1451                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1452
1453         BUG_ON(!domain->pgd);
1454         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1455                translation != CONTEXT_TT_MULTI_LEVEL);
1456
1457         iommu = device_to_iommu(segment, bus, devfn);
1458         if (!iommu)
1459                 return -ENODEV;
1460
1461         context = device_to_context_entry(iommu, bus, devfn);
1462         if (!context)
1463                 return -ENOMEM;
1464         spin_lock_irqsave(&iommu->lock, flags);
1465         if (context_present(context)) {
1466                 spin_unlock_irqrestore(&iommu->lock, flags);
1467                 return 0;
1468         }
1469
1470         id = domain->id;
1471         pgd = domain->pgd;
1472
1473         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1474             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1475                 int found = 0;
1476
1477                 /* find an available domain id for this device in iommu */
1478                 ndomains = cap_ndoms(iommu->cap);
1479                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1480                         if (iommu->domains[num] == domain) {
1481                                 id = num;
1482                                 found = 1;
1483                                 break;
1484                         }
1485                 }
1486
1487                 if (found == 0) {
1488                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1489                         if (num >= ndomains) {
1490                                 spin_unlock_irqrestore(&iommu->lock, flags);
1491                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1492                                 return -EFAULT;
1493                         }
1494
1495                         set_bit(num, iommu->domain_ids);
1496                         iommu->domains[num] = domain;
1497                         id = num;
1498                 }
1499
1500                 /* Skip top levels of page tables for
1501                  * iommu which has less agaw than default.
1502                  * Unnecessary for PT mode.
1503                  */
1504                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1505                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1506                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1507                                 if (!dma_pte_present(pgd)) {
1508                                         spin_unlock_irqrestore(&iommu->lock, flags);
1509                                         return -ENOMEM;
1510                                 }
1511                         }
1512                 }
1513         }
1514
1515         context_set_domain_id(context, id);
1516
1517         if (translation != CONTEXT_TT_PASS_THROUGH) {
1518                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1519                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1520                                      CONTEXT_TT_MULTI_LEVEL;
1521         }
1522         /*
1523          * In pass through mode, AW must be programmed to indicate the largest
1524          * AGAW value supported by hardware. And ASR is ignored by hardware.
1525          */
1526         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1527                 context_set_address_width(context, iommu->msagaw);
1528         else {
1529                 context_set_address_root(context, virt_to_phys(pgd));
1530                 context_set_address_width(context, iommu->agaw);
1531         }
1532
1533         context_set_translation_type(context, translation);
1534         context_set_fault_enable(context);
1535         context_set_present(context);
1536         domain_flush_cache(domain, context, sizeof(*context));
1537
1538         /*
1539          * It's a non-present to present mapping. If hardware doesn't cache
1540          * non-present entry we only need to flush the write-buffer. If the
1541          * _does_ cache non-present entries, then it does so in the special
1542          * domain #0, which we have to flush:
1543          */
1544         if (cap_caching_mode(iommu->cap)) {
1545                 iommu->flush.flush_context(iommu, 0,
1546                                            (((u16)bus) << 8) | devfn,
1547                                            DMA_CCMD_MASK_NOBIT,
1548                                            DMA_CCMD_DEVICE_INVL);
1549                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1550         } else {
1551                 iommu_flush_write_buffer(iommu);
1552         }
1553         iommu_enable_dev_iotlb(info);
1554         spin_unlock_irqrestore(&iommu->lock, flags);
1555
1556         spin_lock_irqsave(&domain->iommu_lock, flags);
1557         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1558                 domain->iommu_count++;
1559                 if (domain->iommu_count == 1)
1560                         domain->nid = iommu->node;
1561                 domain_update_iommu_cap(domain);
1562         }
1563         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1564         return 0;
1565 }
1566
1567 static int
1568 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1569                         int translation)
1570 {
1571         int ret;
1572         struct pci_dev *tmp, *parent;
1573
1574         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1575                                          pdev->bus->number, pdev->devfn,
1576                                          translation);
1577         if (ret)
1578                 return ret;
1579
1580         /* dependent device mapping */
1581         tmp = pci_find_upstream_pcie_bridge(pdev);
1582         if (!tmp)
1583                 return 0;
1584         /* Secondary interface's bus number and devfn 0 */
1585         parent = pdev->bus->self;
1586         while (parent != tmp) {
1587                 ret = domain_context_mapping_one(domain,
1588                                                  pci_domain_nr(parent->bus),
1589                                                  parent->bus->number,
1590                                                  parent->devfn, translation);
1591                 if (ret)
1592                         return ret;
1593                 parent = parent->bus->self;
1594         }
1595         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1596                 return domain_context_mapping_one(domain,
1597                                         pci_domain_nr(tmp->subordinate),
1598                                         tmp->subordinate->number, 0,
1599                                         translation);
1600         else /* this is a legacy PCI bridge */
1601                 return domain_context_mapping_one(domain,
1602                                                   pci_domain_nr(tmp->bus),
1603                                                   tmp->bus->number,
1604                                                   tmp->devfn,
1605                                                   translation);
1606 }
1607
1608 static int domain_context_mapped(struct pci_dev *pdev)
1609 {
1610         int ret;
1611         struct pci_dev *tmp, *parent;
1612         struct intel_iommu *iommu;
1613
1614         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1615                                 pdev->devfn);
1616         if (!iommu)
1617                 return -ENODEV;
1618
1619         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1620         if (!ret)
1621                 return ret;
1622         /* dependent device mapping */
1623         tmp = pci_find_upstream_pcie_bridge(pdev);
1624         if (!tmp)
1625                 return ret;
1626         /* Secondary interface's bus number and devfn 0 */
1627         parent = pdev->bus->self;
1628         while (parent != tmp) {
1629                 ret = device_context_mapped(iommu, parent->bus->number,
1630                                             parent->devfn);
1631                 if (!ret)
1632                         return ret;
1633                 parent = parent->bus->self;
1634         }
1635         if (pci_is_pcie(tmp))
1636                 return device_context_mapped(iommu, tmp->subordinate->number,
1637                                              0);
1638         else
1639                 return device_context_mapped(iommu, tmp->bus->number,
1640                                              tmp->devfn);
1641 }
1642
1643 /* Returns a number of VTD pages, but aligned to MM page size */
1644 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1645                                             size_t size)
1646 {
1647         host_addr &= ~PAGE_MASK;
1648         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1649 }
1650
1651 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1652                             struct scatterlist *sg, unsigned long phys_pfn,
1653                             unsigned long nr_pages, int prot)
1654 {
1655         struct dma_pte *first_pte = NULL, *pte = NULL;
1656         phys_addr_t uninitialized_var(pteval);
1657         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1658         unsigned long sg_res;
1659
1660         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1661
1662         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1663                 return -EINVAL;
1664
1665         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1666
1667         if (sg)
1668                 sg_res = 0;
1669         else {
1670                 sg_res = nr_pages + 1;
1671                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1672         }
1673
1674         while (nr_pages--) {
1675                 uint64_t tmp;
1676
1677                 if (!sg_res) {
1678                         sg_res = aligned_nrpages(sg->offset, sg->length);
1679                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1680                         sg->dma_length = sg->length;
1681                         pteval = page_to_phys(sg_page(sg)) | prot;
1682                 }
1683                 if (!pte) {
1684                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1685                         if (!pte)
1686                                 return -ENOMEM;
1687                 }
1688                 /* We don't need lock here, nobody else
1689                  * touches the iova range
1690                  */
1691                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1692                 if (tmp) {
1693                         static int dumps = 5;
1694                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1695                                iov_pfn, tmp, (unsigned long long)pteval);
1696                         if (dumps) {
1697                                 dumps--;
1698                                 debug_dma_dump_mappings(NULL);
1699                         }
1700                         WARN_ON(1);
1701                 }
1702                 pte++;
1703                 if (!nr_pages || first_pte_in_page(pte)) {
1704                         domain_flush_cache(domain, first_pte,
1705                                            (void *)pte - (void *)first_pte);
1706                         pte = NULL;
1707                 }
1708                 iov_pfn++;
1709                 pteval += VTD_PAGE_SIZE;
1710                 sg_res--;
1711                 if (!sg_res)
1712                         sg = sg_next(sg);
1713         }
1714         return 0;
1715 }
1716
/*
 * Map @nr_pages pages described by scatterlist @sg at IOVA pfn @iov_pfn.
 * Thin wrapper around __domain_mapping() with no physical pfn.
 */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                                    struct scatterlist *sg, unsigned long nr_pages,
                                    int prot)
{
        const unsigned long no_phys_pfn = 0;

        return __domain_mapping(domain, iov_pfn, sg, no_phys_pfn, nr_pages, prot);
}
1723
1724 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1725                                      unsigned long phys_pfn, unsigned long nr_pages,
1726                                      int prot)
1727 {
1728         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1729 }
1730
1731 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1732 {
1733         if (!iommu)
1734                 return;
1735
1736         clear_context_table(iommu, bus, devfn);
1737         iommu->flush.flush_context(iommu, 0, 0, 0,
1738                                            DMA_CCMD_GLOBAL_INVL);
1739         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1740 }
1741
1742 static void domain_remove_dev_info(struct dmar_domain *domain)
1743 {
1744         struct device_domain_info *info;
1745         unsigned long flags;
1746         struct intel_iommu *iommu;
1747
1748         spin_lock_irqsave(&device_domain_lock, flags);
1749         while (!list_empty(&domain->devices)) {
1750                 info = list_entry(domain->devices.next,
1751                         struct device_domain_info, link);
1752                 list_del(&info->link);
1753                 list_del(&info->global);
1754                 if (info->dev)
1755                         info->dev->dev.archdata.iommu = NULL;
1756                 spin_unlock_irqrestore(&device_domain_lock, flags);
1757
1758                 iommu_disable_dev_iotlb(info);
1759                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1760                 iommu_detach_dev(iommu, info->bus, info->devfn);
1761                 free_devinfo_mem(info);
1762
1763                 spin_lock_irqsave(&device_domain_lock, flags);
1764         }
1765         spin_unlock_irqrestore(&device_domain_lock, flags);
1766 }
1767
1768 /*
1769  * find_domain
1770  * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1771  */
1772 static struct dmar_domain *
1773 find_domain(struct pci_dev *pdev)
1774 {
1775         struct device_domain_info *info;
1776
1777         /* No lock here, assumes no domain exit in normal case */
1778         info = pdev->dev.archdata.iommu;
1779         if (info)
1780                 return info->domain;
1781         return NULL;
1782 }
1783
1784 /* domain is initialized */
1785 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1786 {
1787         struct dmar_domain *domain, *found = NULL;
1788         struct intel_iommu *iommu;
1789         struct dmar_drhd_unit *drhd;
1790         struct device_domain_info *info, *tmp;
1791         struct pci_dev *dev_tmp;
1792         unsigned long flags;
1793         int bus = 0, devfn = 0;
1794         int segment;
1795         int ret;
1796
1797         domain = find_domain(pdev);
1798         if (domain)
1799                 return domain;
1800
1801         segment = pci_domain_nr(pdev->bus);
1802
1803         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1804         if (dev_tmp) {
1805                 if (pci_is_pcie(dev_tmp)) {
1806                         bus = dev_tmp->subordinate->number;
1807                         devfn = 0;
1808                 } else {
1809                         bus = dev_tmp->bus->number;
1810                         devfn = dev_tmp->devfn;
1811                 }
1812                 spin_lock_irqsave(&device_domain_lock, flags);
1813                 list_for_each_entry(info, &device_domain_list, global) {
1814                         if (info->segment == segment &&
1815                             info->bus == bus && info->devfn == devfn) {
1816                                 found = info->domain;
1817                                 break;
1818                         }
1819                 }
1820                 spin_unlock_irqrestore(&device_domain_lock, flags);
1821                 /* pcie-pci bridge already has a domain, uses it */
1822                 if (found) {
1823                         domain = found;
1824                         goto found_domain;
1825                 }
1826         }
1827
1828         domain = alloc_domain();
1829         if (!domain)
1830                 goto error;
1831
1832         /* Allocate new domain for the device */
1833         drhd = dmar_find_matched_drhd_unit(pdev);
1834         if (!drhd) {
1835                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1836                         pci_name(pdev));
1837                 return NULL;
1838         }
1839         iommu = drhd->iommu;
1840
1841         ret = iommu_attach_domain(domain, iommu);
1842         if (ret) {
1843                 free_domain_mem(domain);
1844                 goto error;
1845         }
1846
1847         if (domain_init(domain, gaw)) {
1848                 domain_exit(domain);
1849                 goto error;
1850         }
1851
1852         /* register pcie-to-pci device */
1853         if (dev_tmp) {
1854                 info = alloc_devinfo_mem();
1855                 if (!info) {
1856                         domain_exit(domain);
1857                         goto error;
1858                 }
1859                 info->segment = segment;
1860                 info->bus = bus;
1861                 info->devfn = devfn;
1862                 info->dev = NULL;
1863                 info->domain = domain;
1864                 /* This domain is shared by devices under p2p bridge */
1865                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1866
1867                 /* pcie-to-pci bridge already has a domain, uses it */
1868                 found = NULL;
1869                 spin_lock_irqsave(&device_domain_lock, flags);
1870                 list_for_each_entry(tmp, &device_domain_list, global) {
1871                         if (tmp->segment == segment &&
1872                             tmp->bus == bus && tmp->devfn == devfn) {
1873                                 found = tmp->domain;
1874                                 break;
1875                         }
1876                 }
1877                 if (found) {
1878                         spin_unlock_irqrestore(&device_domain_lock, flags);
1879                         free_devinfo_mem(info);
1880                         domain_exit(domain);
1881                         domain = found;
1882                 } else {
1883                         list_add(&info->link, &domain->devices);
1884                         list_add(&info->global, &device_domain_list);
1885                         spin_unlock_irqrestore(&device_domain_lock, flags);
1886                 }
1887         }
1888
1889 found_domain:
1890         info = alloc_devinfo_mem();
1891         if (!info)
1892                 goto error;
1893         info->segment = segment;
1894         info->bus = pdev->bus->number;
1895         info->devfn = pdev->devfn;
1896         info->dev = pdev;
1897         info->domain = domain;
1898         spin_lock_irqsave(&device_domain_lock, flags);
1899         /* somebody is fast */
1900         found = find_domain(pdev);
1901         if (found != NULL) {
1902                 spin_unlock_irqrestore(&device_domain_lock, flags);
1903                 if (found != domain) {
1904                         domain_exit(domain);
1905                         domain = found;
1906                 }
1907                 free_devinfo_mem(info);
1908                 return domain;
1909         }
1910         list_add(&info->link, &domain->devices);
1911         list_add(&info->global, &device_domain_list);
1912         pdev->dev.archdata.iommu = info;
1913         spin_unlock_irqrestore(&device_domain_lock, flags);
1914         return domain;
1915 error:
1916         /* recheck it here, maybe others set it */
1917         return find_domain(pdev);
1918 }
1919
/*
 * Bitmask of IDENTMAP_* flags; when zero no device is placed in the
 * static identity (si_domain) mapping.
 */
static int iommu_identity_mapping;
/* set in init_dmars() when iommu_pass_through is set */
#define IDENTMAP_ALL            1
/* identity-map devices for which IS_GFX_DEVICE() is true */
#define IDENTMAP_GFX            2
/* identity-map devices for which IS_AZALIA() is true */
#define IDENTMAP_AZALIA         4
1924
1925 static int iommu_domain_identity_map(struct dmar_domain *domain,
1926                                      unsigned long long start,
1927                                      unsigned long long end)
1928 {
1929         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1930         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1931
1932         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1933                           dma_to_mm_pfn(last_vpfn))) {
1934                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1935                 return -ENOMEM;
1936         }
1937
1938         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1939                  start, end, domain->id);
1940         /*
1941          * RMRR range might have overlap with physical memory range,
1942          * clear it first
1943          */
1944         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1945
1946         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1947                                   last_vpfn - first_vpfn + 1,
1948                                   DMA_PTE_READ|DMA_PTE_WRITE);
1949 }
1950
1951 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1952                                       unsigned long long start,
1953                                       unsigned long long end)
1954 {
1955         struct dmar_domain *domain;
1956         int ret;
1957
1958         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1959         if (!domain)
1960                 return -ENOMEM;
1961
1962         /* For _hardware_ passthrough, don't bother. But for software
1963            passthrough, we do it anyway -- it may indicate a memory
1964            range which is reserved in E820, so which didn't get set
1965            up to start with in si_domain */
1966         if (domain == si_domain && hw_pass_through) {
1967                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1968                        pci_name(pdev), start, end);
1969                 return 0;
1970         }
1971
1972         printk(KERN_INFO
1973                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1974                pci_name(pdev), start, end);
1975         
1976         if (end < start) {
1977                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1978                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1979                         dmi_get_system_info(DMI_BIOS_VENDOR),
1980                         dmi_get_system_info(DMI_BIOS_VERSION),
1981                      dmi_get_system_info(DMI_PRODUCT_VERSION));
1982                 ret = -EIO;
1983                 goto error;
1984         }
1985
1986         if (end >> agaw_to_width(domain->agaw)) {
1987                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1988                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1989                      agaw_to_width(domain->agaw),
1990                      dmi_get_system_info(DMI_BIOS_VENDOR),
1991                      dmi_get_system_info(DMI_BIOS_VERSION),
1992                      dmi_get_system_info(DMI_PRODUCT_VERSION));
1993                 ret = -EIO;
1994                 goto error;
1995         }
1996
1997         ret = iommu_domain_identity_map(domain, start, end);
1998         if (ret)
1999                 goto error;
2000
2001         /* context entry init */
2002         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2003         if (ret)
2004                 goto error;
2005
2006         return 0;
2007
2008  error:
2009         domain_exit(domain);
2010         return ret;
2011 }
2012
2013 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2014         struct pci_dev *pdev)
2015 {
2016         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2017                 return 0;
2018         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2019                 rmrr->end_address + 1);
2020 }
2021
#ifdef CONFIG_DMAR_FLOPPY_WA
/*
 * Floppy workaround: give the first ISA bridge found a 0-16MiB identity
 * mapping.  A failure is only logged.
 *
 * iommu_prepare_identity_map() takes an inclusive end address, so the
 * last byte of the range (16MiB - 1) is passed; passing 16MiB itself
 * also mapped the page that starts at 16MiB.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);

	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
		       "floppy might not work\n");

}
#else
/* Workaround not configured: nothing to do. */
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_DMAR_FLOPPY_WA */
2046
2047 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2048
2049 static int __init si_domain_work_fn(unsigned long start_pfn,
2050                                     unsigned long end_pfn, void *datax)
2051 {
2052         int *ret = datax;
2053
2054         *ret = iommu_domain_identity_map(si_domain,
2055                                          (uint64_t)start_pfn << PAGE_SHIFT,
2056                                          (uint64_t)end_pfn << PAGE_SHIFT);
2057         return *ret;
2058
2059 }
2060
2061 static int __init si_domain_init(int hw)
2062 {
2063         struct dmar_drhd_unit *drhd;
2064         struct intel_iommu *iommu;
2065         int nid, ret = 0;
2066
2067         si_domain = alloc_domain();
2068         if (!si_domain)
2069                 return -EFAULT;
2070
2071         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2072
2073         for_each_active_iommu(iommu, drhd) {
2074                 ret = iommu_attach_domain(si_domain, iommu);
2075                 if (ret) {
2076                         domain_exit(si_domain);
2077                         return -EFAULT;
2078                 }
2079         }
2080
2081         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2082                 domain_exit(si_domain);
2083                 return -EFAULT;
2084         }
2085
2086         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2087
2088         if (hw)
2089                 return 0;
2090
2091         for_each_online_node(nid) {
2092                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2093                 if (ret)
2094                         return ret;
2095         }
2096
2097         return 0;
2098 }
2099
2100 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2101                                           struct pci_dev *pdev);
2102 static int identity_mapping(struct pci_dev *pdev)
2103 {
2104         struct device_domain_info *info;
2105
2106         if (likely(!iommu_identity_mapping))
2107                 return 0;
2108
2109
2110         list_for_each_entry(info, &si_domain->devices, link)
2111                 if (info->dev == pdev)
2112                         return 1;
2113         return 0;
2114 }
2115
2116 static int domain_add_dev_info(struct dmar_domain *domain,
2117                                struct pci_dev *pdev,
2118                                int translation)
2119 {
2120         struct device_domain_info *info;
2121         unsigned long flags;
2122         int ret;
2123
2124         info = alloc_devinfo_mem();
2125         if (!info)
2126                 return -ENOMEM;
2127
2128         ret = domain_context_mapping(domain, pdev, translation);
2129         if (ret) {
2130                 free_devinfo_mem(info);
2131                 return ret;
2132         }
2133
2134         info->segment = pci_domain_nr(pdev->bus);
2135         info->bus = pdev->bus->number;
2136         info->devfn = pdev->devfn;
2137         info->dev = pdev;
2138         info->domain = domain;
2139
2140         spin_lock_irqsave(&device_domain_lock, flags);
2141         list_add(&info->link, &domain->devices);
2142         list_add(&info->global, &device_domain_list);
2143         pdev->dev.archdata.iommu = info;
2144         spin_unlock_irqrestore(&device_domain_lock, flags);
2145
2146         return 0;
2147 }
2148
2149 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2150 {
2151         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2152                 return 1;
2153
2154         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2155                 return 1;
2156
2157         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2158                 return 0;
2159
2160         /*
2161          * We want to start off with all devices in the 1:1 domain, and
2162          * take them out later if we find they can't access all of memory.
2163          *
2164          * However, we can't do this for PCI devices behind bridges,
2165          * because all PCI devices behind the same bridge will end up
2166          * with the same source-id on their transactions.
2167          *
2168          * Practically speaking, we can't change things around for these
2169          * devices at run-time, because we can't be sure there'll be no
2170          * DMA transactions in flight for any of their siblings.
2171          * 
2172          * So PCI devices (unless they're on the root bus) as well as
2173          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2174          * the 1:1 domain, just in _case_ one of their siblings turns out
2175          * not to be able to map all of memory.
2176          */
2177         if (!pci_is_pcie(pdev)) {
2178                 if (!pci_is_root_bus(pdev->bus))
2179                         return 0;
2180                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2181                         return 0;
2182         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2183                 return 0;
2184
2185         /* 
2186          * At boot time, we don't yet know if devices will be 64-bit capable.
2187          * Assume that they will -- if they turn out not to be, then we can 
2188          * take them out of the 1:1 domain later.
2189          */
2190         if (!startup)
2191                 return pdev->dma_mask > DMA_BIT_MASK(32);
2192
2193         return 1;
2194 }
2195
2196 static int __init iommu_prepare_static_identity_mapping(int hw)
2197 {
2198         struct pci_dev *pdev = NULL;
2199         int ret;
2200
2201         ret = si_domain_init(hw);
2202         if (ret)
2203                 return -EFAULT;
2204
2205         for_each_pci_dev(pdev) {
2206                 if (iommu_should_identity_map(pdev, 1)) {
2207                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2208                                hw ? "hardware" : "software", pci_name(pdev));
2209
2210                         ret = domain_add_dev_info(si_domain, pdev,
2211                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2212                                                      CONTEXT_TT_MULTI_LEVEL);
2213                         if (ret)
2214                                 return ret;
2215                 }
2216         }
2217
2218         return 0;
2219 }
2220
2221 static int __init init_dmars(int force_on)
2222 {
2223         struct dmar_drhd_unit *drhd;
2224         struct dmar_rmrr_unit *rmrr;
2225         struct pci_dev *pdev;
2226         struct intel_iommu *iommu;
2227         int i, ret;
2228
2229         /*
2230          * for each drhd
2231          *    allocate root
2232          *    initialize and program root entry to not present
2233          * endfor
2234          */
2235         for_each_drhd_unit(drhd) {
2236                 g_num_of_iommus++;
2237                 /*
2238                  * lock not needed as this is only incremented in the single
2239                  * threaded kernel __init code path all other access are read
2240                  * only
2241                  */
2242         }
2243
2244         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2245                         GFP_KERNEL);
2246         if (!g_iommus) {
2247                 printk(KERN_ERR "Allocating global iommu array failed\n");
2248                 ret = -ENOMEM;
2249                 goto error;
2250         }
2251
2252         deferred_flush = kzalloc(g_num_of_iommus *
2253                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2254         if (!deferred_flush) {
2255                 ret = -ENOMEM;
2256                 goto error;
2257         }
2258
2259         for_each_drhd_unit(drhd) {
2260                 if (drhd->ignored)
2261                         continue;
2262
2263                 iommu = drhd->iommu;
2264                 g_iommus[iommu->seq_id] = iommu;
2265
2266                 ret = iommu_init_domains(iommu);
2267                 if (ret)
2268                         goto error;
2269
2270                 /*
2271                  * TBD:
2272                  * we could share the same root & context tables
2273                  * among all IOMMU's. Need to Split it later.
2274                  */
2275                 ret = iommu_alloc_root_entry(iommu);
2276                 if (ret) {
2277                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2278                         goto error;
2279                 }
2280                 if (!ecap_pass_through(iommu->ecap))
2281                         hw_pass_through = 0;
2282         }
2283
2284         /*
2285          * Start from the sane iommu hardware state.
2286          */
2287         for_each_drhd_unit(drhd) {
2288                 if (drhd->ignored)
2289                         continue;
2290
2291                 iommu = drhd->iommu;
2292
2293                 /*
2294                  * If the queued invalidation is already initialized by us
2295                  * (for example, while enabling interrupt-remapping) then
2296                  * we got the things already rolling from a sane state.
2297                  */
2298                 if (iommu->qi)
2299                         continue;
2300
2301                 /*
2302                  * Clear any previous faults.
2303                  */
2304                 dmar_fault(-1, iommu);
2305                 /*
2306                  * Disable queued invalidation if supported and already enabled
2307                  * before OS handover.
2308                  */
2309                 dmar_disable_qi(iommu);
2310         }
2311
2312         for_each_drhd_unit(drhd) {
2313                 if (drhd->ignored)
2314                         continue;
2315
2316                 iommu = drhd->iommu;
2317
2318                 if (dmar_enable_qi(iommu)) {
2319                         /*
2320                          * Queued Invalidate not enabled, use Register Based
2321                          * Invalidate
2322                          */
2323                         iommu->flush.flush_context = __iommu_flush_context;
2324                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2325                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2326                                "invalidation\n",
2327                                 iommu->seq_id,
2328                                (unsigned long long)drhd->reg_base_addr);
2329                 } else {
2330                         iommu->flush.flush_context = qi_flush_context;
2331                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2332                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2333                                "invalidation\n",
2334                                 iommu->seq_id,
2335                                (unsigned long long)drhd->reg_base_addr);
2336                 }
2337         }
2338
2339         if (iommu_pass_through)
2340                 iommu_identity_mapping |= IDENTMAP_ALL;
2341
2342 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2343         iommu_identity_mapping |= IDENTMAP_GFX;
2344 #endif
2345
2346         check_tylersburg_isoch();
2347
2348         /*
2349          * If pass through is not set or not enabled, setup context entries for
2350          * identity mappings for rmrr, gfx, and isa and may fall back to static
2351          * identity mapping if iommu_identity_mapping is set.
2352          */
2353         if (iommu_identity_mapping) {
2354                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2355                 if (ret) {
2356                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2357                         goto error;
2358                 }
2359         }
2360         /*
2361          * For each rmrr
2362          *   for each dev attached to rmrr
2363          *   do
2364          *     locate drhd for dev, alloc domain for dev
2365          *     allocate free domain
2366          *     allocate page table entries for rmrr
2367          *     if context not allocated for bus
2368          *           allocate and init context
2369          *           set present in root table for this bus
2370          *     init context with domain, translation etc
2371          *    endfor
2372          * endfor
2373          */
2374         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2375         for_each_rmrr_units(rmrr) {
2376                 for (i = 0; i < rmrr->devices_cnt; i++) {
2377                         pdev = rmrr->devices[i];
2378                         /*
2379                          * some BIOS lists non-exist devices in DMAR
2380                          * table.
2381                          */
2382                         if (!pdev)
2383                                 continue;
2384                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2385                         if (ret)
2386                                 printk(KERN_ERR
2387                                        "IOMMU: mapping reserved region failed\n");
2388                 }
2389         }
2390
2391         iommu_prepare_isa();
2392
2393         /*
2394          * for each drhd
2395          *   enable fault log
2396          *   global invalidate context cache
2397          *   global invalidate iotlb
2398          *   enable translation
2399          */
2400         for_each_drhd_unit(drhd) {
2401                 if (drhd->ignored) {
2402                         /*
2403                          * we always have to disable PMRs or DMA may fail on
2404                          * this device
2405                          */
2406                         if (force_on)
2407                                 iommu_disable_protect_mem_regions(drhd->iommu);
2408                         continue;
2409                 }
2410                 iommu = drhd->iommu;
2411
2412                 iommu_flush_write_buffer(iommu);
2413
2414                 ret = dmar_set_interrupt(iommu);
2415                 if (ret)
2416                         goto error;
2417
2418                 iommu_set_root_entry(iommu);
2419
2420                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2421                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2422
2423                 ret = iommu_enable_translation(iommu);
2424                 if (ret)
2425                         goto error;
2426
2427                 iommu_disable_protect_mem_regions(iommu);
2428         }
2429
2430         return 0;
2431 error:
2432         for_each_drhd_unit(drhd) {
2433                 if (drhd->ignored)
2434                         continue;
2435                 iommu = drhd->iommu;
2436                 free_iommu(iommu);
2437         }
2438         kfree(g_iommus);
2439         return ret;
2440 }
2441
2442 /* This takes a number of _MM_ pages, not VTD pages */
2443 static struct iova *intel_alloc_iova(struct device *dev,
2444                                      struct dmar_domain *domain,
2445                                      unsigned long nrpages, uint64_t dma_mask)
2446 {
2447         struct pci_dev *pdev = to_pci_dev(dev);
2448         struct iova *iova = NULL;
2449
2450         /* Restrict dma_mask to the width that the iommu can handle */
2451         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2452
2453         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2454                 /*
2455                  * First try to allocate an io virtual address in
2456                  * DMA_BIT_MASK(32) and if that fails then try allocating
2457                  * from higher range
2458                  */
2459                 iova = alloc_iova(&domain->iovad, nrpages,
2460                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2461                 if (iova)
2462                         return iova;
2463         }
2464         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2465         if (unlikely(!iova)) {
2466                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2467                        nrpages, pci_name(pdev));
2468                 return NULL;
2469         }
2470
2471         return iova;
2472 }
2473
2474 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2475 {
2476         struct dmar_domain *domain;
2477         int ret;
2478
2479         domain = get_domain_for_dev(pdev,
2480                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2481         if (!domain) {
2482                 printk(KERN_ERR
2483                         "Allocating domain for %s failed", pci_name(pdev));
2484                 return NULL;
2485         }
2486
2487         /* make sure context mapping is ok */
2488         if (unlikely(!domain_context_mapped(pdev))) {
2489                 ret = domain_context_mapping(domain, pdev,
2490                                              CONTEXT_TT_MULTI_LEVEL);
2491                 if (ret) {
2492                         printk(KERN_ERR
2493                                 "Domain context map for %s failed",
2494                                 pci_name(pdev));
2495                         return NULL;
2496                 }
2497         }
2498
2499         return domain;
2500 }
2501
2502 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2503 {
2504         struct device_domain_info *info;
2505
2506         /* No lock here, assumes no domain exit in normal case */
2507         info = dev->dev.archdata.iommu;
2508         if (likely(info))
2509                 return info->domain;
2510
2511         return __get_valid_domain_for_dev(dev);
2512 }
2513
2514 static int iommu_dummy(struct pci_dev *pdev)
2515 {
2516         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2517 }
2518
2519 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2520 static int iommu_no_mapping(struct device *dev)
2521 {
2522         struct pci_dev *pdev;
2523         int found;
2524
2525         if (unlikely(dev->bus != &pci_bus_type))
2526                 return 1;
2527
2528         pdev = to_pci_dev(dev);
2529         if (iommu_dummy(pdev))
2530                 return 1;
2531
2532         if (!iommu_identity_mapping)
2533                 return 0;
2534
2535         found = identity_mapping(pdev);
2536         if (found) {
2537                 if (iommu_should_identity_map(pdev, 0))
2538                         return 1;
2539                 else {
2540                         /*
2541                          * 32 bit DMA is removed from si_domain and fall back
2542                          * to non-identity mapping.
2543                          */
2544                         domain_remove_one_dev_info(si_domain, pdev);
2545                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2546                                pci_name(pdev));
2547                         return 0;
2548                 }
2549         } else {
2550                 /*
2551                  * In case of a detached 64 bit DMA device from vm, the device
2552                  * is put into si_domain for identity mapping.
2553                  */
2554                 if (iommu_should_identity_map(pdev, 0)) {
2555                         int ret;
2556                         ret = domain_add_dev_info(si_domain, pdev,
2557                                                   hw_pass_through ?
2558                                                   CONTEXT_TT_PASS_THROUGH :
2559                                                   CONTEXT_TT_MULTI_LEVEL);
2560                         if (!ret) {
2561                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2562                                        pci_name(pdev));
2563                                 return 1;
2564                         }
2565                 }
2566         }
2567
2568         return 0;
2569 }
2570
2571 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2572                                      size_t size, int dir, u64 dma_mask)
2573 {
2574         struct pci_dev *pdev = to_pci_dev(hwdev);
2575         struct dmar_domain *domain;
2576         phys_addr_t start_paddr;
2577         struct iova *iova;
2578         int prot = 0;
2579         int ret;
2580         struct intel_iommu *iommu;
2581         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2582
2583         BUG_ON(dir == DMA_NONE);
2584
2585         if (iommu_no_mapping(hwdev))
2586                 return paddr;
2587
2588         domain = get_valid_domain_for_dev(pdev);
2589         if (!domain)
2590                 return 0;
2591
2592         iommu = domain_get_iommu(domain);
2593         size = aligned_nrpages(paddr, size);
2594
2595         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2596                                 pdev->dma_mask);
2597         if (!iova)
2598                 goto error;
2599
2600         /*
2601          * Check if DMAR supports zero-length reads on write only
2602          * mappings..
2603          */
2604         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2605                         !cap_zlr(iommu->cap))
2606                 prot |= DMA_PTE_READ;
2607         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2608                 prot |= DMA_PTE_WRITE;
2609         /*
2610          * paddr - (paddr + size) might be partial page, we should map the whole
2611          * page.  Note: if two part of one page are separately mapped, we
2612          * might have two guest_addr mapping to the same host paddr, but this
2613          * is not a big problem
2614          */
2615         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2616                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2617         if (ret)
2618                 goto error;
2619
2620         /* it's a non-present to present mapping. Only flush if caching mode */
2621         if (cap_caching_mode(iommu->cap))
2622                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2623         else
2624                 iommu_flush_write_buffer(iommu);
2625
2626         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2627         start_paddr += paddr & ~PAGE_MASK;
2628         return start_paddr;
2629
2630 error:
2631         if (iova)
2632                 __free_iova(&domain->iovad, iova);
2633         printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2634                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2635         return 0;
2636 }
2637
2638 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2639                                  unsigned long offset, size_t size,
2640                                  enum dma_data_direction dir,
2641                                  struct dma_attrs *attrs)
2642 {
2643         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2644                                   dir, to_pci_dev(dev)->dma_mask);
2645 }
2646
2647 static void flush_unmaps(void)
2648 {
2649         int i, j;
2650
2651         timer_on = 0;
2652
2653         /* just flush them all */
2654         for (i = 0; i < g_num_of_iommus; i++) {
2655                 struct intel_iommu *iommu = g_iommus[i];
2656                 if (!iommu)
2657                         continue;
2658
2659                 if (!deferred_flush[i].next)
2660                         continue;
2661
2662                 /* In caching mode, global flushes turn emulation expensive */
2663                 if (!cap_caching_mode(iommu->cap))
2664                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2665                                          DMA_TLB_GLOBAL_FLUSH);
2666                 for (j = 0; j < deferred_flush[i].next; j++) {
2667                         unsigned long mask;
2668                         struct iova *iova = deferred_flush[i].iova[j];
2669                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2670
2671                         /* On real hardware multiple invalidations are expensive */
2672                         if (cap_caching_mode(iommu->cap))
2673                                 iommu_flush_iotlb_psi(iommu, domain->id,
2674                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2675                         else {
2676                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2677                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2678                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2679                         }
2680                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2681                 }
2682                 deferred_flush[i].next = 0;
2683         }
2684
2685         list_size = 0;
2686 }
2687
2688 static void flush_unmaps_timeout(unsigned long data)
2689 {
2690         unsigned long flags;
2691
2692         spin_lock_irqsave(&async_umap_flush_lock, flags);
2693         flush_unmaps();
2694         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2695 }
2696
2697 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2698 {
2699         unsigned long flags;
2700         int next, iommu_id;
2701         struct intel_iommu *iommu;
2702
2703         spin_lock_irqsave(&async_umap_flush_lock, flags);
2704         if (list_size == HIGH_WATER_MARK)
2705                 flush_unmaps();
2706
2707         iommu = domain_get_iommu(dom);
2708         iommu_id = iommu->seq_id;
2709
2710         next = deferred_flush[iommu_id].next;
2711         deferred_flush[iommu_id].domain[next] = dom;
2712         deferred_flush[iommu_id].iova[next] = iova;
2713         deferred_flush[iommu_id].next++;
2714
2715         if (!timer_on) {
2716                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2717                 timer_on = 1;
2718         }
2719         list_size++;
2720         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2721 }
2722
2723 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2724                              size_t size, enum dma_data_direction dir,
2725                              struct dma_attrs *attrs)
2726 {
2727         struct pci_dev *pdev = to_pci_dev(dev);
2728         struct dmar_domain *domain;
2729         unsigned long start_pfn, last_pfn;
2730         struct iova *iova;
2731         struct intel_iommu *iommu;
2732
2733         if (iommu_no_mapping(dev))
2734                 return;
2735
2736         domain = find_domain(pdev);
2737         BUG_ON(!domain);
2738
2739         iommu = domain_get_iommu(domain);
2740
2741         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2742         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2743                       (unsigned long long)dev_addr))
2744                 return;
2745
2746         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2747         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2748
2749         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2750                  pci_name(pdev), start_pfn, last_pfn);
2751
2752         /*  clear the whole page */
2753         dma_pte_clear_range(domain, start_pfn, last_pfn);
2754
2755         /* free page tables */
2756         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2757
2758         if (intel_iommu_strict) {
2759                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2760                                       last_pfn - start_pfn + 1, 0);
2761                 /* free iova */
2762                 __free_iova(&domain->iovad, iova);
2763         } else {
2764                 add_unmap(domain, iova);
2765                 /*
2766                  * queue up the release of the unmap to save the 1/6th of the
2767                  * cpu used up by the iotlb flush operation...
2768                  */
2769         }
2770 }
2771
2772 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2773                                   dma_addr_t *dma_handle, gfp_t flags)
2774 {
2775         void *vaddr;
2776         int order;
2777
2778         size = PAGE_ALIGN(size);
2779         order = get_order(size);
2780
2781         if (!iommu_no_mapping(hwdev))
2782                 flags &= ~(GFP_DMA | GFP_DMA32);
2783         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2784                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2785                         flags |= GFP_DMA;
2786                 else
2787                         flags |= GFP_DMA32;
2788         }
2789
2790         vaddr = (void *)__get_free_pages(flags, order);
2791         if (!vaddr)
2792                 return NULL;
2793         memset(vaddr, 0, size);
2794
2795         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2796                                          DMA_BIDIRECTIONAL,
2797                                          hwdev->coherent_dma_mask);
2798         if (*dma_handle)
2799                 return vaddr;
2800         free_pages((unsigned long)vaddr, order);
2801         return NULL;
2802 }
2803
2804 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2805                                 dma_addr_t dma_handle)
2806 {
2807         int order;
2808
2809         size = PAGE_ALIGN(size);
2810         order = get_order(size);
2811
2812         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2813         free_pages((unsigned long)vaddr, order);
2814 }
2815
2816 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2817                            int nelems, enum dma_data_direction dir,
2818                            struct dma_attrs *attrs)
2819 {
2820         struct pci_dev *pdev = to_pci_dev(hwdev);
2821         struct dmar_domain *domain;
2822         unsigned long start_pfn, last_pfn;
2823         struct iova *iova;
2824         struct intel_iommu *iommu;
2825
2826         if (iommu_no_mapping(hwdev))
2827                 return;
2828
2829         domain = find_domain(pdev);
2830         BUG_ON(!domain);
2831
2832         iommu = domain_get_iommu(domain);
2833
2834         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2835         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2836                       (unsigned long long)sglist[0].dma_address))
2837                 return;
2838
2839         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2840         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2841
2842         /*  clear the whole page */
2843         dma_pte_clear_range(domain, start_pfn, last_pfn);
2844
2845         /* free page tables */
2846         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2847
2848         if (intel_iommu_strict) {
2849                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2850                                       last_pfn - start_pfn + 1, 0);
2851                 /* free iova */
2852                 __free_iova(&domain->iovad, iova);
2853         } else {
2854                 add_unmap(domain, iova);
2855                 /*
2856                  * queue up the release of the unmap to save the 1/6th of the
2857                  * cpu used up by the iotlb flush operation...
2858                  */
2859         }
2860 }
2861
2862 static int intel_nontranslate_map_sg(struct device *hddev,
2863         struct scatterlist *sglist, int nelems, int dir)
2864 {
2865         int i;
2866         struct scatterlist *sg;
2867
2868         for_each_sg(sglist, sg, nelems, i) {
2869                 BUG_ON(!sg_page(sg));
2870                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2871                 sg->dma_length = sg->length;
2872         }
2873         return nelems;
2874 }
2875
2876 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2877                         enum dma_data_direction dir, struct dma_attrs *attrs)
2878 {
2879         int i;
2880         struct pci_dev *pdev = to_pci_dev(hwdev);
2881         struct dmar_domain *domain;
2882         size_t size = 0;
2883         int prot = 0;
2884         struct iova *iova = NULL;
2885         int ret;
2886         struct scatterlist *sg;
2887         unsigned long start_vpfn;
2888         struct intel_iommu *iommu;
2889
2890         BUG_ON(dir == DMA_NONE);
2891         if (iommu_no_mapping(hwdev))
2892                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2893
2894         domain = get_valid_domain_for_dev(pdev);
2895         if (!domain)
2896                 return 0;
2897
2898         iommu = domain_get_iommu(domain);
2899
2900         for_each_sg(sglist, sg, nelems, i)
2901                 size += aligned_nrpages(sg->offset, sg->length);
2902
2903         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2904                                 pdev->dma_mask);
2905         if (!iova) {
2906                 sglist->dma_length = 0;
2907                 return 0;
2908         }
2909
2910         /*
2911          * Check if DMAR supports zero-length reads on write only
2912          * mappings..
2913          */
2914         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2915                         !cap_zlr(iommu->cap))
2916                 prot |= DMA_PTE_READ;
2917         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2918                 prot |= DMA_PTE_WRITE;
2919
2920         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2921
2922         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2923         if (unlikely(ret)) {
2924                 /*  clear the page */
2925                 dma_pte_clear_range(domain, start_vpfn,
2926                                     start_vpfn + size - 1);
2927                 /* free page tables */
2928                 dma_pte_free_pagetable(domain, start_vpfn,
2929                                        start_vpfn + size - 1);
2930                 /* free iova */
2931                 __free_iova(&domain->iovad, iova);
2932                 return 0;
2933         }
2934
2935         /* it's a non-present to present mapping. Only flush if caching mode */
2936         if (cap_caching_mode(iommu->cap))
2937                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
2938         else
2939                 iommu_flush_write_buffer(iommu);
2940
2941         return nelems;
2942 }
2943
2944 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2945 {
2946         return !dma_addr;
2947 }
2948
2949 struct dma_map_ops intel_dma_ops = {
2950         .alloc_coherent = intel_alloc_coherent,
2951         .free_coherent = intel_free_coherent,
2952         .map_sg = intel_map_sg,
2953         .unmap_sg = intel_unmap_sg,
2954         .map_page = intel_map_page,
2955         .unmap_page = intel_unmap_page,
2956         .mapping_error = intel_mapping_error,
2957 };
2958
2959 static inline int iommu_domain_cache_init(void)
2960 {
2961         int ret = 0;
2962
2963         iommu_domain_cache = kmem_cache_create("iommu_domain",
2964                                          sizeof(struct dmar_domain),
2965                                          0,
2966                                          SLAB_HWCACHE_ALIGN,
2967
2968                                          NULL);
2969         if (!iommu_domain_cache) {
2970                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2971                 ret = -ENOMEM;
2972         }
2973
2974         return ret;
2975 }
2976
2977 static inline int iommu_devinfo_cache_init(void)
2978 {
2979         int ret = 0;
2980
2981         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2982                                          sizeof(struct device_domain_info),
2983                                          0,
2984                                          SLAB_HWCACHE_ALIGN,
2985                                          NULL);
2986         if (!iommu_devinfo_cache) {
2987                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2988                 ret = -ENOMEM;
2989         }
2990
2991         return ret;
2992 }
2993
2994 static inline int iommu_iova_cache_init(void)
2995 {
2996         int ret = 0;
2997
2998         iommu_iova_cache = kmem_cache_create("iommu_iova",
2999                                          sizeof(struct iova),
3000                                          0,
3001                                          SLAB_HWCACHE_ALIGN,
3002                                          NULL);
3003         if (!iommu_iova_cache) {
3004                 printk(KERN_ERR "Couldn't create iova cache\n");
3005                 ret = -ENOMEM;
3006         }
3007
3008         return ret;
3009 }
3010
3011 static int __init iommu_init_mempool(void)
3012 {
3013         int ret;
3014         ret = iommu_iova_cache_init();
3015         if (ret)
3016                 return ret;
3017
3018         ret = iommu_domain_cache_init();
3019         if (ret)
3020                 goto domain_error;
3021
3022         ret = iommu_devinfo_cache_init();
3023         if (!ret)
3024                 return ret;
3025
3026         kmem_cache_destroy(iommu_domain_cache);
3027 domain_error:
3028         kmem_cache_destroy(iommu_iova_cache);
3029
3030         return -ENOMEM;
3031 }
3032
3033 static void __init iommu_exit_mempool(void)
3034 {
3035         kmem_cache_destroy(iommu_devinfo_cache);
3036         kmem_cache_destroy(iommu_domain_cache);
3037         kmem_cache_destroy(iommu_iova_cache);
3038
3039 }
3040
3041 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3042 {
3043         struct dmar_drhd_unit *drhd;
3044         u32 vtbar;
3045         int rc;
3046
3047         /* We know that this device on this chipset has its own IOMMU.
3048          * If we find it under a different IOMMU, then the BIOS is lying
3049          * to us. Hope that the IOMMU for this device is actually
3050          * disabled, and it needs no translation...
3051          */
3052         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3053         if (rc) {
3054                 /* "can't" happen */
3055                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3056                 return;
3057         }
3058         vtbar &= 0xffff0000;
3059
3060         /* we know that the this iommu should be at offset 0xa000 from vtbar */
3061         drhd = dmar_find_matched_drhd_unit(pdev);
3062         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3063                             TAINT_FIRMWARE_WORKAROUND,
3064                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3065                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3066 }
3067 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3068
3069 static void __init init_no_remapping_devices(void)
3070 {
3071         struct dmar_drhd_unit *drhd;
3072
3073         for_each_drhd_unit(drhd) {
3074                 if (!drhd->include_all) {
3075                         int i;
3076                         for (i = 0; i < drhd->devices_cnt; i++)
3077                                 if (drhd->devices[i] != NULL)
3078                                         break;
3079                         /* ignore DMAR unit if no pci devices exist */
3080                         if (i == drhd->devices_cnt)
3081                                 drhd->ignored = 1;
3082                 }
3083         }
3084
3085         if (dmar_map_gfx)
3086                 return;
3087
3088         for_each_drhd_unit(drhd) {
3089                 int i;
3090                 if (drhd->ignored || drhd->include_all)
3091                         continue;
3092
3093                 for (i = 0; i < drhd->devices_cnt; i++)
3094                         if (drhd->devices[i] &&
3095                                 !IS_GFX_DEVICE(drhd->devices[i]))
3096                                 break;
3097
3098                 if (i < drhd->devices_cnt)
3099                         continue;
3100
3101                 /* bypass IOMMU if it is just for gfx devices */
3102                 drhd->ignored = 1;
3103                 for (i = 0; i < drhd->devices_cnt; i++) {
3104                         if (!drhd->devices[i])
3105                                 continue;
3106                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3107                 }
3108         }
3109 }
3110
3111 #ifdef CONFIG_SUSPEND
3112 static int init_iommu_hw(void)
3113 {
3114         struct dmar_drhd_unit *drhd;
3115         struct intel_iommu *iommu = NULL;
3116
3117         for_each_active_iommu(iommu, drhd)
3118                 if (iommu->qi)
3119                         dmar_reenable_qi(iommu);
3120
3121         for_each_active_iommu(iommu, drhd) {
3122                 iommu_flush_write_buffer(iommu);
3123
3124                 iommu_set_root_entry(iommu);
3125
3126                 iommu->flush.flush_context(iommu, 0, 0, 0,
3127                                            DMA_CCMD_GLOBAL_INVL);
3128                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3129                                          DMA_TLB_GLOBAL_FLUSH);
3130                 iommu_enable_translation(iommu);
3131                 iommu_disable_protect_mem_regions(iommu);
3132         }
3133
3134         return 0;
3135 }
3136
3137 static void iommu_flush_all(void)
3138 {
3139         struct dmar_drhd_unit *drhd;
3140         struct intel_iommu *iommu;
3141
3142         for_each_active_iommu(iommu, drhd) {
3143                 iommu->flush.flush_context(iommu, 0, 0, 0,
3144                                            DMA_CCMD_GLOBAL_INVL);
3145                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3146                                          DMA_TLB_GLOBAL_FLUSH);
3147         }
3148 }
3149
3150 static int iommu_suspend(void)
3151 {
3152         struct dmar_drhd_unit *drhd;
3153         struct intel_iommu *iommu = NULL;
3154         unsigned long flag;
3155
3156         for_each_active_iommu(iommu, drhd) {
3157                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3158                                                  GFP_ATOMIC);
3159                 if (!iommu->iommu_state)
3160                         goto nomem;
3161         }
3162
3163         iommu_flush_all();
3164
3165         for_each_active_iommu(iommu, drhd) {
3166                 iommu_disable_translation(iommu);
3167
3168                 spin_lock_irqsave(&iommu->register_lock, flag);
3169
3170                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3171                         readl(iommu->reg + DMAR_FECTL_REG);
3172                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3173                         readl(iommu->reg + DMAR_FEDATA_REG);
3174                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3175                         readl(iommu->reg + DMAR_FEADDR_REG);
3176                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3177                         readl(iommu->reg + DMAR_FEUADDR_REG);
3178
3179                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3180         }
3181         return 0;
3182
3183 nomem:
3184         for_each_active_iommu(iommu, drhd)
3185                 kfree(iommu->iommu_state);
3186
3187         return -ENOMEM;
3188 }
3189
3190 static void iommu_resume(void)
3191 {
3192         struct dmar_drhd_unit *drhd;
3193         struct intel_iommu *iommu = NULL;
3194         unsigned long flag;
3195
3196         if (init_iommu_hw()) {
3197                 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3198                 return;
3199         }
3200
3201         for_each_active_iommu(iommu, drhd) {
3202
3203                 spin_lock_irqsave(&iommu->register_lock, flag);
3204
3205                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3206                         iommu->reg + DMAR_FECTL_REG);
3207                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3208                         iommu->reg + DMAR_FEDATA_REG);
3209                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3210                         iommu->reg + DMAR_FEADDR_REG);
3211                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3212                         iommu->reg + DMAR_FEUADDR_REG);
3213
3214                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3215         }
3216
3217         for_each_active_iommu(iommu, drhd)
3218                 kfree(iommu->iommu_state);
3219 }
3220
3221 static struct syscore_ops iommu_syscore_ops = {
3222         .resume         = iommu_resume,
3223         .suspend        = iommu_suspend,
3224 };
3225
3226 static void __init init_iommu_pm_ops(void)
3227 {
3228         register_syscore_ops(&iommu_syscore_ops);
3229 }
3230
3231 #else
/*
 * Suspend support is not configured: there is nothing to register. Declared
 * void, like the real implementation, so that the empty body does not fall
 * off the end of a function with a non-void return type.
 */
static inline void init_iommu_pm_ops(void) { }
#endif  /* CONFIG_SUSPEND */
3234
3235 /*
3236  * Here we only respond to action of unbound device from driver.
3237  *
3238  * Added device is not attached to its DMAR domain here yet. That will happen
3239  * when mapping the device to iova.
3240  */
3241 static int device_notifier(struct notifier_block *nb,
3242                                   unsigned long action, void *data)
3243 {
3244         struct device *dev = data;
3245         struct pci_dev *pdev = to_pci_dev(dev);
3246         struct dmar_domain *domain;
3247
3248         if (iommu_no_mapping(dev))
3249                 return 0;
3250
3251         domain = find_domain(pdev);
3252         if (!domain)
3253                 return 0;
3254
3255         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3256                 domain_remove_one_dev_info(domain, pdev);
3257
3258                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3259                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3260                     list_empty(&domain->devices))
3261                         domain_exit(domain);
3262         }
3263
3264         return 0;
3265 }
3266
3267 static struct notifier_block device_nb = {
3268         .notifier_call = device_notifier,
3269 };
3270
3271 int __init intel_iommu_init(void)
3272 {
3273         int ret = 0;
3274         int force_on = 0;
3275
3276         /* VT-d is required for a TXT/tboot launch, so enforce that */
3277         force_on = tboot_force_iommu();
3278
3279         if (dmar_table_init()) {
3280                 if (force_on)
3281                         panic("tboot: Failed to initialize DMAR table\n");
3282                 return  -ENODEV;
3283         }
3284
3285         if (dmar_dev_scope_init()) {
3286                 if (force_on)
3287                         panic("tboot: Failed to initialize DMAR device scope\n");
3288                 return  -ENODEV;
3289         }
3290
3291         /*
3292          * Check the need for DMA-remapping initialization now.
3293          * Above initialization will also be used by Interrupt-remapping.
3294          */
3295         if (no_iommu || dmar_disabled)
3296                 return -ENODEV;
3297
3298         if (iommu_init_mempool()) {
3299                 if (force_on)
3300                         panic("tboot: Failed to initialize iommu memory\n");
3301                 return  -ENODEV;
3302         }
3303
3304         if (dmar_init_reserved_ranges()) {
3305                 if (force_on)
3306                         panic("tboot: Failed to reserve iommu ranges\n");
3307                 return  -ENODEV;
3308         }
3309
3310         init_no_remapping_devices();
3311
3312         ret = init_dmars(force_on);
3313         if (ret) {
3314                 if (force_on)
3315                         panic("tboot: Failed to initialize DMARs\n");
3316                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3317                 put_iova_domain(&reserved_iova_list);
3318                 iommu_exit_mempool();
3319                 return ret;
3320         }
3321         printk(KERN_INFO
3322         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3323
3324         init_timer(&unmap_timer);
3325 #ifdef CONFIG_SWIOTLB
3326         swiotlb = 0;
3327 #endif
3328         dma_ops = &intel_dma_ops;
3329
3330         init_iommu_pm_ops();
3331
3332         register_iommu(&intel_iommu_ops);
3333
3334         bus_register_notifier(&pci_bus_type, &device_nb);
3335
3336         return 0;
3337 }
3338
3339 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3340                                            struct pci_dev *pdev)
3341 {
3342         struct pci_dev *tmp, *parent;
3343
3344         if (!iommu || !pdev)
3345                 return;
3346
3347         /* dependent device detach */
3348         tmp = pci_find_upstream_pcie_bridge(pdev);
3349         /* Secondary interface's bus number and devfn 0 */
3350         if (tmp) {
3351                 parent = pdev->bus->self;
3352                 while (parent != tmp) {
3353                         iommu_detach_dev(iommu, parent->bus->number,
3354                                          parent->devfn);
3355                         parent = parent->bus->self;
3356                 }
3357                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3358                         iommu_detach_dev(iommu,
3359                                 tmp->subordinate->number, 0);
3360                 else /* this is a legacy PCI bridge */
3361                         iommu_detach_dev(iommu, tmp->bus->number,
3362                                          tmp->devfn);
3363         }
3364 }
3365
3366 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3367                                           struct pci_dev *pdev)
3368 {
3369         struct device_domain_info *info;
3370         struct intel_iommu *iommu;
3371         unsigned long flags;
3372         int found = 0;
3373         struct list_head *entry, *tmp;
3374
3375         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3376                                 pdev->devfn);
3377         if (!iommu)
3378                 return;
3379
3380         spin_lock_irqsave(&device_domain_lock, flags);
3381         list_for_each_safe(entry, tmp, &domain->devices) {
3382                 info = list_entry(entry, struct device_domain_info, link);
3383                 /* No need to compare PCI domain; it has to be the same */
3384                 if (info->bus == pdev->bus->number &&
3385                     info->devfn == pdev->devfn) {
3386                         list_del(&info->link);
3387                         list_del(&info->global);
3388                         if (info->dev)
3389                                 info->dev->dev.archdata.iommu = NULL;
3390                         spin_unlock_irqrestore(&device_domain_lock, flags);
3391
3392                         iommu_disable_dev_iotlb(info);
3393                         iommu_detach_dev(iommu, info->bus, info->devfn);
3394                         iommu_detach_dependent_devices(iommu, pdev);
3395                         free_devinfo_mem(info);
3396
3397                         spin_lock_irqsave(&device_domain_lock, flags);
3398
3399                         if (found)
3400                                 break;
3401                         else
3402                                 continue;
3403                 }
3404
3405                 /* if there is no other devices under the same iommu
3406                  * owned by this domain, clear this iommu in iommu_bmp
3407                  * update iommu count and coherency
3408                  */
3409                 if (iommu == device_to_iommu(info->segment, info->bus,
3410                                             info->devfn))
3411                         found = 1;
3412         }
3413
3414         if (found == 0) {
3415                 unsigned long tmp_flags;
3416                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3417                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3418                 domain->iommu_count--;
3419                 domain_update_iommu_cap(domain);
3420                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3421
3422                 spin_lock_irqsave(&iommu->lock, tmp_flags);
3423                 clear_bit(domain->id, iommu->domain_ids);
3424                 iommu->domains[domain->id] = NULL;
3425                 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3426         }
3427
3428         spin_unlock_irqrestore(&device_domain_lock, flags);
3429 }
3430
3431 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3432 {
3433         struct device_domain_info *info;
3434         struct intel_iommu *iommu;
3435         unsigned long flags1, flags2;
3436
3437         spin_lock_irqsave(&device_domain_lock, flags1);
3438         while (!list_empty(&domain->devices)) {
3439                 info = list_entry(domain->devices.next,
3440                         struct device_domain_info, link);
3441                 list_del(&info->link);
3442                 list_del(&info->global);
3443                 if (info->dev)
3444                         info->dev->dev.archdata.iommu = NULL;
3445
3446                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3447
3448                 iommu_disable_dev_iotlb(info);
3449                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3450                 iommu_detach_dev(iommu, info->bus, info->devfn);
3451                 iommu_detach_dependent_devices(iommu, info->dev);
3452
3453                 /* clear this iommu in iommu_bmp, update iommu count
3454                  * and capabilities
3455                  */
3456                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3457                 if (test_and_clear_bit(iommu->seq_id,
3458                                        &domain->iommu_bmp)) {
3459                         domain->iommu_count--;
3460                         domain_update_iommu_cap(domain);
3461                 }
3462                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3463
3464                 free_devinfo_mem(info);
3465                 spin_lock_irqsave(&device_domain_lock, flags1);
3466         }
3467         spin_unlock_irqrestore(&device_domain_lock, flags1);
3468 }
3469
3470 /* domain id for virtual machine, it won't be set in context */
3471 static unsigned long vm_domid;
3472
3473 static struct dmar_domain *iommu_alloc_vm_domain(void)
3474 {
3475         struct dmar_domain *domain;
3476
3477         domain = alloc_domain_mem();
3478         if (!domain)
3479                 return NULL;
3480
3481         domain->id = vm_domid++;
3482         domain->nid = -1;
3483         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3484         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3485
3486         return domain;
3487 }
3488
3489 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3490 {
3491         int adjust_width;
3492
3493         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3494         spin_lock_init(&domain->iommu_lock);
3495
3496         domain_reserve_special_ranges(domain);
3497
3498         /* calculate AGAW */
3499         domain->gaw = guest_width;
3500         adjust_width = guestwidth_to_adjustwidth(guest_width);
3501         domain->agaw = width_to_agaw(adjust_width);
3502
3503         INIT_LIST_HEAD(&domain->devices);
3504
3505         domain->iommu_count = 0;
3506         domain->iommu_coherency = 0;
3507         domain->iommu_snooping = 0;
3508         domain->max_addr = 0;
3509         domain->nid = -1;
3510
3511         /* always allocate the top pgd */
3512         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3513         if (!domain->pgd)
3514                 return -ENOMEM;
3515         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3516         return 0;
3517 }
3518
3519 static void iommu_free_vm_domain(struct dmar_domain *domain)
3520 {
3521         unsigned long flags;
3522         struct dmar_drhd_unit *drhd;
3523         struct intel_iommu *iommu;
3524         unsigned long i;
3525         unsigned long ndomains;
3526
3527         for_each_drhd_unit(drhd) {
3528                 if (drhd->ignored)
3529                         continue;
3530                 iommu = drhd->iommu;
3531
3532                 ndomains = cap_ndoms(iommu->cap);
3533                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3534                         if (iommu->domains[i] == domain) {
3535                                 spin_lock_irqsave(&iommu->lock, flags);
3536                                 clear_bit(i, iommu->domain_ids);
3537                                 iommu->domains[i] = NULL;
3538                                 spin_unlock_irqrestore(&iommu->lock, flags);
3539                                 break;
3540                         }
3541                 }
3542         }
3543 }
3544
3545 static void vm_domain_exit(struct dmar_domain *domain)
3546 {
3547         /* Domain 0 is reserved, so dont process it */
3548         if (!domain)
3549                 return;
3550
3551         vm_domain_remove_all_dev_info(domain);
3552         /* destroy iovas */
3553         put_iova_domain(&domain->iovad);
3554
3555         /* clear ptes */
3556         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3557
3558         /* free page tables */
3559         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3560
3561         iommu_free_vm_domain(domain);
3562         free_domain_mem(domain);
3563 }
3564
3565 static int intel_iommu_domain_init(struct iommu_domain *domain)
3566 {
3567         struct dmar_domain *dmar_domain;
3568
3569         dmar_domain = iommu_alloc_vm_domain();
3570         if (!dmar_domain) {
3571                 printk(KERN_ERR
3572                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3573                 return -ENOMEM;
3574         }
3575         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3576                 printk(KERN_ERR
3577                         "intel_iommu_domain_init() failed\n");
3578                 vm_domain_exit(dmar_domain);
3579                 return -ENOMEM;
3580         }
3581         domain->priv = dmar_domain;
3582
3583         return 0;
3584 }
3585
3586 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3587 {
3588         struct dmar_domain *dmar_domain = domain->priv;
3589
3590         domain->priv = NULL;
3591         vm_domain_exit(dmar_domain);
3592 }
3593
3594 static int intel_iommu_attach_device(struct iommu_domain *domain,
3595                                      struct device *dev)
3596 {
3597         struct dmar_domain *dmar_domain = domain->priv;
3598         struct pci_dev *pdev = to_pci_dev(dev);
3599         struct intel_iommu *iommu;
3600         int addr_width;
3601
3602         /* normally pdev is not mapped */
3603         if (unlikely(domain_context_mapped(pdev))) {
3604                 struct dmar_domain *old_domain;
3605
3606                 old_domain = find_domain(pdev);
3607                 if (old_domain) {
3608                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3609                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3610                                 domain_remove_one_dev_info(old_domain, pdev);
3611                         else
3612                                 domain_remove_dev_info(old_domain);
3613                 }
3614         }
3615
3616         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3617                                 pdev->devfn);
3618         if (!iommu)
3619                 return -ENODEV;
3620
3621         /* check if this iommu agaw is sufficient for max mapped address */
3622         addr_width = agaw_to_width(iommu->agaw);
3623         if (addr_width > cap_mgaw(iommu->cap))
3624                 addr_width = cap_mgaw(iommu->cap);
3625
3626         if (dmar_domain->max_addr > (1LL << addr_width)) {
3627                 printk(KERN_ERR "%s: iommu width (%d) is not "
3628                        "sufficient for the mapped address (%llx)\n",
3629                        __func__, addr_width, dmar_domain->max_addr);
3630                 return -EFAULT;
3631         }
3632         dmar_domain->gaw = addr_width;
3633
3634         /*
3635          * Knock out extra levels of page tables if necessary
3636          */
3637         while (iommu->agaw < dmar_domain->agaw) {
3638                 struct dma_pte *pte;
3639
3640                 pte = dmar_domain->pgd;
3641                 if (dma_pte_present(pte)) {
3642                         dmar_domain->pgd = (struct dma_pte *)
3643                                 phys_to_virt(dma_pte_addr(pte));
3644                         free_pgtable_page(pte);
3645                 }
3646                 dmar_domain->agaw--;
3647         }
3648
3649         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3650 }
3651
3652 static void intel_iommu_detach_device(struct iommu_domain *domain,
3653                                       struct device *dev)
3654 {
3655         struct dmar_domain *dmar_domain = domain->priv;
3656         struct pci_dev *pdev = to_pci_dev(dev);
3657
3658         domain_remove_one_dev_info(dmar_domain, pdev);
3659 }
3660
3661 static int intel_iommu_map(struct iommu_domain *domain,
3662                            unsigned long iova, phys_addr_t hpa,
3663                            int gfp_order, int iommu_prot)
3664 {
3665         struct dmar_domain *dmar_domain = domain->priv;
3666         u64 max_addr;
3667         int prot = 0;
3668         size_t size;
3669         int ret;
3670
3671         if (iommu_prot & IOMMU_READ)
3672                 prot |= DMA_PTE_READ;
3673         if (iommu_prot & IOMMU_WRITE)
3674                 prot |= DMA_PTE_WRITE;
3675         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3676                 prot |= DMA_PTE_SNP;
3677
3678         size     = PAGE_SIZE << gfp_order;
3679         max_addr = iova + size;
3680         if (dmar_domain->max_addr < max_addr) {
3681                 u64 end;
3682
3683                 /* check if minimum agaw is sufficient for mapped address */
3684                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3685                 if (end < max_addr) {
3686                         printk(KERN_ERR "%s: iommu width (%d) is not "
3687                                "sufficient for the mapped address (%llx)\n",
3688                                __func__, dmar_domain->gaw, max_addr);
3689                         return -EFAULT;
3690                 }
3691                 dmar_domain->max_addr = max_addr;
3692         }
3693         /* Round up size to next multiple of PAGE_SIZE, if it and
3694            the low bits of hpa would take us onto the next page */
3695         size = aligned_nrpages(hpa, size);
3696         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3697                                  hpa >> VTD_PAGE_SHIFT, size, prot);
3698         return ret;
3699 }
3700
3701 static int intel_iommu_unmap(struct iommu_domain *domain,
3702                              unsigned long iova, int gfp_order)
3703 {
3704         struct dmar_domain *dmar_domain = domain->priv;
3705         size_t size = PAGE_SIZE << gfp_order;
3706
3707         dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3708                             (iova + size - 1) >> VTD_PAGE_SHIFT);
3709
3710         if (dmar_domain->max_addr == iova + size)
3711                 dmar_domain->max_addr = iova;
3712
3713         return gfp_order;
3714 }
3715
3716 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3717                                             unsigned long iova)
3718 {
3719         struct dmar_domain *dmar_domain = domain->priv;
3720         struct dma_pte *pte;
3721         u64 phys = 0;
3722
3723         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3724         if (pte)
3725                 phys = dma_pte_addr(pte);
3726
3727         return phys;
3728 }
3729
3730 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3731                                       unsigned long cap)
3732 {
3733         struct dmar_domain *dmar_domain = domain->priv;
3734
3735         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3736                 return dmar_domain->iommu_snooping;
3737         if (cap == IOMMU_CAP_INTR_REMAP)
3738                 return intr_remapping_enabled;
3739
3740         return 0;
3741 }
3742
/*
 * Callback table that binds the generic iommu_ops hooks to the
 * intel_iommu_* implementations defined above in this file.
 */
static struct iommu_ops intel_iommu_ops = {
	.domain_init    = intel_iommu_domain_init,
	.domain_destroy = intel_iommu_domain_destroy,
	.attach_dev     = intel_iommu_attach_device,
	.detach_dev     = intel_iommu_detach_device,
	.map            = intel_iommu_map,
	.unmap          = intel_iommu_unmap,
	.iova_to_phys   = intel_iommu_iova_to_phys,
	.domain_has_cap = intel_iommu_domain_has_cap,
};
3753
3754 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3755 {
3756         /*
3757          * Mobile 4 Series Chipset neglects to set RWBF capability,
3758          * but needs it:
3759          */
3760         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3761         rwbf_quirk = 1;
3762
3763         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
3764         if (dev->revision == 0x07) {
3765                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
3766                 dmar_map_gfx = 0;
3767         }
3768 }
3769
3770 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3771
/*
 * GGC is the PCI config-space offset of a 16-bit register read by
 * quirk_calpella_no_shadow_gtt().  The remaining macros are values of the
 * 4-bit field at bits 8-11 of that register; GGC_MEMORY_VT_ENABLED (bit 11)
 * is the bit the quirk tests to decide whether a shadow GTT was allocated.
 * NOTE(review): the SIZE_* names suggest GTT memory sizes - confirm the
 * exact encoding against the chipset documentation.
 */
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
3781
3782 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
3783 {
3784         unsigned short ggc;
3785
3786         if (pci_read_config_word(dev, GGC, &ggc))
3787                 return;
3788
3789         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
3790                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
3791                 dmar_map_gfx = 0;
3792         }
3793 }
3794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
3795 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
3796 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
3797 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
3798
3799 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3800    ISOCH DMAR unit for the Azalia sound device, but not give it any
3801    TLB entries, which causes it to deadlock. Check for that.  We do
3802    this in a function called from init_dmars(), instead of in a PCI
3803    quirk, because we don't want to print the obnoxious "BIOS broken"
3804    message if VT-d is actually disabled.
3805 */
3806 static void __init check_tylersburg_isoch(void)
3807 {
3808         struct pci_dev *pdev;
3809         uint32_t vtisochctrl;
3810
3811         /* If there's no Azalia in the system anyway, forget it. */
3812         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3813         if (!pdev)
3814                 return;
3815         pci_dev_put(pdev);
3816
3817         /* System Management Registers. Might be hidden, in which case
3818            we can't do the sanity check. But that's OK, because the
3819            known-broken BIOSes _don't_ actually hide it, so far. */
3820         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3821         if (!pdev)
3822                 return;
3823
3824         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3825                 pci_dev_put(pdev);
3826                 return;
3827         }
3828
3829         pci_dev_put(pdev);
3830
3831         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3832         if (vtisochctrl & 1)
3833                 return;
3834
3835         /* Drop all bits other than the number of TLB entries */
3836         vtisochctrl &= 0x1c;
3837
3838         /* If we have the recommended number of TLB entries (16), fine. */
3839         if (vtisochctrl == 0x10)
3840                 return;
3841
3842         /* Zero TLB entries? You get to ride the short bus to school. */
3843         if (!vtisochctrl) {
3844                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3845                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3846                      dmi_get_system_info(DMI_BIOS_VENDOR),
3847                      dmi_get_system_info(DMI_BIOS_VERSION),
3848                      dmi_get_system_info(DMI_PRODUCT_VERSION));
3849                 iommu_identity_mapping |= IDENTMAP_AZALIA;
3850                 return;
3851         }
3852         
3853         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3854                vtisochctrl);
3855 }