intel-iommu: Flush unmaps at domain_exit
[linux-2.6.git] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/syscore_ops.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
45
46 #define ROOT_SIZE               VTD_PAGE_SIZE
47 #define CONTEXT_SIZE            VTD_PAGE_SIZE
48
49 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
50 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
51 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
52
53 #define IOAPIC_RANGE_START      (0xfee00000)
54 #define IOAPIC_RANGE_END        (0xfeefffff)
55 #define IOVA_START_ADDR         (0x1000)
56
57 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
58
59 #define MAX_AGAW_WIDTH 64
60
61 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
62 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
63
64 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
65    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
66 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
67                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
68 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
69
70 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
71 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
72 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
73
74 /* page table handling */
75 #define LEVEL_STRIDE            (9)
76 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
77
78 static inline int agaw_to_level(int agaw)
79 {
80         return agaw + 2;
81 }
82
83 static inline int agaw_to_width(int agaw)
84 {
85         return 30 + agaw * LEVEL_STRIDE;
86 }
87
88 static inline int width_to_agaw(int width)
89 {
90         return (width - 30) / LEVEL_STRIDE;
91 }
92
93 static inline unsigned int level_to_offset_bits(int level)
94 {
95         return (level - 1) * LEVEL_STRIDE;
96 }
97
98 static inline int pfn_level_offset(unsigned long pfn, int level)
99 {
100         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
101 }
102
103 static inline unsigned long level_mask(int level)
104 {
105         return -1UL << level_to_offset_bits(level);
106 }
107
108 static inline unsigned long level_size(int level)
109 {
110         return 1UL << level_to_offset_bits(level);
111 }
112
113 static inline unsigned long align_to_level(unsigned long pfn, int level)
114 {
115         return (pfn + level_size(level) - 1) & level_mask(level);
116 }
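
/*
 * Worked example (illustrative only, not used by the driver): with
 * LEVEL_STRIDE == 9 and VTD_PAGE_SHIFT == 12, a 48-bit address space
 * (agaw 2) uses a 4-level table, and a dma pfn decomposes into one
 * 9-bit index per level:
 *
 *      pfn_level_offset(pfn, 4) == (pfn >> 27) & 0x1ff
 *      pfn_level_offset(pfn, 3) == (pfn >> 18) & 0x1ff
 *      pfn_level_offset(pfn, 2) == (pfn >>  9) & 0x1ff
 *      pfn_level_offset(pfn, 1) ==  pfn        & 0x1ff
 *
 * These indices are consumed top-down by pfn_to_dma_pte() below.
 */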
117
118 /* VT-d pages must never be larger than MM pages. Otherwise things
119    are never going to work. */
120 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
121 {
122         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
123 }
124
125 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
126 {
127         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
128 }
129 static inline unsigned long page_to_dma_pfn(struct page *pg)
130 {
131         return mm_to_dma_pfn(page_to_pfn(pg));
132 }
133 static inline unsigned long virt_to_dma_pfn(void *p)
134 {
135         return page_to_dma_pfn(virt_to_page(p));
136 }
137
138 /* global iommu list, set NULL for ignored DMAR units */
139 static struct intel_iommu **g_iommus;
140
141 static void __init check_tylersburg_isoch(void);
142 static int rwbf_quirk;
143
144 /*
145  * set to 1 to panic the kernel if VT-d can't be successfully enabled
146  * (used when the kernel is launched with TXT)
147  */
148 static int force_on = 0;
149
150 /*
151  * 0: Present
152  * 1-11: Reserved
153  * 12-63: Context Ptr (12 - (haw-1))
154  * 64-127: Reserved
155  */
156 struct root_entry {
157         u64     val;
158         u64     rsvd1;
159 };
160 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
161 static inline bool root_present(struct root_entry *root)
162 {
163         return (root->val & 1);
164 }
165 static inline void set_root_present(struct root_entry *root)
166 {
167         root->val |= 1;
168 }
169 static inline void set_root_value(struct root_entry *root, unsigned long value)
170 {
171         root->val |= value & VTD_PAGE_MASK;
172 }
173
174 static inline struct context_entry *
175 get_context_addr_from_root(struct root_entry *root)
176 {
177         return (struct context_entry *)
178                 (root_present(root)?phys_to_virt(
179                 root->val & VTD_PAGE_MASK) :
180                 NULL);
181 }
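
#if 0
/*
 * A minimal sketch (illustrative only; example_context_lookup() is not
 * part of this driver) of the root/context walk: the root table is
 * indexed by bus, each context table by devfn.  The real lookup, which
 * also allocates missing context tables under iommu->lock, is
 * device_to_context_entry() below.
 */
static struct context_entry *example_context_lookup(struct intel_iommu *iommu,
                                                    u8 bus, u8 devfn)
{
        struct root_entry *root = &iommu->root_entry[bus];
        struct context_entry *ctx = get_context_addr_from_root(root);

        return ctx ? &ctx[devfn] : NULL;
}
#endif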
182
183 /*
184  * low 64 bits:
185  * 0: present
186  * 1: fault processing disable
187  * 2-3: translation type
188  * 12-63: address space root
189  * high 64 bits:
190  * 0-2: address width
191  * 3-6: aval
192  * 8-23: domain id
193  */
194 struct context_entry {
195         u64 lo;
196         u64 hi;
197 };
198
199 static inline bool context_present(struct context_entry *context)
200 {
201         return (context->lo & 1);
202 }
203 static inline void context_set_present(struct context_entry *context)
204 {
205         context->lo |= 1;
206 }
207
208 static inline void context_set_fault_enable(struct context_entry *context)
209 {
210         context->lo &= (((u64)-1) << 2) | 1;
211 }
212
213 static inline void context_set_translation_type(struct context_entry *context,
214                                                 unsigned long value)
215 {
216         context->lo &= (((u64)-1) << 4) | 3;
217         context->lo |= (value & 3) << 2;
218 }
219
220 static inline void context_set_address_root(struct context_entry *context,
221                                             unsigned long value)
222 {
223         context->lo |= value & VTD_PAGE_MASK;
224 }
225
226 static inline void context_set_address_width(struct context_entry *context,
227                                              unsigned long value)
228 {
229         context->hi |= value & 7;
230 }
231
232 static inline void context_set_domain_id(struct context_entry *context,
233                                          unsigned long value)
234 {
235         context->hi |= (value & ((1 << 16) - 1)) << 8;
236 }
237
238 static inline void context_clear_entry(struct context_entry *context)
239 {
240         context->lo = 0;
241         context->hi = 0;
242 }
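
#if 0
/*
 * A minimal sketch (illustrative only; example_program_context() is not
 * part of this driver) of how the setters above combine to build a
 * multi-level context entry.  domain_context_mapping_one() below does
 * this for real, with locking, domain-id allocation and the required
 * cache/IOTLB flushing.
 */
static void example_program_context(struct context_entry *context,
                                    struct dma_pte *pgd, int agaw, int did)
{
        context_clear_entry(context);
        context_set_domain_id(context, did);
        context_set_address_root(context, virt_to_phys(pgd));
        context_set_address_width(context, agaw);
        context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
        context_set_fault_enable(context);
        context_set_present(context);
}
#endif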
243
244 /*
245  * 0: readable
246  * 1: writable
247  * 2-6: reserved
248  * 7: super page
249  * 8-10: available
250  * 11: snoop behavior
251  * 12-63: Host physical address
252  */
253 struct dma_pte {
254         u64 val;
255 };
256
257 static inline void dma_clear_pte(struct dma_pte *pte)
258 {
259         pte->val = 0;
260 }
261
262 static inline void dma_set_pte_readable(struct dma_pte *pte)
263 {
264         pte->val |= DMA_PTE_READ;
265 }
266
267 static inline void dma_set_pte_writable(struct dma_pte *pte)
268 {
269         pte->val |= DMA_PTE_WRITE;
270 }
271
272 static inline void dma_set_pte_snp(struct dma_pte *pte)
273 {
274         pte->val |= DMA_PTE_SNP;
275 }
276
277 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
278 {
279         pte->val = (pte->val & ~3) | (prot & 3);
280 }
281
282 static inline u64 dma_pte_addr(struct dma_pte *pte)
283 {
284 #ifdef CONFIG_64BIT
285         return pte->val & VTD_PAGE_MASK;
286 #else
287         /* Must have a full atomic 64-bit read */
288         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
289 #endif
290 }
291
292 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
293 {
294         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
295 }
296
297 static inline bool dma_pte_present(struct dma_pte *pte)
298 {
299         return (pte->val & 3) != 0;
300 }
301
302 static inline int first_pte_in_page(struct dma_pte *pte)
303 {
304         return !((unsigned long)pte & ~VTD_PAGE_MASK);
305 }
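
#if 0
/*
 * A minimal sketch (illustrative only; example_make_pte() is not part of
 * this driver): a present leaf PTE is just the target pfn shifted into
 * place plus the read/write (and optionally snoop) permission bits.
 */
static u64 example_make_pte(unsigned long host_pfn)
{
        struct dma_pte pte;

        dma_clear_pte(&pte);
        dma_set_pte_readable(&pte);
        dma_set_pte_writable(&pte);
        dma_set_pte_pfn(&pte, host_pfn);
        return pte.val;
}
#endif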
306
307 /*
308  * This domain is a static identity-mapping domain.
309  *      1. This domain creates a static 1:1 mapping to all usable memory.
310  *      2. It maps to each iommu if successful.
311  *      3. Each iommu maps to this domain if successful.
312  */
313 static struct dmar_domain *si_domain;
314 static int hw_pass_through = 1;
315
316 /* devices under the same p2p bridge are owned in one domain */
317 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
318
319 /* domain represents a virtual machine; more than one device
320  * across iommus may be owned by one domain, e.g. a kvm guest.
321  */
322 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
323
324 /* si_domain contains multiple devices */
325 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
326
327 struct dmar_domain {
328         int     id;                     /* domain id */
329         int     nid;                    /* node id */
330         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
331
332         struct list_head devices;       /* all devices' list */
333         struct iova_domain iovad;       /* iova's that belong to this domain */
334
335         struct dma_pte  *pgd;           /* virtual address */
336         int             gaw;            /* max guest address width */
337
338         /* adjusted guest address width, 0 is level 2 30-bit */
339         int             agaw;
340
341         int             flags;          /* flags to find out type of domain */
342
343         int             iommu_coherency;/* indicate coherency of iommu access */
344         int             iommu_snooping; /* indicate snooping control feature*/
345         int             iommu_count;    /* reference count of iommu */
346         spinlock_t      iommu_lock;     /* protect iommu set in domain */
347         u64             max_addr;       /* maximum mapped address */
348 };
349
350 /* PCI domain-device relationship */
351 struct device_domain_info {
352         struct list_head link;  /* link to domain siblings */
353         struct list_head global; /* link to global list */
354         int segment;            /* PCI domain */
355         u8 bus;                 /* PCI bus number */
356         u8 devfn;               /* PCI devfn number */
357         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
358         struct intel_iommu *iommu; /* IOMMU used by this device */
359         struct dmar_domain *domain; /* pointer to domain */
360 };
361
362 static void flush_unmaps_timeout(unsigned long data);
363
364 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
365
366 #define HIGH_WATER_MARK 250
367 struct deferred_flush_tables {
368         int next;
369         struct iova *iova[HIGH_WATER_MARK];
370         struct dmar_domain *domain[HIGH_WATER_MARK];
371 };
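
/*
 * Deferred ("lazy") unmap state: unless intel_iommu_strict is set, IOVAs
 * freed on the DMA unmap path are queued here (one table per IOMMU) and
 * their IOTLB entries are invalidated in batches by flush_unmaps_timeout(),
 * driven either by unmap_timer or by a queue filling to HIGH_WATER_MARK.
 * Queued entries reference their dmar_domain, which is why domain_exit()
 * below flushes pending unmaps before tearing a domain down.
 */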
372
373 static struct deferred_flush_tables *deferred_flush;
374
375 /* number of IOMMUs; used to size g_iommus and the per-domain iommu bitmaps */
376 static int g_num_of_iommus;
377
378 static DEFINE_SPINLOCK(async_umap_flush_lock);
379 static LIST_HEAD(unmaps_to_do);
380
381 static int timer_on;
382 static long list_size;
383
384 static void domain_remove_dev_info(struct dmar_domain *domain);
385
386 #ifdef CONFIG_DMAR_DEFAULT_ON
387 int dmar_disabled = 0;
388 #else
389 int dmar_disabled = 1;
390 #endif /*CONFIG_DMAR_DEFAULT_ON*/
391
392 static int dmar_map_gfx = 1;
393 static int dmar_forcedac;
394 static int intel_iommu_strict;
395
396 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
397 static DEFINE_SPINLOCK(device_domain_lock);
398 static LIST_HEAD(device_domain_list);
399
400 static struct iommu_ops intel_iommu_ops;
401
402 static int __init intel_iommu_setup(char *str)
403 {
404         if (!str)
405                 return -EINVAL;
406         while (*str) {
407                 if (!strncmp(str, "on", 2)) {
408                         dmar_disabled = 0;
409                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
410                 } else if (!strncmp(str, "off", 3)) {
411                         dmar_disabled = 1;
412                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
413                 } else if (!strncmp(str, "igfx_off", 8)) {
414                         dmar_map_gfx = 0;
415                         printk(KERN_INFO
416                                 "Intel-IOMMU: disable GFX device mapping\n");
417                 } else if (!strncmp(str, "forcedac", 8)) {
418                         printk(KERN_INFO
419                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
420                         dmar_forcedac = 1;
421                 } else if (!strncmp(str, "strict", 6)) {
422                         printk(KERN_INFO
423                                 "Intel-IOMMU: disable batched IOTLB flush\n");
424                         intel_iommu_strict = 1;
425                 }
426
427                 str += strcspn(str, ",");
428                 while (*str == ',')
429                         str++;
430         }
431         return 0;
432 }
433 __setup("intel_iommu=", intel_iommu_setup);
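
/*
 * Example command line usage (illustrative) for the options parsed above;
 * options are comma separated and may be combined:
 *
 *      intel_iommu=on,strict     enable DMAR and flush IOTLBs eagerly
 *      intel_iommu=on,igfx_off   enable DMAR but don't remap the GFX device
 *      intel_iommu=off           disable DMAR even with CONFIG_DMAR_DEFAULT_ON
 */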
434
435 static struct kmem_cache *iommu_domain_cache;
436 static struct kmem_cache *iommu_devinfo_cache;
437 static struct kmem_cache *iommu_iova_cache;
438
439 static inline void *alloc_pgtable_page(int node)
440 {
441         struct page *page;
442         void *vaddr = NULL;
443
444         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
445         if (page)
446                 vaddr = page_address(page);
447         return vaddr;
448 }
449
450 static inline void free_pgtable_page(void *vaddr)
451 {
452         free_page((unsigned long)vaddr);
453 }
454
455 static inline void *alloc_domain_mem(void)
456 {
457         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
458 }
459
460 static void free_domain_mem(void *vaddr)
461 {
462         kmem_cache_free(iommu_domain_cache, vaddr);
463 }
464
465 static inline void * alloc_devinfo_mem(void)
466 {
467         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
468 }
469
470 static inline void free_devinfo_mem(void *vaddr)
471 {
472         kmem_cache_free(iommu_devinfo_cache, vaddr);
473 }
474
475 struct iova *alloc_iova_mem(void)
476 {
477         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
478 }
479
480 void free_iova_mem(struct iova *iova)
481 {
482         kmem_cache_free(iommu_iova_cache, iova);
483 }
484
485
486 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
487 {
488         unsigned long sagaw;
489         int agaw = -1;
490
491         sagaw = cap_sagaw(iommu->cap);
492         for (agaw = width_to_agaw(max_gaw);
493              agaw >= 0; agaw--) {
494                 if (test_bit(agaw, &sagaw))
495                         break;
496         }
497
498         return agaw;
499 }
500
501 /*
502  * Calculate max SAGAW for each iommu.
503  */
504 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
505 {
506         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
507 }
508
509 /*
510  * Calculate agaw for each iommu.
511  * "SAGAW" may differ across iommus; use a default agaw, and fall back
512  * to a smaller supported agaw for iommus that don't support the default.
513  */
514 int iommu_calculate_agaw(struct intel_iommu *iommu)
515 {
516         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
517 }
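
/*
 * Worked example (illustrative only): DEFAULT_DOMAIN_ADDRESS_WIDTH is 48,
 * so width_to_agaw(48) == (48 - 30) / 9 == 2, i.e. a 4-level page table.
 * If the hardware's SAGAW field doesn't have bit 2 set,
 * __iommu_calculate_agaw() returns the largest supported agaw below it.
 */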
518
519 /* This function only returns a single iommu for a domain */
520 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
521 {
522         int iommu_id;
523
524         /* si_domain and vm domain should not get here. */
525         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
526         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
527
528         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
529         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
530                 return NULL;
531
532         return g_iommus[iommu_id];
533 }
534
535 static void domain_update_iommu_coherency(struct dmar_domain *domain)
536 {
537         int i;
538
539         domain->iommu_coherency = 1;
540
541         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
542                 if (!ecap_coherent(g_iommus[i]->ecap)) {
543                         domain->iommu_coherency = 0;
544                         break;
545                 }
546         }
547 }
548
549 static void domain_update_iommu_snooping(struct dmar_domain *domain)
550 {
551         int i;
552
553         domain->iommu_snooping = 1;
554
555         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
556                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
557                         domain->iommu_snooping = 0;
558                         break;
559                 }
560         }
561 }
562
563 /* Some capabilities may be different across iommus */
564 static void domain_update_iommu_cap(struct dmar_domain *domain)
565 {
566         domain_update_iommu_coherency(domain);
567         domain_update_iommu_snooping(domain);
568 }
569
570 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
571 {
572         struct dmar_drhd_unit *drhd = NULL;
573         int i;
574
575         for_each_drhd_unit(drhd) {
576                 if (drhd->ignored)
577                         continue;
578                 if (segment != drhd->segment)
579                         continue;
580
581                 for (i = 0; i < drhd->devices_cnt; i++) {
582                         if (drhd->devices[i] &&
583                             drhd->devices[i]->bus->number == bus &&
584                             drhd->devices[i]->devfn == devfn)
585                                 return drhd->iommu;
586                         if (drhd->devices[i] &&
587                             drhd->devices[i]->subordinate &&
588                             drhd->devices[i]->subordinate->number <= bus &&
589                             drhd->devices[i]->subordinate->subordinate >= bus)
590                                 return drhd->iommu;
591                 }
592
593                 if (drhd->include_all)
594                         return drhd->iommu;
595         }
596
597         return NULL;
598 }
599
600 static void domain_flush_cache(struct dmar_domain *domain,
601                                void *addr, int size)
602 {
603         if (!domain->iommu_coherency)
604                 clflush_cache_range(addr, size);
605 }
606
607 /* Gets context entry for a given bus and devfn */
608 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
609                 u8 bus, u8 devfn)
610 {
611         struct root_entry *root;
612         struct context_entry *context;
613         unsigned long phy_addr;
614         unsigned long flags;
615
616         spin_lock_irqsave(&iommu->lock, flags);
617         root = &iommu->root_entry[bus];
618         context = get_context_addr_from_root(root);
619         if (!context) {
620                 context = (struct context_entry *)
621                                 alloc_pgtable_page(iommu->node);
622                 if (!context) {
623                         spin_unlock_irqrestore(&iommu->lock, flags);
624                         return NULL;
625                 }
626                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
627                 phy_addr = virt_to_phys((void *)context);
628                 set_root_value(root, phy_addr);
629                 set_root_present(root);
630                 __iommu_flush_cache(iommu, root, sizeof(*root));
631         }
632         spin_unlock_irqrestore(&iommu->lock, flags);
633         return &context[devfn];
634 }
635
636 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
637 {
638         struct root_entry *root;
639         struct context_entry *context;
640         int ret;
641         unsigned long flags;
642
643         spin_lock_irqsave(&iommu->lock, flags);
644         root = &iommu->root_entry[bus];
645         context = get_context_addr_from_root(root);
646         if (!context) {
647                 ret = 0;
648                 goto out;
649         }
650         ret = context_present(&context[devfn]);
651 out:
652         spin_unlock_irqrestore(&iommu->lock, flags);
653         return ret;
654 }
655
656 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
657 {
658         struct root_entry *root;
659         struct context_entry *context;
660         unsigned long flags;
661
662         spin_lock_irqsave(&iommu->lock, flags);
663         root = &iommu->root_entry[bus];
664         context = get_context_addr_from_root(root);
665         if (context) {
666                 context_clear_entry(&context[devfn]);
667                 __iommu_flush_cache(iommu, &context[devfn], \
668                         sizeof(*context));
669         }
670         spin_unlock_irqrestore(&iommu->lock, flags);
671 }
672
673 static void free_context_table(struct intel_iommu *iommu)
674 {
675         struct root_entry *root;
676         int i;
677         unsigned long flags;
678         struct context_entry *context;
679
680         spin_lock_irqsave(&iommu->lock, flags);
681         if (!iommu->root_entry) {
682                 goto out;
683         }
684         for (i = 0; i < ROOT_ENTRY_NR; i++) {
685                 root = &iommu->root_entry[i];
686                 context = get_context_addr_from_root(root);
687                 if (context)
688                         free_pgtable_page(context);
689         }
690         free_pgtable_page(iommu->root_entry);
691         iommu->root_entry = NULL;
692 out:
693         spin_unlock_irqrestore(&iommu->lock, flags);
694 }
695
696 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
697                                       unsigned long pfn)
698 {
699         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
700         struct dma_pte *parent, *pte = NULL;
701         int level = agaw_to_level(domain->agaw);
702         int offset;
703
704         BUG_ON(!domain->pgd);
705         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
706         parent = domain->pgd;
707
708         while (level > 0) {
709                 void *tmp_page;
710
711                 offset = pfn_level_offset(pfn, level);
712                 pte = &parent[offset];
713                 if (level == 1)
714                         break;
715
716                 if (!dma_pte_present(pte)) {
717                         uint64_t pteval;
718
719                         tmp_page = alloc_pgtable_page(domain->nid);
720
721                         if (!tmp_page)
722                                 return NULL;
723
724                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
725                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
726                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
727                                 /* Someone else set it while we were thinking; use theirs. */
728                                 free_pgtable_page(tmp_page);
729                         } else {
730                                 dma_pte_addr(pte);
731                                 domain_flush_cache(domain, pte, sizeof(*pte));
732                         }
733                 }
734                 parent = phys_to_virt(dma_pte_addr(pte));
735                 level--;
736         }
737
738         return pte;
739 }
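
#if 0
/*
 * A minimal sketch (illustrative only; example_map_one_page() is not part
 * of this driver) of installing a single leaf PTE via pfn_to_dma_pte().
 * The driver's real mapping path (not shown here) additionally checks for
 * overlaps, batches cache flushes and issues IOTLB invalidations.
 */
static int example_map_one_page(struct dmar_domain *domain,
                                unsigned long iov_pfn, unsigned long phys_pfn)
{
        struct dma_pte *pte = pfn_to_dma_pte(domain, iov_pfn);

        if (!pte)
                return -ENOMEM;
        if (dma_pte_present(pte))
                return -EBUSY;

        dma_set_pte_readable(pte);
        dma_set_pte_writable(pte);
        dma_set_pte_pfn(pte, phys_pfn);
        domain_flush_cache(domain, pte, sizeof(*pte));
        return 0;
}
#endif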
740
741 /* return address's pte at specific level */
742 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
743                                          unsigned long pfn,
744                                          int level)
745 {
746         struct dma_pte *parent, *pte = NULL;
747         int total = agaw_to_level(domain->agaw);
748         int offset;
749
750         parent = domain->pgd;
751         while (level <= total) {
752                 offset = pfn_level_offset(pfn, total);
753                 pte = &parent[offset];
754                 if (level == total)
755                         return pte;
756
757                 if (!dma_pte_present(pte))
758                         break;
759                 parent = phys_to_virt(dma_pte_addr(pte));
760                 total--;
761         }
762         return NULL;
763 }
764
765 /* clear last-level ptes; a TLB flush should follow */
766 static void dma_pte_clear_range(struct dmar_domain *domain,
767                                 unsigned long start_pfn,
768                                 unsigned long last_pfn)
769 {
770         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
771         struct dma_pte *first_pte, *pte;
772
773         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
774         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
775         BUG_ON(start_pfn > last_pfn);
776
777         /* we don't need lock here; nobody else touches the iova range */
778         do {
779                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
780                 if (!pte) {
781                         start_pfn = align_to_level(start_pfn + 1, 2);
782                         continue;
783                 }
784                 do { 
785                         dma_clear_pte(pte);
786                         start_pfn++;
787                         pte++;
788                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
789
790                 domain_flush_cache(domain, first_pte,
791                                    (void *)pte - (void *)first_pte);
792
793         } while (start_pfn && start_pfn <= last_pfn);
794 }
795
796 /* free page-table pages; last-level ptes should already be cleared */
797 static void dma_pte_free_pagetable(struct dmar_domain *domain,
798                                    unsigned long start_pfn,
799                                    unsigned long last_pfn)
800 {
801         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
802         struct dma_pte *first_pte, *pte;
803         int total = agaw_to_level(domain->agaw);
804         int level;
805         unsigned long tmp;
806
807         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
808         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
809         BUG_ON(start_pfn > last_pfn);
810
811         /* We don't need lock here; nobody else touches the iova range */
812         level = 2;
813         while (level <= total) {
814                 tmp = align_to_level(start_pfn, level);
815
816                 /* If we can't even clear one PTE at this level, we're done */
817                 if (tmp + level_size(level) - 1 > last_pfn)
818                         return;
819
820                 do {
821                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
822                         if (!pte) {
823                                 tmp = align_to_level(tmp + 1, level + 1);
824                                 continue;
825                         }
826                         do {
827                                 if (dma_pte_present(pte)) {
828                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
829                                         dma_clear_pte(pte);
830                                 }
831                                 pte++;
832                                 tmp += level_size(level);
833                         } while (!first_pte_in_page(pte) &&
834                                  tmp + level_size(level) - 1 <= last_pfn);
835
836                         domain_flush_cache(domain, first_pte,
837                                            (void *)pte - (void *)first_pte);
838                         
839                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
840                 level++;
841         }
842         /* free pgd */
843         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
844                 free_pgtable_page(domain->pgd);
845                 domain->pgd = NULL;
846         }
847 }
848
849 /* iommu handling */
850 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
851 {
852         struct root_entry *root;
853         unsigned long flags;
854
855         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
856         if (!root)
857                 return -ENOMEM;
858
859         __iommu_flush_cache(iommu, root, ROOT_SIZE);
860
861         spin_lock_irqsave(&iommu->lock, flags);
862         iommu->root_entry = root;
863         spin_unlock_irqrestore(&iommu->lock, flags);
864
865         return 0;
866 }
867
868 static void iommu_set_root_entry(struct intel_iommu *iommu)
869 {
870         void *addr;
871         u32 sts;
872         unsigned long flag;
873
874         addr = iommu->root_entry;
875
876         spin_lock_irqsave(&iommu->register_lock, flag);
877         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
878
879         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
880
881         /* Make sure hardware completes it */
882         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
883                       readl, (sts & DMA_GSTS_RTPS), sts);
884
885         spin_unlock_irqrestore(&iommu->register_lock, flag);
886 }
887
888 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
889 {
890         u32 val;
891         unsigned long flag;
892
893         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
894                 return;
895
896         spin_lock_irqsave(&iommu->register_lock, flag);
897         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
898
899         /* Make sure hardware completes it */
900         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
901                       readl, (!(val & DMA_GSTS_WBFS)), val);
902
903         spin_unlock_irqrestore(&iommu->register_lock, flag);
904 }
905
906 /* return value determines if we need a write buffer flush */
907 static void __iommu_flush_context(struct intel_iommu *iommu,
908                                   u16 did, u16 source_id, u8 function_mask,
909                                   u64 type)
910 {
911         u64 val = 0;
912         unsigned long flag;
913
914         switch (type) {
915         case DMA_CCMD_GLOBAL_INVL:
916                 val = DMA_CCMD_GLOBAL_INVL;
917                 break;
918         case DMA_CCMD_DOMAIN_INVL:
919                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
920                 break;
921         case DMA_CCMD_DEVICE_INVL:
922                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
923                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
924                 break;
925         default:
926                 BUG();
927         }
928         val |= DMA_CCMD_ICC;
929
930         spin_lock_irqsave(&iommu->register_lock, flag);
931         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
932
933         /* Make sure hardware completes it */
934         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
935                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
936
937         spin_unlock_irqrestore(&iommu->register_lock, flag);
938 }
939
940 /* return value determines if we need a write buffer flush */
941 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
942                                 u64 addr, unsigned int size_order, u64 type)
943 {
944         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
945         u64 val = 0, val_iva = 0;
946         unsigned long flag;
947
948         switch (type) {
949         case DMA_TLB_GLOBAL_FLUSH:
950                 /* global flush doesn't need to set IVA_REG */
951                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
952                 break;
953         case DMA_TLB_DSI_FLUSH:
954                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
955                 break;
956         case DMA_TLB_PSI_FLUSH:
957                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
958                 /* Note: always flush non-leaf currently */
959                 val_iva = size_order | addr;
960                 break;
961         default:
962                 BUG();
963         }
964         /* Note: set drain read/write */
965 #if 0
966         /*
967          * This is probably only needed to be extra safe; it looks like we
968          * can ignore it without any impact.
969          */
970         if (cap_read_drain(iommu->cap))
971                 val |= DMA_TLB_READ_DRAIN;
972 #endif
973         if (cap_write_drain(iommu->cap))
974                 val |= DMA_TLB_WRITE_DRAIN;
975
976         spin_lock_irqsave(&iommu->register_lock, flag);
977         /* Note: Only uses first TLB reg currently */
978         if (val_iva)
979                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
980         dmar_writeq(iommu->reg + tlb_offset + 8, val);
981
982         /* Make sure hardware completes it */
983         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
984                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
985
986         spin_unlock_irqrestore(&iommu->register_lock, flag);
987
988         /* check IOTLB invalidation granularity */
989         if (DMA_TLB_IAIG(val) == 0)
990                 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
991         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
992                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
993                         (unsigned long long)DMA_TLB_IIRG(type),
994                         (unsigned long long)DMA_TLB_IAIG(val));
995 }
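
/*
 * Page-selective invalidation (PSI) example: size_order is the "address
 * mask" value, i.e. 2^size_order pages are invalidated, so flushing 8
 * contiguous VT-d pages at address A (A aligned to 8 pages) uses
 * addr == A and size_order == 3.  When a request exceeds
 * cap_max_amask_val(), callers such as iommu_flush_iotlb_psi() below
 * fall back to a domain-selective flush instead.
 */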
996
997 static struct device_domain_info *iommu_support_dev_iotlb(
998         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
999 {
1000         int found = 0;
1001         unsigned long flags;
1002         struct device_domain_info *info;
1003         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1004
1005         if (!ecap_dev_iotlb_support(iommu->ecap))
1006                 return NULL;
1007
1008         if (!iommu->qi)
1009                 return NULL;
1010
1011         spin_lock_irqsave(&device_domain_lock, flags);
1012         list_for_each_entry(info, &domain->devices, link)
1013                 if (info->bus == bus && info->devfn == devfn) {
1014                         found = 1;
1015                         break;
1016                 }
1017         spin_unlock_irqrestore(&device_domain_lock, flags);
1018
1019         if (!found || !info->dev)
1020                 return NULL;
1021
1022         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1023                 return NULL;
1024
1025         if (!dmar_find_matched_atsr_unit(info->dev))
1026                 return NULL;
1027
1028         info->iommu = iommu;
1029
1030         return info;
1031 }
1032
1033 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1034 {
1035         if (!info)
1036                 return;
1037
1038         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1039 }
1040
1041 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1042 {
1043         if (!info->dev || !pci_ats_enabled(info->dev))
1044                 return;
1045
1046         pci_disable_ats(info->dev);
1047 }
1048
1049 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1050                                   u64 addr, unsigned mask)
1051 {
1052         u16 sid, qdep;
1053         unsigned long flags;
1054         struct device_domain_info *info;
1055
1056         spin_lock_irqsave(&device_domain_lock, flags);
1057         list_for_each_entry(info, &domain->devices, link) {
1058                 if (!info->dev || !pci_ats_enabled(info->dev))
1059                         continue;
1060
1061                 sid = info->bus << 8 | info->devfn;
1062                 qdep = pci_ats_queue_depth(info->dev);
1063                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1064         }
1065         spin_unlock_irqrestore(&device_domain_lock, flags);
1066 }
1067
1068 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1069                                   unsigned long pfn, unsigned int pages, int map)
1070 {
1071         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1072         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1073
1074         BUG_ON(pages == 0);
1075
1076         /*
1077          * Fall back to domain-selective flush if there is no PSI support or
1078          * the size is too big.
1079          * PSI requires the page count to be a power of two, with the base
1080          * address naturally aligned to that size.
1081          */
1082         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1083                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1084                                                 DMA_TLB_DSI_FLUSH);
1085         else
1086                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1087                                                 DMA_TLB_PSI_FLUSH);
1088
1089         /*
1090          * In caching mode, changes of pages from non-present to present require
1091          * a flush. The device IOTLB, however, need not be flushed in this case.
1092          */
1093         if (!cap_caching_mode(iommu->cap) || !map)
1094                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1095 }
1096
1097 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1098 {
1099         u32 pmen;
1100         unsigned long flags;
1101
1102         spin_lock_irqsave(&iommu->register_lock, flags);
1103         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1104         pmen &= ~DMA_PMEN_EPM;
1105         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1106
1107         /* wait for the protected region status bit to clear */
1108         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1109                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1110
1111         spin_unlock_irqrestore(&iommu->register_lock, flags);
1112 }
1113
1114 static int iommu_enable_translation(struct intel_iommu *iommu)
1115 {
1116         u32 sts;
1117         unsigned long flags;
1118
1119         spin_lock_irqsave(&iommu->register_lock, flags);
1120         iommu->gcmd |= DMA_GCMD_TE;
1121         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1122
1123         /* Make sure hardware completes it */
1124         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1125                       readl, (sts & DMA_GSTS_TES), sts);
1126
1127         spin_unlock_irqrestore(&iommu->register_lock, flags);
1128         return 0;
1129 }
1130
1131 static int iommu_disable_translation(struct intel_iommu *iommu)
1132 {
1133         u32 sts;
1134         unsigned long flag;
1135
1136         spin_lock_irqsave(&iommu->register_lock, flag);
1137         iommu->gcmd &= ~DMA_GCMD_TE;
1138         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1139
1140         /* Make sure hardware completes it */
1141         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1142                       readl, (!(sts & DMA_GSTS_TES)), sts);
1143
1144         spin_unlock_irqrestore(&iommu->register_lock, flag);
1145         return 0;
1146 }
1147
1148
1149 static int iommu_init_domains(struct intel_iommu *iommu)
1150 {
1151         unsigned long ndomains;
1152         unsigned long nlongs;
1153
1154         ndomains = cap_ndoms(iommu->cap);
1155         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1156                         ndomains);
1157         nlongs = BITS_TO_LONGS(ndomains);
1158
1159         spin_lock_init(&iommu->lock);
1160
1161         /* TBD: there might be 64K domains;
1162          * consider a different allocation scheme for future chips
1163          */
1164         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1165         if (!iommu->domain_ids) {
1166                 printk(KERN_ERR "Allocating domain id array failed\n");
1167                 return -ENOMEM;
1168         }
1169         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1170                         GFP_KERNEL);
1171         if (!iommu->domains) {
1172                 printk(KERN_ERR "Allocating domain array failed\n");
1173                 return -ENOMEM;
1174         }
1175
1176         /*
1177          * If Caching mode is set, then invalid translations are tagged
1178          * with domain id 0. Hence we need to pre-allocate it.
1179          */
1180         if (cap_caching_mode(iommu->cap))
1181                 set_bit(0, iommu->domain_ids);
1182         return 0;
1183 }
1184
1185
1186 static void domain_exit(struct dmar_domain *domain);
1187 static void vm_domain_exit(struct dmar_domain *domain);
1188
1189 void free_dmar_iommu(struct intel_iommu *iommu)
1190 {
1191         struct dmar_domain *domain;
1192         int i;
1193         unsigned long flags;
1194
1195         if ((iommu->domains) && (iommu->domain_ids)) {
1196                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1197                         domain = iommu->domains[i];
1198                         clear_bit(i, iommu->domain_ids);
1199
1200                         spin_lock_irqsave(&domain->iommu_lock, flags);
1201                         if (--domain->iommu_count == 0) {
1202                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1203                                         vm_domain_exit(domain);
1204                                 else
1205                                         domain_exit(domain);
1206                         }
1207                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1208                 }
1209         }
1210
1211         if (iommu->gcmd & DMA_GCMD_TE)
1212                 iommu_disable_translation(iommu);
1213
1214         if (iommu->irq) {
1215                 irq_set_handler_data(iommu->irq, NULL);
1216                 /* This will mask the irq */
1217                 free_irq(iommu->irq, iommu);
1218                 destroy_irq(iommu->irq);
1219         }
1220
1221         kfree(iommu->domains);
1222         kfree(iommu->domain_ids);
1223
1224         g_iommus[iommu->seq_id] = NULL;
1225
1226         /* if all iommus are freed, free g_iommus */
1227         for (i = 0; i < g_num_of_iommus; i++) {
1228                 if (g_iommus[i])
1229                         break;
1230         }
1231
1232         if (i == g_num_of_iommus)
1233                 kfree(g_iommus);
1234
1235         /* free context mapping */
1236         free_context_table(iommu);
1237 }
1238
1239 static struct dmar_domain *alloc_domain(void)
1240 {
1241         struct dmar_domain *domain;
1242
1243         domain = alloc_domain_mem();
1244         if (!domain)
1245                 return NULL;
1246
1247         domain->nid = -1;
1248         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1249         domain->flags = 0;
1250
1251         return domain;
1252 }
1253
1254 static int iommu_attach_domain(struct dmar_domain *domain,
1255                                struct intel_iommu *iommu)
1256 {
1257         int num;
1258         unsigned long ndomains;
1259         unsigned long flags;
1260
1261         ndomains = cap_ndoms(iommu->cap);
1262
1263         spin_lock_irqsave(&iommu->lock, flags);
1264
1265         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1266         if (num >= ndomains) {
1267                 spin_unlock_irqrestore(&iommu->lock, flags);
1268                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1269                 return -ENOMEM;
1270         }
1271
1272         domain->id = num;
1273         set_bit(num, iommu->domain_ids);
1274         set_bit(iommu->seq_id, &domain->iommu_bmp);
1275         iommu->domains[num] = domain;
1276         spin_unlock_irqrestore(&iommu->lock, flags);
1277
1278         return 0;
1279 }
1280
1281 static void iommu_detach_domain(struct dmar_domain *domain,
1282                                 struct intel_iommu *iommu)
1283 {
1284         unsigned long flags;
1285         int num, ndomains;
1286         int found = 0;
1287
1288         spin_lock_irqsave(&iommu->lock, flags);
1289         ndomains = cap_ndoms(iommu->cap);
1290         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1291                 if (iommu->domains[num] == domain) {
1292                         found = 1;
1293                         break;
1294                 }
1295         }
1296
1297         if (found) {
1298                 clear_bit(num, iommu->domain_ids);
1299                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1300                 iommu->domains[num] = NULL;
1301         }
1302         spin_unlock_irqrestore(&iommu->lock, flags);
1303 }
1304
1305 static struct iova_domain reserved_iova_list;
1306 static struct lock_class_key reserved_rbtree_key;
1307
1308 static int dmar_init_reserved_ranges(void)
1309 {
1310         struct pci_dev *pdev = NULL;
1311         struct iova *iova;
1312         int i;
1313
1314         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1315
1316         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1317                 &reserved_rbtree_key);
1318
1319         /* IOAPIC ranges shouldn't be accessed by DMA */
1320         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1321                 IOVA_PFN(IOAPIC_RANGE_END));
1322         if (!iova) {
1323                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1324                 return -ENODEV;
1325         }
1326
1327         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1328         for_each_pci_dev(pdev) {
1329                 struct resource *r;
1330
1331                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1332                         r = &pdev->resource[i];
1333                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1334                                 continue;
1335                         iova = reserve_iova(&reserved_iova_list,
1336                                             IOVA_PFN(r->start),
1337                                             IOVA_PFN(r->end));
1338                         if (!iova) {
1339                                 printk(KERN_ERR "Reserve iova failed\n");
1340                                 return -ENODEV;
1341                         }
1342                 }
1343         }
1344         return 0;
1345 }
1346
1347 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1348 {
1349         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1350 }
1351
1352 static inline int guestwidth_to_adjustwidth(int gaw)
1353 {
1354         int agaw;
1355         int r = (gaw - 12) % 9;
1356
1357         if (r == 0)
1358                 agaw = gaw;
1359         else
1360                 agaw = gaw + 9 - r;
1361         if (agaw > 64)
1362                 agaw = 64;
1363         return agaw;
1364 }
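
/*
 * Worked example (illustrative only): guestwidth_to_adjustwidth() rounds a
 * guest address width up to the next 9-bit level boundary above 12 bits,
 * e.g. gaw 36 -> (36 - 12) % 9 == 6 -> adjusted width 39 (a 3-level table),
 * while gaw 48 is already aligned and stays 48 (a 4-level table).
 */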
1365
1366 static int domain_init(struct dmar_domain *domain, int guest_width)
1367 {
1368         struct intel_iommu *iommu;
1369         int adjust_width, agaw;
1370         unsigned long sagaw;
1371
1372         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1373         spin_lock_init(&domain->iommu_lock);
1374
1375         domain_reserve_special_ranges(domain);
1376
1377         /* calculate AGAW */
1378         iommu = domain_get_iommu(domain);
1379         if (guest_width > cap_mgaw(iommu->cap))
1380                 guest_width = cap_mgaw(iommu->cap);
1381         domain->gaw = guest_width;
1382         adjust_width = guestwidth_to_adjustwidth(guest_width);
1383         agaw = width_to_agaw(adjust_width);
1384         sagaw = cap_sagaw(iommu->cap);
1385         if (!test_bit(agaw, &sagaw)) {
1386                 /* hardware doesn't support it, choose a bigger one */
1387                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1388                 agaw = find_next_bit(&sagaw, 5, agaw);
1389                 if (agaw >= 5)
1390                         return -ENODEV;
1391         }
1392         domain->agaw = agaw;
1393         INIT_LIST_HEAD(&domain->devices);
1394
1395         if (ecap_coherent(iommu->ecap))
1396                 domain->iommu_coherency = 1;
1397         else
1398                 domain->iommu_coherency = 0;
1399
1400         if (ecap_sc_support(iommu->ecap))
1401                 domain->iommu_snooping = 1;
1402         else
1403                 domain->iommu_snooping = 0;
1404
1405         domain->iommu_count = 1;
1406         domain->nid = iommu->node;
1407
1408         /* always allocate the top pgd */
1409         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1410         if (!domain->pgd)
1411                 return -ENOMEM;
1412         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1413         return 0;
1414 }
1415
1416 static void domain_exit(struct dmar_domain *domain)
1417 {
1418         struct dmar_drhd_unit *drhd;
1419         struct intel_iommu *iommu;
1420
1421         /* Domain 0 is reserved, so don't process it */
1422         if (!domain)
1423                 return;
1424
1425         /* Flush any lazy unmaps that may reference this domain */
1426         if (!intel_iommu_strict)
1427                 flush_unmaps_timeout(0);
1428
1429         domain_remove_dev_info(domain);
1430         /* destroy iovas */
1431         put_iova_domain(&domain->iovad);
1432
1433         /* clear ptes */
1434         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1435
1436         /* free page tables */
1437         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1438
1439         for_each_active_iommu(iommu, drhd)
1440                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1441                         iommu_detach_domain(domain, iommu);
1442
1443         free_domain_mem(domain);
1444 }
1445
1446 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1447                                  u8 bus, u8 devfn, int translation)
1448 {
1449         struct context_entry *context;
1450         unsigned long flags;
1451         struct intel_iommu *iommu;
1452         struct dma_pte *pgd;
1453         unsigned long num;
1454         unsigned long ndomains;
1455         int id;
1456         int agaw;
1457         struct device_domain_info *info = NULL;
1458
1459         pr_debug("Set context mapping for %02x:%02x.%d\n",
1460                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1461
1462         BUG_ON(!domain->pgd);
1463         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1464                translation != CONTEXT_TT_MULTI_LEVEL);
1465
1466         iommu = device_to_iommu(segment, bus, devfn);
1467         if (!iommu)
1468                 return -ENODEV;
1469
1470         context = device_to_context_entry(iommu, bus, devfn);
1471         if (!context)
1472                 return -ENOMEM;
1473         spin_lock_irqsave(&iommu->lock, flags);
1474         if (context_present(context)) {
1475                 spin_unlock_irqrestore(&iommu->lock, flags);
1476                 return 0;
1477         }
1478
1479         id = domain->id;
1480         pgd = domain->pgd;
1481
1482         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1483             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1484                 int found = 0;
1485
1486                 /* find an available domain id for this device in iommu */
1487                 ndomains = cap_ndoms(iommu->cap);
1488                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1489                         if (iommu->domains[num] == domain) {
1490                                 id = num;
1491                                 found = 1;
1492                                 break;
1493                         }
1494                 }
1495
1496                 if (found == 0) {
1497                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1498                         if (num >= ndomains) {
1499                                 spin_unlock_irqrestore(&iommu->lock, flags);
1500                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1501                                 return -EFAULT;
1502                         }
1503
1504                         set_bit(num, iommu->domain_ids);
1505                         iommu->domains[num] = domain;
1506                         id = num;
1507                 }
1508
1509                 /* Skip top levels of page tables for an
1510                  * iommu whose agaw is smaller than the default.
1511                  * Unnecessary for PT mode.
1512                  */
1513                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1514                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1515                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1516                                 if (!dma_pte_present(pgd)) {
1517                                         spin_unlock_irqrestore(&iommu->lock, flags);
1518                                         return -ENOMEM;
1519                                 }
1520                         }
1521                 }
1522         }
1523
1524         context_set_domain_id(context, id);
1525
1526         if (translation != CONTEXT_TT_PASS_THROUGH) {
1527                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1528                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1529                                      CONTEXT_TT_MULTI_LEVEL;
1530         }
1531         /*
1532          * In pass-through mode, AW must be programmed to indicate the largest
1533          * AGAW value supported by hardware, and ASR is ignored by hardware.
1534          */
1535         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1536                 context_set_address_width(context, iommu->msagaw);
1537         else {
1538                 context_set_address_root(context, virt_to_phys(pgd));
1539                 context_set_address_width(context, iommu->agaw);
1540         }
1541
1542         context_set_translation_type(context, translation);
1543         context_set_fault_enable(context);
1544         context_set_present(context);
1545         domain_flush_cache(domain, context, sizeof(*context));
1546
1547         /*
1548          * It's a non-present to present mapping. If hardware doesn't cache
1549          * non-present entries, we only need to flush the write buffer. If it
1550          * _does_ cache non-present entries, then it does so in the special
1551          * domain #0, which we have to flush:
1552          */
1553         if (cap_caching_mode(iommu->cap)) {
1554                 iommu->flush.flush_context(iommu, 0,
1555                                            (((u16)bus) << 8) | devfn,
1556                                            DMA_CCMD_MASK_NOBIT,
1557                                            DMA_CCMD_DEVICE_INVL);
1558                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1559         } else {
1560                 iommu_flush_write_buffer(iommu);
1561         }
1562         iommu_enable_dev_iotlb(info);
1563         spin_unlock_irqrestore(&iommu->lock, flags);
1564
1565         spin_lock_irqsave(&domain->iommu_lock, flags);
1566         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1567                 domain->iommu_count++;
1568                 if (domain->iommu_count == 1)
1569                         domain->nid = iommu->node;
1570                 domain_update_iommu_cap(domain);
1571         }
1572         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1573         return 0;
1574 }
1575
1576 static int
1577 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1578                         int translation)
1579 {
1580         int ret;
1581         struct pci_dev *tmp, *parent;
1582
1583         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1584                                          pdev->bus->number, pdev->devfn,
1585                                          translation);
1586         if (ret)
1587                 return ret;
1588
1589         /* dependent device mapping */
1590         tmp = pci_find_upstream_pcie_bridge(pdev);
1591         if (!tmp)
1592                 return 0;
1593         /* Secondary interface's bus number and devfn 0 */
1594         parent = pdev->bus->self;
1595         while (parent != tmp) {
1596                 ret = domain_context_mapping_one(domain,
1597                                                  pci_domain_nr(parent->bus),
1598                                                  parent->bus->number,
1599                                                  parent->devfn, translation);
1600                 if (ret)
1601                         return ret;
1602                 parent = parent->bus->self;
1603         }
1604         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1605                 return domain_context_mapping_one(domain,
1606                                         pci_domain_nr(tmp->subordinate),
1607                                         tmp->subordinate->number, 0,
1608                                         translation);
1609         else /* this is a legacy PCI bridge */
1610                 return domain_context_mapping_one(domain,
1611                                                   pci_domain_nr(tmp->bus),
1612                                                   tmp->bus->number,
1613                                                   tmp->devfn,
1614                                                   translation);
1615 }
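
/*
 * Illustrative sketch, not from the original source (the bus/devfn values
 * below are hypothetical): for a conventional PCI device 0000:05:03.0
 * sitting behind a PCIe-to-PCI bridge whose secondary bus is 05,
 * domain_context_mapping() issues roughly
 *
 *   domain_context_mapping_one(domain, 0, 0x05, 0x18, translation);  -- device
 *   domain_context_mapping_one(domain, 0, 0x05, 0x00, translation);  -- bridge
 *
 * i.e. the device itself and the bridge's secondary interface; with deeper
 * hierarchies every intermediate bridge on the path is mapped as well, so
 * DMA arriving with an aliased source-id still hits a valid context entry.
 */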
1616
1617 static int domain_context_mapped(struct pci_dev *pdev)
1618 {
1619         int ret;
1620         struct pci_dev *tmp, *parent;
1621         struct intel_iommu *iommu;
1622
1623         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1624                                 pdev->devfn);
1625         if (!iommu)
1626                 return -ENODEV;
1627
1628         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1629         if (!ret)
1630                 return ret;
1631         /* dependent device mapping */
1632         tmp = pci_find_upstream_pcie_bridge(pdev);
1633         if (!tmp)
1634                 return ret;
1635         /* Secondary interface's bus number and devfn 0 */
1636         parent = pdev->bus->self;
1637         while (parent != tmp) {
1638                 ret = device_context_mapped(iommu, parent->bus->number,
1639                                             parent->devfn);
1640                 if (!ret)
1641                         return ret;
1642                 parent = parent->bus->self;
1643         }
1644         if (pci_is_pcie(tmp))
1645                 return device_context_mapped(iommu, tmp->subordinate->number,
1646                                              0);
1647         else
1648                 return device_context_mapped(iommu, tmp->bus->number,
1649                                              tmp->devfn);
1650 }
1651
1652 /* Returns a number of VTD pages, but aligned to MM page size */
1653 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1654                                             size_t size)
1655 {
1656         host_addr &= ~PAGE_MASK;
1657         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1658 }
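
/*
 * Worked example for aligned_nrpages(), assuming 4KiB pages so that
 * PAGE_SHIFT == VTD_PAGE_SHIFT == 12 (illustrative values): host_addr =
 * 0x1234 and size = 0x2000 leave an in-page offset of 0x234,
 * PAGE_ALIGN(0x234 + 0x2000) == 0x3000, hence 3 VT-d pages -- partially
 * covered first and last pages are counted in full.
 */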
1659
1660 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1661                             struct scatterlist *sg, unsigned long phys_pfn,
1662                             unsigned long nr_pages, int prot)
1663 {
1664         struct dma_pte *first_pte = NULL, *pte = NULL;
1665         phys_addr_t uninitialized_var(pteval);
1666         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1667         unsigned long sg_res;
1668
1669         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1670
1671         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1672                 return -EINVAL;
1673
1674         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1675
1676         if (sg)
1677                 sg_res = 0;
1678         else {
1679                 sg_res = nr_pages + 1;
1680                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1681         }
1682
1683         while (nr_pages--) {
1684                 uint64_t tmp;
1685
1686                 if (!sg_res) {
1687                         sg_res = aligned_nrpages(sg->offset, sg->length);
1688                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1689                         sg->dma_length = sg->length;
1690                         pteval = page_to_phys(sg_page(sg)) | prot;
1691                 }
1692                 if (!pte) {
1693                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1694                         if (!pte)
1695                                 return -ENOMEM;
1696                 }
1697                 /* We don't need a lock here; nobody else
1698                  * touches this iova range
1699                  */
1700                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1701                 if (tmp) {
1702                         static int dumps = 5;
1703                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1704                                iov_pfn, tmp, (unsigned long long)pteval);
1705                         if (dumps) {
1706                                 dumps--;
1707                                 debug_dma_dump_mappings(NULL);
1708                         }
1709                         WARN_ON(1);
1710                 }
1711                 pte++;
1712                 if (!nr_pages || first_pte_in_page(pte)) {
1713                         domain_flush_cache(domain, first_pte,
1714                                            (void *)pte - (void *)first_pte);
1715                         pte = NULL;
1716                 }
1717                 iov_pfn++;
1718                 pteval += VTD_PAGE_SIZE;
1719                 sg_res--;
1720                 if (!sg_res)
1721                         sg = sg_next(sg);
1722         }
1723         return 0;
1724 }
1725
1726 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1727                                     struct scatterlist *sg, unsigned long nr_pages,
1728                                     int prot)
1729 {
1730         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1731 }
1732
1733 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1734                                      unsigned long phys_pfn, unsigned long nr_pages,
1735                                      int prot)
1736 {
1737         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1738 }
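
/*
 * Usage sketch (illustrative, not from the original source): a 1:1
 * mapping of a single 4KiB page at physical address 0x100000 could be
 * expressed as
 *
 *   unsigned long pfn = 0x100000 >> VTD_PAGE_SHIFT;    (== 0x100)
 *   domain_pfn_mapping(domain, pfn, pfn, 1, DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * which is essentially what iommu_domain_identity_map() below does for a
 * whole region.
 */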
1739
1740 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1741 {
1742         if (!iommu)
1743                 return;
1744
1745         clear_context_table(iommu, bus, devfn);
1746         iommu->flush.flush_context(iommu, 0, 0, 0,
1747                                            DMA_CCMD_GLOBAL_INVL);
1748         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1749 }
1750
1751 static void domain_remove_dev_info(struct dmar_domain *domain)
1752 {
1753         struct device_domain_info *info;
1754         unsigned long flags;
1755         struct intel_iommu *iommu;
1756
1757         spin_lock_irqsave(&device_domain_lock, flags);
1758         while (!list_empty(&domain->devices)) {
1759                 info = list_entry(domain->devices.next,
1760                         struct device_domain_info, link);
1761                 list_del(&info->link);
1762                 list_del(&info->global);
1763                 if (info->dev)
1764                         info->dev->dev.archdata.iommu = NULL;
1765                 spin_unlock_irqrestore(&device_domain_lock, flags);
1766
1767                 iommu_disable_dev_iotlb(info);
1768                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1769                 iommu_detach_dev(iommu, info->bus, info->devfn);
1770                 free_devinfo_mem(info);
1771
1772                 spin_lock_irqsave(&device_domain_lock, flags);
1773         }
1774         spin_unlock_irqrestore(&device_domain_lock, flags);
1775 }
1776
1777 /*
1778  * find_domain
1779  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1780  */
1781 static struct dmar_domain *
1782 find_domain(struct pci_dev *pdev)
1783 {
1784         struct device_domain_info *info;
1785
1786         /* No lock here, assumes no domain exit in normal case */
1787         info = pdev->dev.archdata.iommu;
1788         if (info)
1789                 return info->domain;
1790         return NULL;
1791 }
1792
1793 /* domain is initialized */
1794 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1795 {
1796         struct dmar_domain *domain, *found = NULL;
1797         struct intel_iommu *iommu;
1798         struct dmar_drhd_unit *drhd;
1799         struct device_domain_info *info, *tmp;
1800         struct pci_dev *dev_tmp;
1801         unsigned long flags;
1802         int bus = 0, devfn = 0;
1803         int segment;
1804         int ret;
1805
1806         domain = find_domain(pdev);
1807         if (domain)
1808                 return domain;
1809
1810         segment = pci_domain_nr(pdev->bus);
1811
1812         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1813         if (dev_tmp) {
1814                 if (pci_is_pcie(dev_tmp)) {
1815                         bus = dev_tmp->subordinate->number;
1816                         devfn = 0;
1817                 } else {
1818                         bus = dev_tmp->bus->number;
1819                         devfn = dev_tmp->devfn;
1820                 }
1821                 spin_lock_irqsave(&device_domain_lock, flags);
1822                 list_for_each_entry(info, &device_domain_list, global) {
1823                         if (info->segment == segment &&
1824                             info->bus == bus && info->devfn == devfn) {
1825                                 found = info->domain;
1826                                 break;
1827                         }
1828                 }
1829                 spin_unlock_irqrestore(&device_domain_lock, flags);
1830                 /* pcie-pci bridge already has a domain, use it */
1831                 if (found) {
1832                         domain = found;
1833                         goto found_domain;
1834                 }
1835         }
1836
1837         domain = alloc_domain();
1838         if (!domain)
1839                 goto error;
1840
1841         /* Allocate new domain for the device */
1842         drhd = dmar_find_matched_drhd_unit(pdev);
1843         if (!drhd) {
1844                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1845                         pci_name(pdev));
1846                 return NULL;
1847         }
1848         iommu = drhd->iommu;
1849
1850         ret = iommu_attach_domain(domain, iommu);
1851         if (ret) {
1852                 free_domain_mem(domain);
1853                 goto error;
1854         }
1855
1856         if (domain_init(domain, gaw)) {
1857                 domain_exit(domain);
1858                 goto error;
1859         }
1860
1861         /* register pcie-to-pci device */
1862         if (dev_tmp) {
1863                 info = alloc_devinfo_mem();
1864                 if (!info) {
1865                         domain_exit(domain);
1866                         goto error;
1867                 }
1868                 info->segment = segment;
1869                 info->bus = bus;
1870                 info->devfn = devfn;
1871                 info->dev = NULL;
1872                 info->domain = domain;
1873                 /* This domain is shared by devices under p2p bridge */
1874                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1875
1876                 /* pcie-to-pci bridge already has a domain, use it */
1877                 found = NULL;
1878                 spin_lock_irqsave(&device_domain_lock, flags);
1879                 list_for_each_entry(tmp, &device_domain_list, global) {
1880                         if (tmp->segment == segment &&
1881                             tmp->bus == bus && tmp->devfn == devfn) {
1882                                 found = tmp->domain;
1883                                 break;
1884                         }
1885                 }
1886                 if (found) {
1887                         spin_unlock_irqrestore(&device_domain_lock, flags);
1888                         free_devinfo_mem(info);
1889                         domain_exit(domain);
1890                         domain = found;
1891                 } else {
1892                         list_add(&info->link, &domain->devices);
1893                         list_add(&info->global, &device_domain_list);
1894                         spin_unlock_irqrestore(&device_domain_lock, flags);
1895                 }
1896         }
1897
1898 found_domain:
1899         info = alloc_devinfo_mem();
1900         if (!info)
1901                 goto error;
1902         info->segment = segment;
1903         info->bus = pdev->bus->number;
1904         info->devfn = pdev->devfn;
1905         info->dev = pdev;
1906         info->domain = domain;
1907         spin_lock_irqsave(&device_domain_lock, flags);
1908         /* somebody else beat us to it */
1909         found = find_domain(pdev);
1910         if (found != NULL) {
1911                 spin_unlock_irqrestore(&device_domain_lock, flags);
1912                 if (found != domain) {
1913                         domain_exit(domain);
1914                         domain = found;
1915                 }
1916                 free_devinfo_mem(info);
1917                 return domain;
1918         }
1919         list_add(&info->link, &domain->devices);
1920         list_add(&info->global, &device_domain_list);
1921         pdev->dev.archdata.iommu = info;
1922         spin_unlock_irqrestore(&device_domain_lock, flags);
1923         return domain;
1924 error:
1925         /* recheck it here, maybe others set it */
1926         return find_domain(pdev);
1927 }
1928
1929 static int iommu_identity_mapping;
1930 #define IDENTMAP_ALL            1
1931 #define IDENTMAP_GFX            2
1932 #define IDENTMAP_AZALIA         4
1933
1934 static int iommu_domain_identity_map(struct dmar_domain *domain,
1935                                      unsigned long long start,
1936                                      unsigned long long end)
1937 {
1938         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1939         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1940
1941         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1942                           dma_to_mm_pfn(last_vpfn))) {
1943                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1944                 return -ENOMEM;
1945         }
1946
1947         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1948                  start, end, domain->id);
1949         /*
1950          * RMRR range might have overlap with physical memory range,
1951          * clear it first
1952          */
1953         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1954
1955         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1956                                   last_vpfn - first_vpfn + 1,
1957                                   DMA_PTE_READ|DMA_PTE_WRITE);
1958 }
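
/*
 * Worked example (illustrative values): a call with start = 0xdd000000
 * and end = 0xdd0fffff gives first_vpfn = 0xdd000 and last_vpfn = 0xdd0ff,
 * so 256 pages are reserved in the iova allocator and then mapped 1:1
 * (virtual pfn == physical pfn).
 */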
1959
1960 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1961                                       unsigned long long start,
1962                                       unsigned long long end)
1963 {
1964         struct dmar_domain *domain;
1965         int ret;
1966
1967         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1968         if (!domain)
1969                 return -ENOMEM;
1970
1971         /* For _hardware_ passthrough, don't bother. But for software
1972            passthrough, we do it anyway -- it may indicate a memory
1973            range which is reserved in E820 and thus didn't get set
1974            up in si_domain to start with */
1975         if (domain == si_domain && hw_pass_through) {
1976                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1977                        pci_name(pdev), start, end);
1978                 return 0;
1979         }
1980
1981         printk(KERN_INFO
1982                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1983                pci_name(pdev), start, end);
1984
1985         if (end < start) {
1986                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1987                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1988                         dmi_get_system_info(DMI_BIOS_VENDOR),
1989                         dmi_get_system_info(DMI_BIOS_VERSION),
1990                      dmi_get_system_info(DMI_PRODUCT_VERSION));
1991                         dmi_get_system_info(DMI_PRODUCT_VERSION));
1992                 goto error;
1993         }
1994
1995         if (end >> agaw_to_width(domain->agaw)) {
1996                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1997                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1998                      agaw_to_width(domain->agaw),
1999                      dmi_get_system_info(DMI_BIOS_VENDOR),
2000                      dmi_get_system_info(DMI_BIOS_VERSION),
2001                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2002                 ret = -EIO;
2003                 goto error;
2004         }
2005
2006         ret = iommu_domain_identity_map(domain, start, end);
2007         if (ret)
2008                 goto error;
2009
2010         /* context entry init */
2011         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2012         if (ret)
2013                 goto error;
2014
2015         return 0;
2016
2017  error:
2018         domain_exit(domain);
2019         return ret;
2020 }
2021
2022 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2023         struct pci_dev *pdev)
2024 {
2025         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2026                 return 0;
2027         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2028                 rmrr->end_address + 1);
2029 }
2030
2031 #ifdef CONFIG_DMAR_FLOPPY_WA
2032 static inline void iommu_prepare_isa(void)
2033 {
2034         struct pci_dev *pdev;
2035         int ret;
2036
2037         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2038         if (!pdev)
2039                 return;
2040
2041         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2042         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2043
2044         if (ret)
2045                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2046                        "floppy might not work\n");
2047
2048 }
2049 #else
2050 static inline void iommu_prepare_isa(void)
2051 {
2052         return;
2053 }
2054 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2055
2056 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2057
2058 static int __init si_domain_work_fn(unsigned long start_pfn,
2059                                     unsigned long end_pfn, void *datax)
2060 {
2061         int *ret = datax;
2062
2063         *ret = iommu_domain_identity_map(si_domain,
2064                                          (uint64_t)start_pfn << PAGE_SHIFT,
2065                                          (uint64_t)end_pfn << PAGE_SHIFT);
2066         return *ret;
2067
2068 }
2069
2070 static int __init si_domain_init(int hw)
2071 {
2072         struct dmar_drhd_unit *drhd;
2073         struct intel_iommu *iommu;
2074         int nid, ret = 0;
2075
2076         si_domain = alloc_domain();
2077         if (!si_domain)
2078                 return -EFAULT;
2079
2080         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2081
2082         for_each_active_iommu(iommu, drhd) {
2083                 ret = iommu_attach_domain(si_domain, iommu);
2084                 if (ret) {
2085                         domain_exit(si_domain);
2086                         return -EFAULT;
2087                 }
2088         }
2089
2090         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2091                 domain_exit(si_domain);
2092                 return -EFAULT;
2093         }
2094
2095         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2096
2097         if (hw)
2098                 return 0;
2099
2100         for_each_online_node(nid) {
2101                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2102                 if (ret)
2103                         return ret;
2104         }
2105
2106         return 0;
2107 }
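
/*
 * In short (descriptive note): si_domain is the single shared 1:1 domain
 * and is attached to every active IOMMU.  With hardware pass-through
 * (hw != 0) no page tables are populated at all; otherwise every usable
 * RAM range reported via work_with_active_regions() is mapped 1:1 by
 * si_domain_work_fn().
 */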
2108
2109 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2110                                           struct pci_dev *pdev);
2111 static int identity_mapping(struct pci_dev *pdev)
2112 {
2113         struct device_domain_info *info;
2114
2115         if (likely(!iommu_identity_mapping))
2116                 return 0;
2117
2118
2119         list_for_each_entry(info, &si_domain->devices, link)
2120                 if (info->dev == pdev)
2121                         return 1;
2122         return 0;
2123 }
2124
2125 static int domain_add_dev_info(struct dmar_domain *domain,
2126                                struct pci_dev *pdev,
2127                                int translation)
2128 {
2129         struct device_domain_info *info;
2130         unsigned long flags;
2131         int ret;
2132
2133         info = alloc_devinfo_mem();
2134         if (!info)
2135                 return -ENOMEM;
2136
2137         ret = domain_context_mapping(domain, pdev, translation);
2138         if (ret) {
2139                 free_devinfo_mem(info);
2140                 return ret;
2141         }
2142
2143         info->segment = pci_domain_nr(pdev->bus);
2144         info->bus = pdev->bus->number;
2145         info->devfn = pdev->devfn;
2146         info->dev = pdev;
2147         info->domain = domain;
2148
2149         spin_lock_irqsave(&device_domain_lock, flags);
2150         list_add(&info->link, &domain->devices);
2151         list_add(&info->global, &device_domain_list);
2152         pdev->dev.archdata.iommu = info;
2153         spin_unlock_irqrestore(&device_domain_lock, flags);
2154
2155         return 0;
2156 }
2157
2158 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2159 {
2160         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2161                 return 1;
2162
2163         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2164                 return 1;
2165
2166         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2167                 return 0;
2168
2169         /*
2170          * We want to start off with all devices in the 1:1 domain, and
2171          * take them out later if we find they can't access all of memory.
2172          *
2173          * However, we can't do this for PCI devices behind bridges,
2174          * because all PCI devices behind the same bridge will end up
2175          * with the same source-id on their transactions.
2176          *
2177          * Practically speaking, we can't change things around for these
2178          * devices at run-time, because we can't be sure there'll be no
2179          * DMA transactions in flight for any of their siblings.
2180          * 
2181          * So PCI devices (unless they're on the root bus) as well as
2182          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2183          * the 1:1 domain, just in _case_ one of their siblings turns out
2184          * not to be able to map all of memory.
2185          */
2186         if (!pci_is_pcie(pdev)) {
2187                 if (!pci_is_root_bus(pdev->bus))
2188                         return 0;
2189                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2190                         return 0;
2191         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2192                 return 0;
2193
2194         /* 
2195          * At boot time, we don't yet know if devices will be 64-bit capable.
2196          * Assume that they will -- if they turn out not to be, then we can 
2197          * take them out of the 1:1 domain later.
2198          */
2199         if (!startup)
2200                 return pdev->dma_mask > DMA_BIT_MASK(32);
2201
2202         return 1;
2203 }
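
/*
 * Illustrative outcomes of iommu_should_identity_map() when IDENTMAP_ALL
 * is set (hypothetical devices, not from the original source):
 *
 *   - PCIe endpoint, 64-bit dma_mask, at startup        -> identity map
 *   - conventional PCI device behind a PCIe-to-PCI
 *     bridge (i.e. not on the root bus)                 -> never
 *   - the same PCIe endpoint re-checked at run time
 *     with only a 32-bit dma_mask                       -> dropped from
 *                                                          the 1:1 domain
 */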
2204
2205 static int __init iommu_prepare_static_identity_mapping(int hw)
2206 {
2207         struct pci_dev *pdev = NULL;
2208         int ret;
2209
2210         ret = si_domain_init(hw);
2211         if (ret)
2212                 return -EFAULT;
2213
2214         for_each_pci_dev(pdev) {
2215                 if (iommu_should_identity_map(pdev, 1)) {
2216                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2217                                hw ? "hardware" : "software", pci_name(pdev));
2218
2219                         ret = domain_add_dev_info(si_domain, pdev,
2220                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2221                                                      CONTEXT_TT_MULTI_LEVEL);
2222                         if (ret)
2223                                 return ret;
2224                 }
2225         }
2226
2227         return 0;
2228 }
2229
2230 static int __init init_dmars(void)
2231 {
2232         struct dmar_drhd_unit *drhd;
2233         struct dmar_rmrr_unit *rmrr;
2234         struct pci_dev *pdev;
2235         struct intel_iommu *iommu;
2236         int i, ret;
2237
2238         /*
2239          * for each drhd
2240          *    allocate root
2241          *    initialize and program root entry to not present
2242          * endfor
2243          */
2244         for_each_drhd_unit(drhd) {
2245                 g_num_of_iommus++;
2246                 /*
2247                  * lock not needed as this is only incremented in the
2248                  * single-threaded kernel __init code path; all other
2249                  * accesses are read-only
2250                  */
2251         }
2252
2253         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2254                         GFP_KERNEL);
2255         if (!g_iommus) {
2256                 printk(KERN_ERR "Allocating global iommu array failed\n");
2257                 ret = -ENOMEM;
2258                 goto error;
2259         }
2260
2261         deferred_flush = kzalloc(g_num_of_iommus *
2262                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2263         if (!deferred_flush) {
2264                 ret = -ENOMEM;
2265                 goto error;
2266         }
2267
2268         for_each_drhd_unit(drhd) {
2269                 if (drhd->ignored)
2270                         continue;
2271
2272                 iommu = drhd->iommu;
2273                 g_iommus[iommu->seq_id] = iommu;
2274
2275                 ret = iommu_init_domains(iommu);
2276                 if (ret)
2277                         goto error;
2278
2279                 /*
2280                  * TBD:
2281                  * we could share the same root & context tables
2282                  * among all IOMMUs. Need to split it later.
2283                  */
2284                 ret = iommu_alloc_root_entry(iommu);
2285                 if (ret) {
2286                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2287                         goto error;
2288                 }
2289                 if (!ecap_pass_through(iommu->ecap))
2290                         hw_pass_through = 0;
2291         }
2292
2293         /*
2294          * Start from a sane iommu hardware state.
2295          */
2296         for_each_drhd_unit(drhd) {
2297                 if (drhd->ignored)
2298                         continue;
2299
2300                 iommu = drhd->iommu;
2301
2302                 /*
2303                  * If the queued invalidation is already initialized by us
2304                  * (for example, while enabling interrupt-remapping) then
2305                  * things are already rolling from a sane state.
2306                  */
2307                 if (iommu->qi)
2308                         continue;
2309
2310                 /*
2311                  * Clear any previous faults.
2312                  */
2313                 dmar_fault(-1, iommu);
2314                 /*
2315                  * Disable queued invalidation if supported and already enabled
2316                  * before OS handover.
2317                  */
2318                 dmar_disable_qi(iommu);
2319         }
2320
2321         for_each_drhd_unit(drhd) {
2322                 if (drhd->ignored)
2323                         continue;
2324
2325                 iommu = drhd->iommu;
2326
2327                 if (dmar_enable_qi(iommu)) {
2328                         /*
2329                          * Queued Invalidate not enabled, use Register Based
2330                          * Invalidate
2331                          */
2332                         iommu->flush.flush_context = __iommu_flush_context;
2333                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2334                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2335                                "invalidation\n",
2336                                 iommu->seq_id,
2337                                (unsigned long long)drhd->reg_base_addr);
2338                 } else {
2339                         iommu->flush.flush_context = qi_flush_context;
2340                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2341                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2342                                "invalidation\n",
2343                                 iommu->seq_id,
2344                                (unsigned long long)drhd->reg_base_addr);
2345                 }
2346         }
2347
2348         if (iommu_pass_through)
2349                 iommu_identity_mapping |= IDENTMAP_ALL;
2350
2351 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2352         iommu_identity_mapping |= IDENTMAP_GFX;
2353 #endif
2354
2355         check_tylersburg_isoch();
2356
2357         /*
2358          * If any identity mapping (pass-through, gfx, azalia) was requested
2359          * above, set up the static identity (si) domain and its context
2360          * entries now; rmrr and isa ranges are handled further below.
2361          */
2362         if (iommu_identity_mapping) {
2363                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2364                 if (ret) {
2365                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2366                         goto error;
2367                 }
2368         }
2369         /*
2370          * For each rmrr
2371          *   for each dev attached to rmrr
2372          *   do
2373          *     locate drhd for dev, alloc domain for dev
2374          *     allocate free domain
2375          *     allocate page table entries for rmrr
2376          *     if context not allocated for bus
2377          *           allocate and init context
2378          *           set present in root table for this bus
2379          *     init context with domain, translation etc
2380          *    endfor
2381          * endfor
2382          */
2383         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2384         for_each_rmrr_units(rmrr) {
2385                 for (i = 0; i < rmrr->devices_cnt; i++) {
2386                         pdev = rmrr->devices[i];
2387                         /*
2388                          * some BIOSes list non-existent devices in the
2389                          * DMAR table.
2390                          */
2391                         if (!pdev)
2392                                 continue;
2393                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2394                         if (ret)
2395                                 printk(KERN_ERR
2396                                        "IOMMU: mapping reserved region failed\n");
2397                 }
2398         }
2399
2400         iommu_prepare_isa();
2401
2402         /*
2403          * for each drhd
2404          *   enable fault log
2405          *   global invalidate context cache
2406          *   global invalidate iotlb
2407          *   enable translation
2408          */
2409         for_each_drhd_unit(drhd) {
2410                 if (drhd->ignored) {
2411                         /*
2412                          * we always have to disable PMRs or DMA may fail on
2413                          * this device
2414                          */
2415                         if (force_on)
2416                                 iommu_disable_protect_mem_regions(drhd->iommu);
2417                         continue;
2418                 }
2419                 iommu = drhd->iommu;
2420
2421                 iommu_flush_write_buffer(iommu);
2422
2423                 ret = dmar_set_interrupt(iommu);
2424                 if (ret)
2425                         goto error;
2426
2427                 iommu_set_root_entry(iommu);
2428
2429                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2430                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2431
2432                 ret = iommu_enable_translation(iommu);
2433                 if (ret)
2434                         goto error;
2435
2436                 iommu_disable_protect_mem_regions(iommu);
2437         }
2438
2439         return 0;
2440 error:
2441         for_each_drhd_unit(drhd) {
2442                 if (drhd->ignored)
2443                         continue;
2444                 iommu = drhd->iommu;
2445                 free_iommu(iommu);
2446         }
2447         kfree(g_iommus);
2448         return ret;
2449 }
2450
2451 /* This takes a number of _MM_ pages, not VTD pages */
2452 static struct iova *intel_alloc_iova(struct device *dev,
2453                                      struct dmar_domain *domain,
2454                                      unsigned long nrpages, uint64_t dma_mask)
2455 {
2456         struct pci_dev *pdev = to_pci_dev(dev);
2457         struct iova *iova = NULL;
2458
2459         /* Restrict dma_mask to the width that the iommu can handle */
2460         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2461
2462         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2463                 /*
2464                  * First try to allocate an io virtual address in
2465                  * DMA_BIT_MASK(32) and if that fails then try allocating
2466                  * from higher range
2467                  */
2468                 iova = alloc_iova(&domain->iovad, nrpages,
2469                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2470                 if (iova)
2471                         return iova;
2472         }
2473         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2474         if (unlikely(!iova)) {
2475                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2476                        nrpages, pci_name(pdev));
2477                 return NULL;
2478         }
2479
2480         return iova;
2481 }
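
/*
 * Illustrative example (values hypothetical): for a device advertising a
 * 64-bit dma_mask in a domain with gaw == 48, dma_mask is first clamped
 * to DOMAIN_MAX_ADDR(48) and, unless dmar_forcedac is set, the allocator
 * first tries to place the iova below 4GiB (IOVA_PFN(DMA_BIT_MASK(32)))
 * before falling back to the full clamped range.
 */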
2482
2483 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2484 {
2485         struct dmar_domain *domain;
2486         int ret;
2487
2488         domain = get_domain_for_dev(pdev,
2489                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2490         if (!domain) {
2491                 printk(KERN_ERR
2492                         "Allocating domain for %s failed\n", pci_name(pdev));
2493                 return NULL;
2494         }
2495
2496         /* make sure context mapping is ok */
2497         if (unlikely(!domain_context_mapped(pdev))) {
2498                 ret = domain_context_mapping(domain, pdev,
2499                                              CONTEXT_TT_MULTI_LEVEL);
2500                 if (ret) {
2501                         printk(KERN_ERR
2502                                 "Domain context map for %s failed\n",
2503                                 pci_name(pdev));
2504                         return NULL;
2505                 }
2506         }
2507
2508         return domain;
2509 }
2510
2511 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2512 {
2513         struct device_domain_info *info;
2514
2515         /* No lock here, assumes no domain exit in normal case */
2516         info = dev->dev.archdata.iommu;
2517         if (likely(info))
2518                 return info->domain;
2519
2520         return __get_valid_domain_for_dev(dev);
2521 }
2522
2523 static int iommu_dummy(struct pci_dev *pdev)
2524 {
2525         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2526 }
2527
2528 /* Check if the pdev needs to go through non-identity map and unmap process. */
2529 static int iommu_no_mapping(struct device *dev)
2530 {
2531         struct pci_dev *pdev;
2532         int found;
2533
2534         if (unlikely(dev->bus != &pci_bus_type))
2535                 return 1;
2536
2537         pdev = to_pci_dev(dev);
2538         if (iommu_dummy(pdev))
2539                 return 1;
2540
2541         if (!iommu_identity_mapping)
2542                 return 0;
2543
2544         found = identity_mapping(pdev);
2545         if (found) {
2546                 if (iommu_should_identity_map(pdev, 0))
2547                         return 1;
2548                 else {
2549                         /*
2550                          * The 32 bit device is removed from si_domain and
2551                          * falls back to non-identity mapping.
2552                          */
2553                         domain_remove_one_dev_info(si_domain, pdev);
2554                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2555                                pci_name(pdev));
2556                         return 0;
2557                 }
2558         } else {
2559                 /*
2560                  * If a 64 bit DMA device was detached from a VM, the device
2561                  * is put back into si_domain for identity mapping.
2562                  */
2563                 if (iommu_should_identity_map(pdev, 0)) {
2564                         int ret;
2565                         ret = domain_add_dev_info(si_domain, pdev,
2566                                                   hw_pass_through ?
2567                                                   CONTEXT_TT_PASS_THROUGH :
2568                                                   CONTEXT_TT_MULTI_LEVEL);
2569                         if (!ret) {
2570                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2571                                        pci_name(pdev));
2572                                 return 1;
2573                         }
2574                 }
2575         }
2576
2577         return 0;
2578 }
2579
2580 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2581                                      size_t size, int dir, u64 dma_mask)
2582 {
2583         struct pci_dev *pdev = to_pci_dev(hwdev);
2584         struct dmar_domain *domain;
2585         phys_addr_t start_paddr;
2586         struct iova *iova;
2587         int prot = 0;
2588         int ret;
2589         struct intel_iommu *iommu;
2590         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2591
2592         BUG_ON(dir == DMA_NONE);
2593
2594         if (iommu_no_mapping(hwdev))
2595                 return paddr;
2596
2597         domain = get_valid_domain_for_dev(pdev);
2598         if (!domain)
2599                 return 0;
2600
2601         iommu = domain_get_iommu(domain);
2602         size = aligned_nrpages(paddr, size);
2603
2604         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2605                                 pdev->dma_mask);
2606         if (!iova)
2607                 goto error;
2608
2609         /*
2610          * Check if DMAR supports zero-length reads on write-only
2611          * mappings.
2612          */
2613         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2614                         !cap_zlr(iommu->cap))
2615                 prot |= DMA_PTE_READ;
2616         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2617                 prot |= DMA_PTE_WRITE;
2618         /*
2619          * paddr .. (paddr + size) might span a partial page; we should map
2620          * the whole page.  Note: if two parts of one page are mapped
2621          * separately, we might have two guest addresses mapping to the same
2622          * host paddr, but this is not a big problem
2623          */
2624         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2625                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2626         if (ret)
2627                 goto error;
2628
2629         /* it's a non-present to present mapping. Only flush if caching mode */
2630         if (cap_caching_mode(iommu->cap))
2631                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2632         else
2633                 iommu_flush_write_buffer(iommu);
2634
2635         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2636         start_paddr += paddr & ~PAGE_MASK;
2637         return start_paddr;
2638
2639 error:
2640         if (iova)
2641                 __free_iova(&domain->iovad, iova);
2642         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2643                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2644         return 0;
2645 }
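
/*
 * Worked example for __intel_map_single(), assuming 4KiB pages
 * (illustrative values): mapping paddr = 0x12345678 with size = 0x1800
 * spans 2 pages (aligned_nrpages(0x12345678, 0x1800) == 2); if the
 * allocated iova starts at pfn 0xffffe, the returned handle is
 * (0xffffe << PAGE_SHIFT) + 0x678, i.e. the sub-page offset of the
 * original physical address is preserved.
 */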
2646
2647 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2648                                  unsigned long offset, size_t size,
2649                                  enum dma_data_direction dir,
2650                                  struct dma_attrs *attrs)
2651 {
2652         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2653                                   dir, to_pci_dev(dev)->dma_mask);
2654 }
2655
2656 static void flush_unmaps(void)
2657 {
2658         int i, j;
2659
2660         timer_on = 0;
2661
2662         /* just flush them all */
2663         for (i = 0; i < g_num_of_iommus; i++) {
2664                 struct intel_iommu *iommu = g_iommus[i];
2665                 if (!iommu)
2666                         continue;
2667
2668                 if (!deferred_flush[i].next)
2669                         continue;
2670
2671                 /* In caching mode, global flushes make emulation expensive */
2672                 if (!cap_caching_mode(iommu->cap))
2673                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2674                                          DMA_TLB_GLOBAL_FLUSH);
2675                 for (j = 0; j < deferred_flush[i].next; j++) {
2676                         unsigned long mask;
2677                         struct iova *iova = deferred_flush[i].iova[j];
2678                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2679
2680                         /* On real hardware multiple invalidations are expensive */
2681                         if (cap_caching_mode(iommu->cap))
2682                                 iommu_flush_iotlb_psi(iommu, domain->id,
2683                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2684                         else {
2685                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2686                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2687                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2688                         }
2689                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2690                 }
2691                 deferred_flush[i].next = 0;
2692         }
2693
2694         list_size = 0;
2695 }
2696
2697 static void flush_unmaps_timeout(unsigned long data)
2698 {
2699         unsigned long flags;
2700
2701         spin_lock_irqsave(&async_umap_flush_lock, flags);
2702         flush_unmaps();
2703         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2704 }
2705
2706 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2707 {
2708         unsigned long flags;
2709         int next, iommu_id;
2710         struct intel_iommu *iommu;
2711
2712         spin_lock_irqsave(&async_umap_flush_lock, flags);
2713         if (list_size == HIGH_WATER_MARK)
2714                 flush_unmaps();
2715
2716         iommu = domain_get_iommu(dom);
2717         iommu_id = iommu->seq_id;
2718
2719         next = deferred_flush[iommu_id].next;
2720         deferred_flush[iommu_id].domain[next] = dom;
2721         deferred_flush[iommu_id].iova[next] = iova;
2722         deferred_flush[iommu_id].next++;
2723
2724         if (!timer_on) {
2725                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2726                 timer_on = 1;
2727         }
2728         list_size++;
2729         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2730 }
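
/*
 * Deferred-unmap flow in brief (descriptive note): unless
 * intel_iommu_strict is set, intel_unmap_page() and intel_unmap_sg()
 * hand their iova to add_unmap().  Entries are parked in the per-iommu
 * deferred_flush[] tables and drained by flush_unmaps() either once
 * HIGH_WATER_MARK entries have accumulated or when the 10ms unmap_timer
 * fires, so a single round of IOTLB flushing covers many unmaps.
 */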
2731
2732 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2733                              size_t size, enum dma_data_direction dir,
2734                              struct dma_attrs *attrs)
2735 {
2736         struct pci_dev *pdev = to_pci_dev(dev);
2737         struct dmar_domain *domain;
2738         unsigned long start_pfn, last_pfn;
2739         struct iova *iova;
2740         struct intel_iommu *iommu;
2741
2742         if (iommu_no_mapping(dev))
2743                 return;
2744
2745         domain = find_domain(pdev);
2746         BUG_ON(!domain);
2747
2748         iommu = domain_get_iommu(domain);
2749
2750         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2751         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2752                       (unsigned long long)dev_addr))
2753                 return;
2754
2755         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2756         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2757
2758         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2759                  pci_name(pdev), start_pfn, last_pfn);
2760
2761         /*  clear the whole page */
2762         dma_pte_clear_range(domain, start_pfn, last_pfn);
2763
2764         /* free page tables */
2765         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2766
2767         if (intel_iommu_strict) {
2768                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2769                                       last_pfn - start_pfn + 1, 0);
2770                 /* free iova */
2771                 __free_iova(&domain->iovad, iova);
2772         } else {
2773                 add_unmap(domain, iova);
2774                 /*
2775                  * queue up the release of the unmap to save the roughly 1/6th
2776                  * of the cpu time used up by the iotlb flush operation...
2777                  */
2778         }
2779 }
2780
2781 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2782                                   dma_addr_t *dma_handle, gfp_t flags)
2783 {
2784         void *vaddr;
2785         int order;
2786
2787         size = PAGE_ALIGN(size);
2788         order = get_order(size);
2789
2790         if (!iommu_no_mapping(hwdev))
2791                 flags &= ~(GFP_DMA | GFP_DMA32);
2792         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2793                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2794                         flags |= GFP_DMA;
2795                 else
2796                         flags |= GFP_DMA32;
2797         }
2798
2799         vaddr = (void *)__get_free_pages(flags, order);
2800         if (!vaddr)
2801                 return NULL;
2802         memset(vaddr, 0, size);
2803
2804         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2805                                          DMA_BIDIRECTIONAL,
2806                                          hwdev->coherent_dma_mask);
2807         if (*dma_handle)
2808                 return vaddr;
2809         free_pages((unsigned long)vaddr, order);
2810         return NULL;
2811 }
2812
2813 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2814                                 dma_addr_t dma_handle)
2815 {
2816         int order;
2817
2818         size = PAGE_ALIGN(size);
2819         order = get_order(size);
2820
2821         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2822         free_pages((unsigned long)vaddr, order);
2823 }
2824
2825 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2826                            int nelems, enum dma_data_direction dir,
2827                            struct dma_attrs *attrs)
2828 {
2829         struct pci_dev *pdev = to_pci_dev(hwdev);
2830         struct dmar_domain *domain;
2831         unsigned long start_pfn, last_pfn;
2832         struct iova *iova;
2833         struct intel_iommu *iommu;
2834
2835         if (iommu_no_mapping(hwdev))
2836                 return;
2837
2838         domain = find_domain(pdev);
2839         BUG_ON(!domain);
2840
2841         iommu = domain_get_iommu(domain);
2842
2843         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2844         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2845                       (unsigned long long)sglist[0].dma_address))
2846                 return;
2847
2848         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2849         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2850
2851         /*  clear the whole page */
2852         dma_pte_clear_range(domain, start_pfn, last_pfn);
2853
2854         /* free page tables */
2855         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2856
2857         if (intel_iommu_strict) {
2858                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2859                                       last_pfn - start_pfn + 1, 0);
2860                 /* free iova */
2861                 __free_iova(&domain->iovad, iova);
2862         } else {
2863                 add_unmap(domain, iova);
2864                 /*
2865                  * queue up the release of the unmap to save the roughly 1/6th
2866                  * of the cpu time used up by the iotlb flush operation...
2867                  */
2868         }
2869 }
2870
2871 static int intel_nontranslate_map_sg(struct device *hwdev,
2872         struct scatterlist *sglist, int nelems, int dir)
2873 {
2874         int i;
2875         struct scatterlist *sg;
2876
2877         for_each_sg(sglist, sg, nelems, i) {
2878                 BUG_ON(!sg_page(sg));
2879                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2880                 sg->dma_length = sg->length;
2881         }
2882         return nelems;
2883 }
2884
2885 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2886                         enum dma_data_direction dir, struct dma_attrs *attrs)
2887 {
2888         int i;
2889         struct pci_dev *pdev = to_pci_dev(hwdev);
2890         struct dmar_domain *domain;
2891         size_t size = 0;
2892         int prot = 0;
2893         struct iova *iova = NULL;
2894         int ret;
2895         struct scatterlist *sg;
2896         unsigned long start_vpfn;
2897         struct intel_iommu *iommu;
2898
2899         BUG_ON(dir == DMA_NONE);
2900         if (iommu_no_mapping(hwdev))
2901                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2902
2903         domain = get_valid_domain_for_dev(pdev);
2904         if (!domain)
2905                 return 0;
2906
2907         iommu = domain_get_iommu(domain);
2908
2909         for_each_sg(sglist, sg, nelems, i)
2910                 size += aligned_nrpages(sg->offset, sg->length);
2911
2912         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2913                                 pdev->dma_mask);
2914         if (!iova) {
2915                 sglist->dma_length = 0;
2916                 return 0;
2917         }
2918
2919         /*
2920          * Check if DMAR supports zero-length reads on write-only
2921          * mappings.
2922          */
2923         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2924                         !cap_zlr(iommu->cap))
2925                 prot |= DMA_PTE_READ;
2926         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2927                 prot |= DMA_PTE_WRITE;
2928
2929         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2930
2931         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2932         if (unlikely(ret)) {
2933                 /*  clear the page */
2934                 dma_pte_clear_range(domain, start_vpfn,
2935                                     start_vpfn + size - 1);
2936                 /* free page tables */
2937                 dma_pte_free_pagetable(domain, start_vpfn,
2938                                        start_vpfn + size - 1);
2939                 /* free iova */
2940                 __free_iova(&domain->iovad, iova);
2941                 return 0;
2942         }
2943
2944         /* it's a non-present to present mapping. Only flush if caching mode */
2945         if (cap_caching_mode(iommu->cap))
2946                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
2947         else
2948                 iommu_flush_write_buffer(iommu);
2949
2950         return nelems;
2951 }
2952
2953 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2954 {
2955         return !dma_addr;
2956 }
2957
2958 struct dma_map_ops intel_dma_ops = {
2959         .alloc_coherent = intel_alloc_coherent,
2960         .free_coherent = intel_free_coherent,
2961         .map_sg = intel_map_sg,
2962         .unmap_sg = intel_unmap_sg,
2963         .map_page = intel_map_page,
2964         .unmap_page = intel_unmap_page,
2965         .mapping_error = intel_mapping_error,
2966 };
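
/*
 * Usage sketch (hypothetical driver code, not part of this file): devices
 * never call the functions above directly; they go through the generic
 * DMA API, which dispatches to intel_dma_ops when this driver is active:
 *
 *   dma_addr_t handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
 *   if (dma_mapping_error(&pdev->dev, handle))
 *           goto fail;
 *   ...
 *   dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
 */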
2967
2968 static inline int iommu_domain_cache_init(void)
2969 {
2970         int ret = 0;
2971
2972         iommu_domain_cache = kmem_cache_create("iommu_domain",
2973                                          sizeof(struct dmar_domain),
2974                                          0,
2975                                          SLAB_HWCACHE_ALIGN,
2976                                          NULL);
2977
2978         if (!iommu_domain_cache) {
2979                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2980                 ret = -ENOMEM;
2981         }
2982
2983         return ret;
2984 }
2985
2986 static inline int iommu_devinfo_cache_init(void)
2987 {
2988         int ret = 0;
2989
2990         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2991                                          sizeof(struct device_domain_info),
2992                                          0,
2993                                          SLAB_HWCACHE_ALIGN,
2994                                          NULL);
2995         if (!iommu_devinfo_cache) {
2996                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2997                 ret = -ENOMEM;
2998         }
2999
3000         return ret;
3001 }
3002
3003 static inline int iommu_iova_cache_init(void)
3004 {
3005         int ret = 0;
3006
3007         iommu_iova_cache = kmem_cache_create("iommu_iova",
3008                                          sizeof(struct iova),
3009                                          0,
3010                                          SLAB_HWCACHE_ALIGN,
3011                                          NULL);
3012         if (!iommu_iova_cache) {
3013                 printk(KERN_ERR "Couldn't create iova cache\n");
3014                 ret = -ENOMEM;
3015         }
3016
3017         return ret;
3018 }
3019
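/*
 * Create all three caches; on failure, tear down whatever was already
 * created before returning -ENOMEM.
 */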
3020 static int __init iommu_init_mempool(void)
3021 {
3022         int ret;
3023         ret = iommu_iova_cache_init();
3024         if (ret)
3025                 return ret;
3026
3027         ret = iommu_domain_cache_init();
3028         if (ret)
3029                 goto domain_error;
3030
3031         ret = iommu_devinfo_cache_init();
3032         if (!ret)
3033                 return ret;
3034
3035         kmem_cache_destroy(iommu_domain_cache);
3036 domain_error:
3037         kmem_cache_destroy(iommu_iova_cache);
3038
3039         return -ENOMEM;
3040 }
3041
3042 static void __init iommu_exit_mempool(void)
3043 {
3044         kmem_cache_destroy(iommu_devinfo_cache);
3045         kmem_cache_destroy(iommu_domain_cache);
3046         kmem_cache_destroy(iommu_iova_cache);
3047
3048 }
3049
3050 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3051 {
3052         struct dmar_drhd_unit *drhd;
3053         u32 vtbar;
3054         int rc;
3055
3056         /* We know that this device on this chipset has its own IOMMU.
3057          * If we find it under a different IOMMU, then the BIOS is lying
3058          * to us. Hope that the IOMMU for this device is actually
3059          * disabled, and it needs no translation...
3060          */
3061         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3062         if (rc) {
3063                 /* "can't" happen */
3064                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3065                 return;
3066         }
3067         vtbar &= 0xffff0000;
3068
3069         /* we know that this iommu should be at offset 0xa000 from vtbar */
3070         drhd = dmar_find_matched_drhd_unit(pdev);
3071         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3072                             TAINT_FIRMWARE_WORKAROUND,
3073                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3074                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3075 }
3076 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3077
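/*
 * Mark DMAR units that cover no PCI devices as ignored.  When gfx mapping
 * is disabled, also ignore units that cover only graphics devices and give
 * those devices the dummy (no-translation) domain info.
 */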
3078 static void __init init_no_remapping_devices(void)
3079 {
3080         struct dmar_drhd_unit *drhd;
3081
3082         for_each_drhd_unit(drhd) {
3083                 if (!drhd->include_all) {
3084                         int i;
3085                         for (i = 0; i < drhd->devices_cnt; i++)
3086                                 if (drhd->devices[i] != NULL)
3087                                         break;
3088                         /* ignore DMAR unit if no PCI devices exist */
3089                         if (i == drhd->devices_cnt)
3090                                 drhd->ignored = 1;
3091                 }
3092         }
3093
3094         if (dmar_map_gfx)
3095                 return;
3096
3097         for_each_drhd_unit(drhd) {
3098                 int i;
3099                 if (drhd->ignored || drhd->include_all)
3100                         continue;
3101
3102                 for (i = 0; i < drhd->devices_cnt; i++)
3103                         if (drhd->devices[i] &&
3104                                 !IS_GFX_DEVICE(drhd->devices[i]))
3105                                 break;
3106
3107                 if (i < drhd->devices_cnt)
3108                         continue;
3109
3110                 /* bypass IOMMU if it is just for gfx devices */
3111                 drhd->ignored = 1;
3112                 for (i = 0; i < drhd->devices_cnt; i++) {
3113                         if (!drhd->devices[i])
3114                                 continue;
3115                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3116                 }
3117         }
3118 }
3119
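/*
 * Suspend/resume support: save and restore the fault-event registers and
 * re-enable queued invalidation and translation across a sleep transition.
 */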
3120 #ifdef CONFIG_SUSPEND
3121 static int init_iommu_hw(void)
3122 {
3123         struct dmar_drhd_unit *drhd;
3124         struct intel_iommu *iommu = NULL;
3125
3126         for_each_active_iommu(iommu, drhd)
3127                 if (iommu->qi)
3128                         dmar_reenable_qi(iommu);
3129
3130         for_each_iommu(iommu, drhd) {
3131                 if (drhd->ignored) {
3132                         /*
3133                          * we always have to disable PMRs or DMA may fail on
3134                          * this device
3135                          */
3136                         if (force_on)
3137                                 iommu_disable_protect_mem_regions(iommu);
3138                         continue;
3139                 }
3140
3141                 iommu_flush_write_buffer(iommu);
3142
3143                 iommu_set_root_entry(iommu);
3144
3145                 iommu->flush.flush_context(iommu, 0, 0, 0,
3146                                            DMA_CCMD_GLOBAL_INVL);
3147                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3148                                          DMA_TLB_GLOBAL_FLUSH);
3149                 if (iommu_enable_translation(iommu))
3150                         return 1;
3151                 iommu_disable_protect_mem_regions(iommu);
3152         }
3153
3154         return 0;
3155 }
3156
3157 static void iommu_flush_all(void)
3158 {
3159         struct dmar_drhd_unit *drhd;
3160         struct intel_iommu *iommu;
3161
3162         for_each_active_iommu(iommu, drhd) {
3163                 iommu->flush.flush_context(iommu, 0, 0, 0,
3164                                            DMA_CCMD_GLOBAL_INVL);
3165                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3166                                          DMA_TLB_GLOBAL_FLUSH);
3167         }
3168 }
3169
3170 static int iommu_suspend(void)
3171 {
3172         struct dmar_drhd_unit *drhd;
3173         struct intel_iommu *iommu = NULL;
3174         unsigned long flag;
3175
3176         for_each_active_iommu(iommu, drhd) {
3177                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3178                                                  GFP_ATOMIC);
3179                 if (!iommu->iommu_state)
3180                         goto nomem;
3181         }
3182
3183         iommu_flush_all();
3184
3185         for_each_active_iommu(iommu, drhd) {
3186                 iommu_disable_translation(iommu);
3187
3188                 spin_lock_irqsave(&iommu->register_lock, flag);
3189
3190                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3191                         readl(iommu->reg + DMAR_FECTL_REG);
3192                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3193                         readl(iommu->reg + DMAR_FEDATA_REG);
3194                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3195                         readl(iommu->reg + DMAR_FEADDR_REG);
3196                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3197                         readl(iommu->reg + DMAR_FEUADDR_REG);
3198
3199                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3200         }
3201         return 0;
3202
3203 nomem:
3204         for_each_active_iommu(iommu, drhd)
3205                 kfree(iommu->iommu_state);
3206
3207         return -ENOMEM;
3208 }
3209
3210 static void iommu_resume(void)
3211 {
3212         struct dmar_drhd_unit *drhd;
3213         struct intel_iommu *iommu = NULL;
3214         unsigned long flag;
3215
3216         if (init_iommu_hw()) {
3217                 if (force_on)
3218                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3219                 else
3220                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3221                 return;
3222         }
3223
3224         for_each_active_iommu(iommu, drhd) {
3225
3226                 spin_lock_irqsave(&iommu->register_lock, flag);
3227
3228                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3229                         iommu->reg + DMAR_FECTL_REG);
3230                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3231                         iommu->reg + DMAR_FEDATA_REG);
3232                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3233                         iommu->reg + DMAR_FEADDR_REG);
3234                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3235                         iommu->reg + DMAR_FEUADDR_REG);
3236
3237                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3238         }
3239
3240         for_each_active_iommu(iommu, drhd)
3241                 kfree(iommu->iommu_state);
3242 }
3243
3244 static struct syscore_ops iommu_syscore_ops = {
3245         .resume         = iommu_resume,
3246         .suspend        = iommu_suspend,
3247 };
3248
3249 static void __init init_iommu_pm_ops(void)
3250 {
3251         register_syscore_ops(&iommu_syscore_ops);
3252 }
3253
3254 #else
3255 static inline void init_iommu_pm_ops(void) {}
3256 #endif  /* CONFIG_SUSPEND */
3257
3258 /*
3259  * Here we only respond to a device being unbound from its driver.
3260  *
3261  * A newly added device is not attached to its DMAR domain here yet; that
3262  * happens when the device is first mapped to an iova.
3263  */
3264 static int device_notifier(struct notifier_block *nb,
3265                                   unsigned long action, void *data)
3266 {
3267         struct device *dev = data;
3268         struct pci_dev *pdev = to_pci_dev(dev);
3269         struct dmar_domain *domain;
3270
3271         if (iommu_no_mapping(dev))
3272                 return 0;
3273
3274         domain = find_domain(pdev);
3275         if (!domain)
3276                 return 0;
3277
3278         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3279                 domain_remove_one_dev_info(domain, pdev);
3280
3281                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3282                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3283                     list_empty(&domain->devices))
3284                         domain_exit(domain);
3285         }
3286
3287         return 0;
3288 }
3289
3290 static struct notifier_block device_nb = {
3291         .notifier_call = device_notifier,
3292 };
3293
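/*
 * Entry point for DMA-remapping initialization: parse the DMAR table,
 * set up each IOMMU via init_dmars() and install intel_dma_ops as the
 * platform dma_map_ops.
 */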
3294 int __init intel_iommu_init(void)
3295 {
3296         int ret = 0;
3297
3298         /* VT-d is required for a TXT/tboot launch, so enforce that */
3299         force_on = tboot_force_iommu();
3300
3301         if (dmar_table_init()) {
3302                 if (force_on)
3303                         panic("tboot: Failed to initialize DMAR table\n");
3304                 return  -ENODEV;
3305         }
3306
3307         if (dmar_dev_scope_init()) {
3308                 if (force_on)
3309                         panic("tboot: Failed to initialize DMAR device scope\n");
3310                 return  -ENODEV;
3311         }
3312
3313         /*
3314          * Check the need for DMA-remapping initialization now.
3315          * The above initialization will also be used by interrupt remapping.
3316          */
3317         if (no_iommu || dmar_disabled)
3318                 return -ENODEV;
3319
3320         if (iommu_init_mempool()) {
3321                 if (force_on)
3322                         panic("tboot: Failed to initialize iommu memory\n");
3323                 return  -ENODEV;
3324         }
3325
3326         if (dmar_init_reserved_ranges()) {
3327                 if (force_on)
3328                         panic("tboot: Failed to reserve iommu ranges\n");
3329                 return  -ENODEV;
3330         }
3331
3332         init_no_remapping_devices();
3333
3334         ret = init_dmars();
3335         if (ret) {
3336                 if (force_on)
3337                         panic("tboot: Failed to initialize DMARs\n");
3338                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3339                 put_iova_domain(&reserved_iova_list);
3340                 iommu_exit_mempool();
3341                 return ret;
3342         }
3343         printk(KERN_INFO
3344         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3345
3346         init_timer(&unmap_timer);
3347 #ifdef CONFIG_SWIOTLB
3348         swiotlb = 0;
3349 #endif
3350         dma_ops = &intel_dma_ops;
3351
3352         init_iommu_pm_ops();
3353
3354         register_iommu(&intel_iommu_ops);
3355
3356         bus_register_notifier(&pci_bus_type, &device_nb);
3357
3358         return 0;
3359 }
3360
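/*
 * Clear the context entries that were set up on behalf of pdev for any
 * bridges between it and its upstream PCIe port.
 */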
3361 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3362                                            struct pci_dev *pdev)
3363 {
3364         struct pci_dev *tmp, *parent;
3365
3366         if (!iommu || !pdev)
3367                 return;
3368
3369         /* dependent device detach */
3370         tmp = pci_find_upstream_pcie_bridge(pdev);
3371         /* Secondary interface's bus number and devfn 0 */
3372         if (tmp) {
3373                 parent = pdev->bus->self;
3374                 while (parent != tmp) {
3375                         iommu_detach_dev(iommu, parent->bus->number,
3376                                          parent->devfn);
3377                         parent = parent->bus->self;
3378                 }
3379                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3380                         iommu_detach_dev(iommu,
3381                                 tmp->subordinate->number, 0);
3382                 else /* this is a legacy PCI bridge */
3383                         iommu_detach_dev(iommu, tmp->bus->number,
3384                                          tmp->devfn);
3385         }
3386 }
3387
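/*
 * Remove pdev from its domain.  If no other device in the domain sits
 * behind the same IOMMU, release that IOMMU's domain id and update the
 * domain's iommu bitmap and capabilities.
 */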
3388 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3389                                           struct pci_dev *pdev)
3390 {
3391         struct device_domain_info *info;
3392         struct intel_iommu *iommu;
3393         unsigned long flags;
3394         int found = 0;
3395         struct list_head *entry, *tmp;
3396
3397         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3398                                 pdev->devfn);
3399         if (!iommu)
3400                 return;
3401
3402         spin_lock_irqsave(&device_domain_lock, flags);
3403         list_for_each_safe(entry, tmp, &domain->devices) {
3404                 info = list_entry(entry, struct device_domain_info, link);
3405                 /* No need to compare PCI domain; it has to be the same */
3406                 if (info->bus == pdev->bus->number &&
3407                     info->devfn == pdev->devfn) {
3408                         list_del(&info->link);
3409                         list_del(&info->global);
3410                         if (info->dev)
3411                                 info->dev->dev.archdata.iommu = NULL;
3412                         spin_unlock_irqrestore(&device_domain_lock, flags);
3413
3414                         iommu_disable_dev_iotlb(info);
3415                         iommu_detach_dev(iommu, info->bus, info->devfn);
3416                         iommu_detach_dependent_devices(iommu, pdev);
3417                         free_devinfo_mem(info);
3418
3419                         spin_lock_irqsave(&device_domain_lock, flags);
3420
3421                         if (found)
3422                                 break;
3423                         else
3424                                 continue;
3425                 }
3426
3427                 /* if there are no other devices under the same iommu
3428                  * owned by this domain, clear this iommu in iommu_bmp,
3429                  * update iommu count and coherency
3430                  */
3431                 if (iommu == device_to_iommu(info->segment, info->bus,
3432                                             info->devfn))
3433                         found = 1;
3434         }
3435
3436         if (found == 0) {
3437                 unsigned long tmp_flags;
3438                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3439                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3440                 domain->iommu_count--;
3441                 domain_update_iommu_cap(domain);
3442                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3443
3444                 spin_lock_irqsave(&iommu->lock, tmp_flags);
3445                 clear_bit(domain->id, iommu->domain_ids);
3446                 iommu->domains[domain->id] = NULL;
3447                 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3448         }
3449
3450         spin_unlock_irqrestore(&device_domain_lock, flags);
3451 }
3452
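/*
 * Detach every device from a virtual machine domain, dropping the
 * per-IOMMU bookkeeping as each one goes away.
 */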
3453 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3454 {
3455         struct device_domain_info *info;
3456         struct intel_iommu *iommu;
3457         unsigned long flags1, flags2;
3458
3459         spin_lock_irqsave(&device_domain_lock, flags1);
3460         while (!list_empty(&domain->devices)) {
3461                 info = list_entry(domain->devices.next,
3462                         struct device_domain_info, link);
3463                 list_del(&info->link);
3464                 list_del(&info->global);
3465                 if (info->dev)
3466                         info->dev->dev.archdata.iommu = NULL;
3467
3468                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3469
3470                 iommu_disable_dev_iotlb(info);
3471                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3472                 iommu_detach_dev(iommu, info->bus, info->devfn);
3473                 iommu_detach_dependent_devices(iommu, info->dev);
3474
3475                 /* clear this iommu in iommu_bmp, update iommu count
3476                  * and capabilities
3477                  */
3478                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3479                 if (test_and_clear_bit(iommu->seq_id,
3480                                        &domain->iommu_bmp)) {
3481                         domain->iommu_count--;
3482                         domain_update_iommu_cap(domain);
3483                 }
3484                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3485
3486                 free_devinfo_mem(info);
3487                 spin_lock_irqsave(&device_domain_lock, flags1);
3488         }
3489         spin_unlock_irqrestore(&device_domain_lock, flags1);
3490 }
3491
3492 /* domain id for virtual machine; it won't be set in context entries */
3493 static unsigned long vm_domid;
3494
3495 static struct dmar_domain *iommu_alloc_vm_domain(void)
3496 {
3497         struct dmar_domain *domain;
3498
3499         domain = alloc_domain_mem();
3500         if (!domain)
3501                 return NULL;
3502
3503         domain->id = vm_domid++;
3504         domain->nid = -1;
3505         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3506         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3507
3508         return domain;
3509 }
3510
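/*
 * Initialise a domain created through the generic IOMMU API: set up its
 * iova allocator, address width and top-level page table.  No IOMMU
 * hardware state is programmed here; that happens at attach time.
 */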
3511 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3512 {
3513         int adjust_width;
3514
3515         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3516         spin_lock_init(&domain->iommu_lock);
3517
3518         domain_reserve_special_ranges(domain);
3519
3520         /* calculate AGAW */
3521         domain->gaw = guest_width;
3522         adjust_width = guestwidth_to_adjustwidth(guest_width);
3523         domain->agaw = width_to_agaw(adjust_width);
3524
3525         INIT_LIST_HEAD(&domain->devices);
3526
3527         domain->iommu_count = 0;
3528         domain->iommu_coherency = 0;
3529         domain->iommu_snooping = 0;
3530         domain->max_addr = 0;
3531         domain->nid = -1;
3532
3533         /* always allocate the top pgd */
3534         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3535         if (!domain->pgd)
3536                 return -ENOMEM;
3537         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3538         return 0;
3539 }
3540
3541 static void iommu_free_vm_domain(struct dmar_domain *domain)
3542 {
3543         unsigned long flags;
3544         struct dmar_drhd_unit *drhd;
3545         struct intel_iommu *iommu;
3546         unsigned long i;
3547         unsigned long ndomains;
3548
3549         for_each_drhd_unit(drhd) {
3550                 if (drhd->ignored)
3551                         continue;
3552                 iommu = drhd->iommu;
3553
3554                 ndomains = cap_ndoms(iommu->cap);
3555                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3556                         if (iommu->domains[i] == domain) {
3557                                 spin_lock_irqsave(&iommu->lock, flags);
3558                                 clear_bit(i, iommu->domain_ids);
3559                                 iommu->domains[i] = NULL;
3560                                 spin_unlock_irqrestore(&iommu->lock, flags);
3561                                 break;
3562                         }
3563                 }
3564         }
3565 }
3566
3567 static void vm_domain_exit(struct dmar_domain *domain)
3568 {
3569         /* Domain 0 is reserved, so don't process it */
3570         if (!domain)
3571                 return;
3572
3573         vm_domain_remove_all_dev_info(domain);
3574         /* destroy iovas */
3575         put_iova_domain(&domain->iovad);
3576
3577         /* clear ptes */
3578         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3579
3580         /* free page tables */
3581         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3582
3583         iommu_free_vm_domain(domain);
3584         free_domain_mem(domain);
3585 }
3586
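/* Callbacks implementing the generic IOMMU API (struct iommu_ops). */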
3587 static int intel_iommu_domain_init(struct iommu_domain *domain)
3588 {
3589         struct dmar_domain *dmar_domain;
3590
3591         dmar_domain = iommu_alloc_vm_domain();
3592         if (!dmar_domain) {
3593                 printk(KERN_ERR
3594                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3595                 return -ENOMEM;
3596         }
3597         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3598                 printk(KERN_ERR
3599                         "intel_iommu_domain_init() failed\n");
3600                 vm_domain_exit(dmar_domain);
3601                 return -ENOMEM;
3602         }
3603         domain->priv = dmar_domain;
3604
3605         return 0;
3606 }
3607
3608 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3609 {
3610         struct dmar_domain *dmar_domain = domain->priv;
3611
3612         domain->priv = NULL;
3613         vm_domain_exit(dmar_domain);
3614 }
3615
3616 static int intel_iommu_attach_device(struct iommu_domain *domain,
3617                                      struct device *dev)
3618 {
3619         struct dmar_domain *dmar_domain = domain->priv;
3620         struct pci_dev *pdev = to_pci_dev(dev);
3621         struct intel_iommu *iommu;
3622         int addr_width;
3623
3624         /* normally pdev is not mapped */
3625         if (unlikely(domain_context_mapped(pdev))) {
3626                 struct dmar_domain *old_domain;
3627
3628                 old_domain = find_domain(pdev);
3629                 if (old_domain) {
3630                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3631                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3632                                 domain_remove_one_dev_info(old_domain, pdev);
3633                         else
3634                                 domain_remove_dev_info(old_domain);
3635                 }
3636         }
3637
3638         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3639                                 pdev->devfn);
3640         if (!iommu)
3641                 return -ENODEV;
3642
3643         /* check if this iommu agaw is sufficient for max mapped address */
3644         addr_width = agaw_to_width(iommu->agaw);
3645         if (addr_width > cap_mgaw(iommu->cap))
3646                 addr_width = cap_mgaw(iommu->cap);
3647
3648         if (dmar_domain->max_addr > (1LL << addr_width)) {
3649                 printk(KERN_ERR "%s: iommu width (%d) is not "
3650                        "sufficient for the mapped address (%llx)\n",
3651                        __func__, addr_width, dmar_domain->max_addr);
3652                 return -EFAULT;
3653         }
3654         dmar_domain->gaw = addr_width;
3655
3656         /*
3657          * Knock out extra levels of page tables if necessary
3658          */
3659         while (iommu->agaw < dmar_domain->agaw) {
3660                 struct dma_pte *pte;
3661
3662                 pte = dmar_domain->pgd;
3663                 if (dma_pte_present(pte)) {
3664                         dmar_domain->pgd = (struct dma_pte *)
3665                                 phys_to_virt(dma_pte_addr(pte));
3666                         free_pgtable_page(pte);
3667                 }
3668                 dmar_domain->agaw--;
3669         }
3670
3671         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3672 }
3673
3674 static void intel_iommu_detach_device(struct iommu_domain *domain,
3675                                       struct device *dev)
3676 {
3677         struct dmar_domain *dmar_domain = domain->priv;
3678         struct pci_dev *pdev = to_pci_dev(dev);
3679
3680         domain_remove_one_dev_info(dmar_domain, pdev);
3681 }
3682
3683 static int intel_iommu_map(struct iommu_domain *domain,
3684                            unsigned long iova, phys_addr_t hpa,
3685                            int gfp_order, int iommu_prot)
3686 {
3687         struct dmar_domain *dmar_domain = domain->priv;
3688         u64 max_addr;
3689         int prot = 0;
3690         size_t size;
3691         int ret;
3692
3693         if (iommu_prot & IOMMU_READ)
3694                 prot |= DMA_PTE_READ;
3695         if (iommu_prot & IOMMU_WRITE)
3696                 prot |= DMA_PTE_WRITE;
3697         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3698                 prot |= DMA_PTE_SNP;
3699
3700         size     = PAGE_SIZE << gfp_order;
3701         max_addr = iova + size;
3702         if (dmar_domain->max_addr < max_addr) {
3703                 u64 end;
3704
3705                 /* check if minimum agaw is sufficient for mapped address */
3706                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3707                 if (end < max_addr) {
3708                         printk(KERN_ERR "%s: iommu width (%d) is not "
3709                                "sufficient for the mapped address (%llx)\n",
3710                                __func__, dmar_domain->gaw, max_addr);
3711                         return -EFAULT;
3712                 }
3713                 dmar_domain->max_addr = max_addr;
3714         }
3715         /* Round up size to next multiple of PAGE_SIZE, if it and
3716            the low bits of hpa would take us onto the next page */
3717         size = aligned_nrpages(hpa, size);
3718         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3719                                  hpa >> VTD_PAGE_SHIFT, size, prot);
3720         return ret;
3721 }
3722
3723 static int intel_iommu_unmap(struct iommu_domain *domain,
3724                              unsigned long iova, int gfp_order)
3725 {
3726         struct dmar_domain *dmar_domain = domain->priv;
3727         size_t size = PAGE_SIZE << gfp_order;
3728
3729         dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3730                             (iova + size - 1) >> VTD_PAGE_SHIFT);
3731
3732         if (dmar_domain->max_addr == iova + size)
3733                 dmar_domain->max_addr = iova;
3734
3735         return gfp_order;
3736 }
3737
3738 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3739                                             unsigned long iova)
3740 {
3741         struct dmar_domain *dmar_domain = domain->priv;
3742         struct dma_pte *pte;
3743         u64 phys = 0;
3744
3745         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3746         if (pte)
3747                 phys = dma_pte_addr(pte);
3748
3749         return phys;
3750 }
3751
3752 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3753                                       unsigned long cap)
3754 {
3755         struct dmar_domain *dmar_domain = domain->priv;
3756
3757         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3758                 return dmar_domain->iommu_snooping;
3759         if (cap == IOMMU_CAP_INTR_REMAP)
3760                 return intr_remapping_enabled;
3761
3762         return 0;
3763 }
3764
3765 static struct iommu_ops intel_iommu_ops = {
3766         .domain_init    = intel_iommu_domain_init,
3767         .domain_destroy = intel_iommu_domain_destroy,
3768         .attach_dev     = intel_iommu_attach_device,
3769         .detach_dev     = intel_iommu_detach_device,
3770         .map            = intel_iommu_map,
3771         .unmap          = intel_iommu_unmap,
3772         .iova_to_phys   = intel_iommu_iova_to_phys,
3773         .domain_has_cap = intel_iommu_domain_has_cap,
3774 };
3775
3776 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3777 {
3778         /*
3779          * Mobile 4 Series Chipset neglects to set RWBF capability,
3780          * but needs it:
3781          */
3782         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3783         rwbf_quirk = 1;
3784
3785         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
3786         if (dev->revision == 0x07) {
3787                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
3788                 dmar_map_gfx = 0;
3789         }
3790 }
3791
3792 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3793
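/* GGC register bits used by the integrated graphics quirk below. */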
3794 #define GGC 0x52
3795 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
3796 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
3797 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
3798 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
3799 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
3800 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
3801 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
3802 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
3803
3804 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
3805 {
3806         unsigned short ggc;
3807
3808         if (pci_read_config_word(dev, GGC, &ggc))
3809                 return;
3810
3811         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
3812                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
3813                 dmar_map_gfx = 0;
3814         }
3815 }
3816 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
3817 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
3818 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
3819 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
3820
3821 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3822    ISOCH DMAR unit for the Azalia sound device, but not give it any
3823    TLB entries, which causes it to deadlock. Check for that.  We do
3824    this in a function called from init_dmars(), instead of in a PCI
3825    quirk, because we don't want to print the obnoxious "BIOS broken"
3826    message if VT-d is actually disabled.
3827 */
3828 static void __init check_tylersburg_isoch(void)
3829 {
3830         struct pci_dev *pdev;
3831         uint32_t vtisochctrl;
3832
3833         /* If there's no Azalia in the system anyway, forget it. */
3834         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3835         if (!pdev)
3836                 return;
3837         pci_dev_put(pdev);
3838
3839         /* System Management Registers. Might be hidden, in which case
3840            we can't do the sanity check. But that's OK, because the
3841            known-broken BIOSes _don't_ actually hide it, so far. */
3842         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3843         if (!pdev)
3844                 return;
3845
3846         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3847                 pci_dev_put(pdev);
3848                 return;
3849         }
3850
3851         pci_dev_put(pdev);
3852
3853         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3854         if (vtisochctrl & 1)
3855                 return;
3856
3857         /* Drop all bits other than the number of TLB entries */
3858         vtisochctrl &= 0x1c;
3859
3860         /* If we have the recommended number of TLB entries (16), fine. */
3861         if (vtisochctrl == 0x10)
3862                 return;
3863
3864         /* Zero TLB entries? You get to ride the short bus to school. */
3865         if (!vtisochctrl) {
3866                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3867                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3868                      dmi_get_system_info(DMI_BIOS_VENDOR),
3869                      dmi_get_system_info(DMI_BIOS_VERSION),
3870                      dmi_get_system_info(DMI_PRODUCT_VERSION));
3871                 iommu_identity_mapping |= IDENTMAP_AZALIA;
3872                 return;
3873         }
3874
3875         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3876                vtisochctrl);
3877 }