drivers/pci/intel-iommu.c  [linux-2.6.git, commit 4e4e0202e59d863838dfc66ae5ef6d5bbd8e1beb]
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/syscore_ops.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
45
46 #define ROOT_SIZE               VTD_PAGE_SIZE
47 #define CONTEXT_SIZE            VTD_PAGE_SIZE
48
49 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
50 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
51 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
52
53 #define IOAPIC_RANGE_START      (0xfee00000)
54 #define IOAPIC_RANGE_END        (0xfeefffff)
55 #define IOVA_START_ADDR         (0x1000)
56
57 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
58
59 #define MAX_AGAW_WIDTH 64
60
61 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
62 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
63
64 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
65    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
66 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
67                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
68 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
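/*
 * Editor's worked example (illustration only, assuming VTD_PAGE_SHIFT == 12):
 * for the default 48-bit guest address width,
 *
 *	__DOMAIN_MAX_PFN(48)  == (1ULL << 36) - 1 == 0xfffffffff
 *	DOMAIN_MAX_PFN(48)    == 0xfffffffff on 64-bit,
 *	                         clamped to 0xffffffff on a 32-bit kernel
 *	DOMAIN_MAX_ADDR(48)   == 0xfffffffff000
 *
 * so PFNs always fit in an unsigned long, as the comment above intends.
 */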
69
70 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
71 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
72 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
73
74 /* page table handling */
75 #define LEVEL_STRIDE            (9)
76 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
77
78 static inline int agaw_to_level(int agaw)
79 {
80         return agaw + 2;
81 }
82
83 static inline int agaw_to_width(int agaw)
84 {
85         return 30 + agaw * LEVEL_STRIDE;
86 }
87
88 static inline int width_to_agaw(int width)
89 {
90         return (width - 30) / LEVEL_STRIDE;
91 }
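/*
 * Editor's note (illustrative, not part of the original driver): the three
 * helpers above round-trip between the VT-d AGAW encoding and the address
 * width, e.g.
 *
 *	width_to_agaw(48) == 2,  agaw_to_level(2) == 4,  agaw_to_width(2) == 48
 *	width_to_agaw(39) == 1,  agaw_to_level(1) == 3,  agaw_to_width(1) == 39
 *
 * i.e. the default 48-bit domain uses a 4-level page table.
 */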
92
93 static inline unsigned int level_to_offset_bits(int level)
94 {
95         return (level - 1) * LEVEL_STRIDE;
96 }
97
98 static inline int pfn_level_offset(unsigned long pfn, int level)
99 {
100         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
101 }
102
103 static inline unsigned long level_mask(int level)
104 {
105         return -1UL << level_to_offset_bits(level);
106 }
107
108 static inline unsigned long level_size(int level)
109 {
110         return 1UL << level_to_offset_bits(level);
111 }
112
113 static inline unsigned long align_to_level(unsigned long pfn, int level)
114 {
115         return (pfn + level_size(level) - 1) & level_mask(level);
116 }
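/*
 * Editor's worked example (illustration only): at level 2 each page-table
 * slot covers level_size(2) == 512 VT-d pages, so with 4KiB VT-d pages
 *
 *	level_to_offset_bits(2)   == 9
 *	pfn_level_offset(pfn, 2)  == (pfn >> 9) & 0x1ff
 *	align_to_level(0x1234, 2) == 0x1400
 *
 * i.e. align_to_level() rounds a pfn up to the next level-2 boundary.
 */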
117
118 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
119    are never going to work. */
120 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
121 {
122         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
123 }
124
125 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
126 {
127         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
128 }
129 static inline unsigned long page_to_dma_pfn(struct page *pg)
130 {
131         return mm_to_dma_pfn(page_to_pfn(pg));
132 }
133 static inline unsigned long virt_to_dma_pfn(void *p)
134 {
135         return page_to_dma_pfn(virt_to_page(p));
136 }
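/*
 * Editor's note (illustrative): on x86 PAGE_SHIFT == VTD_PAGE_SHIFT == 12,
 * so the mm<->dma pfn conversions above are identity operations.  On a
 * hypothetical architecture with 16KiB MM pages (PAGE_SHIFT == 14), one MM
 * pfn would correspond to four VT-d pfns, e.g. mm_to_dma_pfn(1) == 4.
 */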
137
138 /* global iommu list, set NULL for ignored DMAR units */
139 static struct intel_iommu **g_iommus;
140
141 static void __init check_tylersburg_isoch(void);
142 static int rwbf_quirk;
143
144 /*
145  * set to 1 to panic the kernel if VT-d cannot be enabled successfully
146  * (used when kernel is launched w/ TXT)
147  */
148 static int force_on = 0;
149
150 /*
151  * 0: Present
152  * 1-11: Reserved
153  * 12-63: Context Ptr (12 - (haw-1))
154  * 64-127: Reserved
155  */
156 struct root_entry {
157         u64     val;
158         u64     rsvd1;
159 };
160 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
161 static inline bool root_present(struct root_entry *root)
162 {
163         return (root->val & 1);
164 }
165 static inline void set_root_present(struct root_entry *root)
166 {
167         root->val |= 1;
168 }
169 static inline void set_root_value(struct root_entry *root, unsigned long value)
170 {
171         root->val |= value & VTD_PAGE_MASK;
172 }
173
174 static inline struct context_entry *
175 get_context_addr_from_root(struct root_entry *root)
176 {
177         return (struct context_entry *)
178                 (root_present(root)?phys_to_virt(
179                 root->val & VTD_PAGE_MASK) :
180                 NULL);
181 }
182
183 /*
184  * low 64 bits:
185  * 0: present
186  * 1: fault processing disable
187  * 2-3: translation type
188  * 12-63: address space root
189  * high 64 bits:
190  * 0-2: address width
191  * 3-6: avail
192  * 8-23: domain id
193  */
194 struct context_entry {
195         u64 lo;
196         u64 hi;
197 };
198
199 static inline bool context_present(struct context_entry *context)
200 {
201         return (context->lo & 1);
202 }
203 static inline void context_set_present(struct context_entry *context)
204 {
205         context->lo |= 1;
206 }
207
208 static inline void context_set_fault_enable(struct context_entry *context)
209 {
210         context->lo &= (((u64)-1) << 2) | 1;
211 }
212
213 static inline void context_set_translation_type(struct context_entry *context,
214                                                 unsigned long value)
215 {
216         context->lo &= (((u64)-1) << 4) | 3;
217         context->lo |= (value & 3) << 2;
218 }
219
220 static inline void context_set_address_root(struct context_entry *context,
221                                             unsigned long value)
222 {
223         context->lo |= value & VTD_PAGE_MASK;
224 }
225
226 static inline void context_set_address_width(struct context_entry *context,
227                                              unsigned long value)
228 {
229         context->hi |= value & 7;
230 }
231
232 static inline void context_set_domain_id(struct context_entry *context,
233                                          unsigned long value)
234 {
235         context->hi |= (value & ((1 << 16) - 1)) << 8;
236 }
237
238 static inline void context_clear_entry(struct context_entry *context)
239 {
240         context->lo = 0;
241         context->hi = 0;
242 }
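/*
 * Editor's cross-reference (illustration only): the helpers above are
 * composed later in domain_context_mapping_one(), roughly as
 *
 *	context_set_domain_id(context, id);
 *	context_set_address_root(context, virt_to_phys(pgd));
 *	context_set_address_width(context, iommu->agaw);
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 *
 * to build a present, multi-level-translation context entry for a device.
 */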
243
244 /*
245  * 0: readable
246  * 1: writable
247  * 2-6: reserved
248  * 7: super page
249  * 8-10: available
250  * 11: snoop behavior
251  * 12-63: Host physical address
252  */
253 struct dma_pte {
254         u64 val;
255 };
256
257 static inline void dma_clear_pte(struct dma_pte *pte)
258 {
259         pte->val = 0;
260 }
261
262 static inline void dma_set_pte_readable(struct dma_pte *pte)
263 {
264         pte->val |= DMA_PTE_READ;
265 }
266
267 static inline void dma_set_pte_writable(struct dma_pte *pte)
268 {
269         pte->val |= DMA_PTE_WRITE;
270 }
271
272 static inline void dma_set_pte_snp(struct dma_pte *pte)
273 {
274         pte->val |= DMA_PTE_SNP;
275 }
276
277 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
278 {
279         pte->val = (pte->val & ~3) | (prot & 3);
280 }
281
282 static inline u64 dma_pte_addr(struct dma_pte *pte)
283 {
284 #ifdef CONFIG_64BIT
285         return pte->val & VTD_PAGE_MASK;
286 #else
287         /* Must have a full atomic 64-bit read */
288         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
289 #endif
290 }
291
292 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
293 {
294         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
295 }
296
297 static inline bool dma_pte_present(struct dma_pte *pte)
298 {
299         return (pte->val & 3) != 0;
300 }
301
302 static inline int first_pte_in_page(struct dma_pte *pte)
303 {
304         return !((unsigned long)pte & ~VTD_PAGE_MASK);
305 }
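/*
 * Editor's worked example (illustration only): a read/write leaf PTE for
 * host pfn 0x1000 would be built with the helpers above roughly as
 *
 *	dma_clear_pte(pte);
 *	dma_set_pte_pfn(pte, 0x1000);		// val |= 0x1000000
 *	dma_set_pte_readable(pte);		// val |= DMA_PTE_READ  (bit 0)
 *	dma_set_pte_writable(pte);		// val |= DMA_PTE_WRITE (bit 1)
 *
 * giving pte->val == 0x1000003; dma_set_pte_snp() would additionally set
 * the snoop-behaviour bit when the domain supports snoop control.
 */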
306
307 /*
308  * This domain is a static identity mapping domain.
309  *      1. This domain creates a static 1:1 mapping of all usable memory.
310  *      2. It maps to each iommu if successful.
311  *      3. Each iommu maps to this domain if successful.
312  */
313 static struct dmar_domain *si_domain;
314 static int hw_pass_through = 1;
315
316 /* devices under the same p2p bridge are owned in one domain */
317 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
318
319 /* domain represents a virtual machine; more than one device
320  * across iommus may be owned in one domain, e.g. kvm guest.
321  */
322 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
323
324 /* si_domain contains multiple devices */
325 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
326
327 struct dmar_domain {
328         int     id;                     /* domain id */
329         int     nid;                    /* node id */
330         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
331
332         struct list_head devices;       /* all devices' list */
333         struct iova_domain iovad;       /* iova's that belong to this domain */
334
335         struct dma_pte  *pgd;           /* virtual address */
336         int             gaw;            /* max guest address width */
337
338         /* adjusted guest address width, 0 is level 2 30-bit */
339         int             agaw;
340
341         int             flags;          /* flags to find out type of domain */
342
343         int             iommu_coherency;/* indicate coherency of iommu access */
344         int             iommu_snooping; /* indicate snooping control feature*/
345         int             iommu_count;    /* reference count of iommu */
346         spinlock_t      iommu_lock;     /* protect iommu set in domain */
347         u64             max_addr;       /* maximum mapped address */
348 };
349
350 /* PCI domain-device relationship */
351 struct device_domain_info {
352         struct list_head link;  /* link to domain siblings */
353         struct list_head global; /* link to global list */
354         int segment;            /* PCI domain */
355         u8 bus;                 /* PCI bus number */
356         u8 devfn;               /* PCI devfn number */
357         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
358         struct intel_iommu *iommu; /* IOMMU used by this device */
359         struct dmar_domain *domain; /* pointer to domain */
360 };
361
362 static void flush_unmaps_timeout(unsigned long data);
363
364 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
365
366 #define HIGH_WATER_MARK 250
367 struct deferred_flush_tables {
368         int next;
369         struct iova *iova[HIGH_WATER_MARK];
370         struct dmar_domain *domain[HIGH_WATER_MARK];
371 };
372
373 static struct deferred_flush_tables *deferred_flush;
374
375 /* bitmap for indexing intel_iommus */
376 static int g_num_of_iommus;
377
378 static DEFINE_SPINLOCK(async_umap_flush_lock);
379 static LIST_HEAD(unmaps_to_do);
380
381 static int timer_on;
382 static long list_size;
383
384 static void domain_remove_dev_info(struct dmar_domain *domain);
385
386 #ifdef CONFIG_DMAR_DEFAULT_ON
387 int dmar_disabled = 0;
388 #else
389 int dmar_disabled = 1;
390 #endif /*CONFIG_DMAR_DEFAULT_ON*/
391
392 static int dmar_map_gfx = 1;
393 static int dmar_forcedac;
394 static int intel_iommu_strict;
395
396 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
397 static DEFINE_SPINLOCK(device_domain_lock);
398 static LIST_HEAD(device_domain_list);
399
400 static struct iommu_ops intel_iommu_ops;
401
402 static int __init intel_iommu_setup(char *str)
403 {
404         if (!str)
405                 return -EINVAL;
406         while (*str) {
407                 if (!strncmp(str, "on", 2)) {
408                         dmar_disabled = 0;
409                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
410                 } else if (!strncmp(str, "off", 3)) {
411                         dmar_disabled = 1;
412                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
413                 } else if (!strncmp(str, "igfx_off", 8)) {
414                         dmar_map_gfx = 0;
415                         printk(KERN_INFO
416                                 "Intel-IOMMU: disable GFX device mapping\n");
417                 } else if (!strncmp(str, "forcedac", 8)) {
418                         printk(KERN_INFO
419                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
420                         dmar_forcedac = 1;
421                 } else if (!strncmp(str, "strict", 6)) {
422                         printk(KERN_INFO
423                                 "Intel-IOMMU: disable batched IOTLB flush\n");
424                         intel_iommu_strict = 1;
425                 }
426
427                 str += strcspn(str, ",");
428                 while (*str == ',')
429                         str++;
430         }
431         return 0;
432 }
433 __setup("intel_iommu=", intel_iommu_setup);
434
435 static struct kmem_cache *iommu_domain_cache;
436 static struct kmem_cache *iommu_devinfo_cache;
437 static struct kmem_cache *iommu_iova_cache;
438
439 static inline void *alloc_pgtable_page(int node)
440 {
441         struct page *page;
442         void *vaddr = NULL;
443
444         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
445         if (page)
446                 vaddr = page_address(page);
447         return vaddr;
448 }
449
450 static inline void free_pgtable_page(void *vaddr)
451 {
452         free_page((unsigned long)vaddr);
453 }
454
455 static inline void *alloc_domain_mem(void)
456 {
457         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
458 }
459
460 static void free_domain_mem(void *vaddr)
461 {
462         kmem_cache_free(iommu_domain_cache, vaddr);
463 }
464
465 static inline void * alloc_devinfo_mem(void)
466 {
467         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
468 }
469
470 static inline void free_devinfo_mem(void *vaddr)
471 {
472         kmem_cache_free(iommu_devinfo_cache, vaddr);
473 }
474
475 struct iova *alloc_iova_mem(void)
476 {
477         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
478 }
479
480 void free_iova_mem(struct iova *iova)
481 {
482         kmem_cache_free(iommu_iova_cache, iova);
483 }
484
485
486 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
487 {
488         unsigned long sagaw;
489         int agaw = -1;
490
491         sagaw = cap_sagaw(iommu->cap);
492         for (agaw = width_to_agaw(max_gaw);
493              agaw >= 0; agaw--) {
494                 if (test_bit(agaw, &sagaw))
495                         break;
496         }
497
498         return agaw;
499 }
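/*
 * Editor's worked example (illustration only): if an iommu reports
 * cap_sagaw() == 0x4 (only 4-level tables supported), then for
 * max_gaw == 48 the loop above starts at agaw = width_to_agaw(48) = 2,
 * finds bit 2 set and returns 2.  If only 3-level tables were supported
 * (sagaw 0x2), it would step down and return agaw 1, i.e. 39 bits.
 */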
500
501 /*
502  * Calculate max SAGAW for each iommu.
503  */
504 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
505 {
506         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
507 }
508
509 /*
510  * Calculate agaw for each iommu.
511  * "SAGAW" may differ across iommus, so use a default agaw and fall back
512  * to a smaller supported agaw for iommus that don't support the default.
513  */
514 int iommu_calculate_agaw(struct intel_iommu *iommu)
515 {
516         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
517 }
518
519 /* This function only returns a single iommu in a domain */
520 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
521 {
522         int iommu_id;
523
524         /* si_domain and vm domain should not get here. */
525         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
526         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
527
528         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
529         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
530                 return NULL;
531
532         return g_iommus[iommu_id];
533 }
534
535 static void domain_update_iommu_coherency(struct dmar_domain *domain)
536 {
537         int i;
538
539         domain->iommu_coherency = 1;
540
541         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
542                 if (!ecap_coherent(g_iommus[i]->ecap)) {
543                         domain->iommu_coherency = 0;
544                         break;
545                 }
546         }
547 }
548
549 static void domain_update_iommu_snooping(struct dmar_domain *domain)
550 {
551         int i;
552
553         domain->iommu_snooping = 1;
554
555         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
556                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
557                         domain->iommu_snooping = 0;
558                         break;
559                 }
560         }
561 }
562
563 /* Some capabilities may be different across iommus */
564 static void domain_update_iommu_cap(struct dmar_domain *domain)
565 {
566         domain_update_iommu_coherency(domain);
567         domain_update_iommu_snooping(domain);
568 }
569
570 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
571 {
572         struct dmar_drhd_unit *drhd = NULL;
573         int i;
574
575         for_each_drhd_unit(drhd) {
576                 if (drhd->ignored)
577                         continue;
578                 if (segment != drhd->segment)
579                         continue;
580
581                 for (i = 0; i < drhd->devices_cnt; i++) {
582                         if (drhd->devices[i] &&
583                             drhd->devices[i]->bus->number == bus &&
584                             drhd->devices[i]->devfn == devfn)
585                                 return drhd->iommu;
586                         if (drhd->devices[i] &&
587                             drhd->devices[i]->subordinate &&
588                             drhd->devices[i]->subordinate->number <= bus &&
589                             drhd->devices[i]->subordinate->subordinate >= bus)
590                                 return drhd->iommu;
591                 }
592
593                 if (drhd->include_all)
594                         return drhd->iommu;
595         }
596
597         return NULL;
598 }
599
600 static void domain_flush_cache(struct dmar_domain *domain,
601                                void *addr, int size)
602 {
603         if (!domain->iommu_coherency)
604                 clflush_cache_range(addr, size);
605 }
606
607 /* Gets context entry for a given bus and devfn */
608 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
609                 u8 bus, u8 devfn)
610 {
611         struct root_entry *root;
612         struct context_entry *context;
613         unsigned long phy_addr;
614         unsigned long flags;
615
616         spin_lock_irqsave(&iommu->lock, flags);
617         root = &iommu->root_entry[bus];
618         context = get_context_addr_from_root(root);
619         if (!context) {
620                 context = (struct context_entry *)
621                                 alloc_pgtable_page(iommu->node);
622                 if (!context) {
623                         spin_unlock_irqrestore(&iommu->lock, flags);
624                         return NULL;
625                 }
626                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
627                 phy_addr = virt_to_phys((void *)context);
628                 set_root_value(root, phy_addr);
629                 set_root_present(root);
630                 __iommu_flush_cache(iommu, root, sizeof(*root));
631         }
632         spin_unlock_irqrestore(&iommu->lock, flags);
633         return &context[devfn];
634 }
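/*
 * Editor's note (illustrative): the lookup above is a two-level table walk.
 * iommu->root_entry is a page of 256 root entries indexed by bus number,
 * each pointing to a page of 256 context entries indexed by devfn.  E.g.
 * for bus 0x1a and devfn PCI_DEVFN(2, 0) == 0x10, the returned pointer is
 * entry 0x10 of the context page for bus 0x1a, with the context page
 * allocated on first use.
 */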
635
636 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
637 {
638         struct root_entry *root;
639         struct context_entry *context;
640         int ret;
641         unsigned long flags;
642
643         spin_lock_irqsave(&iommu->lock, flags);
644         root = &iommu->root_entry[bus];
645         context = get_context_addr_from_root(root);
646         if (!context) {
647                 ret = 0;
648                 goto out;
649         }
650         ret = context_present(&context[devfn]);
651 out:
652         spin_unlock_irqrestore(&iommu->lock, flags);
653         return ret;
654 }
655
656 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
657 {
658         struct root_entry *root;
659         struct context_entry *context;
660         unsigned long flags;
661
662         spin_lock_irqsave(&iommu->lock, flags);
663         root = &iommu->root_entry[bus];
664         context = get_context_addr_from_root(root);
665         if (context) {
666                 context_clear_entry(&context[devfn]);
667                 __iommu_flush_cache(iommu, &context[devfn], \
668                         sizeof(*context));
669         }
670         spin_unlock_irqrestore(&iommu->lock, flags);
671 }
672
673 static void free_context_table(struct intel_iommu *iommu)
674 {
675         struct root_entry *root;
676         int i;
677         unsigned long flags;
678         struct context_entry *context;
679
680         spin_lock_irqsave(&iommu->lock, flags);
681         if (!iommu->root_entry) {
682                 goto out;
683         }
684         for (i = 0; i < ROOT_ENTRY_NR; i++) {
685                 root = &iommu->root_entry[i];
686                 context = get_context_addr_from_root(root);
687                 if (context)
688                         free_pgtable_page(context);
689         }
690         free_pgtable_page(iommu->root_entry);
691         iommu->root_entry = NULL;
692 out:
693         spin_unlock_irqrestore(&iommu->lock, flags);
694 }
695
696 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
697                                       unsigned long pfn)
698 {
699         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
700         struct dma_pte *parent, *pte = NULL;
701         int level = agaw_to_level(domain->agaw);
702         int offset;
703
704         BUG_ON(!domain->pgd);
705         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
706         parent = domain->pgd;
707
708         while (level > 0) {
709                 void *tmp_page;
710
711                 offset = pfn_level_offset(pfn, level);
712                 pte = &parent[offset];
713                 if (level == 1)
714                         break;
715
716                 if (!dma_pte_present(pte)) {
717                         uint64_t pteval;
718
719                         tmp_page = alloc_pgtable_page(domain->nid);
720
721                         if (!tmp_page)
722                                 return NULL;
723
724                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
725                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
726                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
727                                 /* Someone else set it while we were thinking; use theirs. */
728                                 free_pgtable_page(tmp_page);
729                         } else {
730                                 dma_pte_addr(pte);
731                                 domain_flush_cache(domain, pte, sizeof(*pte));
732                         }
733                 }
734                 parent = phys_to_virt(dma_pte_addr(pte));
735                 level--;
736         }
737
738         return pte;
739 }
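/*
 * Editor's worked example (illustration only): for a 4-level domain
 * (agaw == 2) and dma pfn 0x12345, the walk above indexes
 *
 *	level 4: pfn_level_offset(0x12345, 4) == (pfn >> 27) & 0x1ff == 0x000
 *	level 3: pfn_level_offset(0x12345, 3) == (pfn >> 18) & 0x1ff == 0x000
 *	level 2: pfn_level_offset(0x12345, 2) == (pfn >>  9) & 0x1ff == 0x091
 *	level 1: pfn_level_offset(0x12345, 1) ==  pfn        & 0x1ff == 0x145
 *
 * allocating missing intermediate tables with cmpxchg64() so that walkers
 * racing on the same slot keep a single winner.
 */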
740
741 /* return address's pte at specific level */
742 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
743                                          unsigned long pfn,
744                                          int level)
745 {
746         struct dma_pte *parent, *pte = NULL;
747         int total = agaw_to_level(domain->agaw);
748         int offset;
749
750         parent = domain->pgd;
751         while (level <= total) {
752                 offset = pfn_level_offset(pfn, total);
753                 pte = &parent[offset];
754                 if (level == total)
755                         return pte;
756
757                 if (!dma_pte_present(pte))
758                         break;
759                 parent = phys_to_virt(dma_pte_addr(pte));
760                 total--;
761         }
762         return NULL;
763 }
764
765 /* clear last level pte, a tlb flush should be followed */
766 static void dma_pte_clear_range(struct dmar_domain *domain,
767                                 unsigned long start_pfn,
768                                 unsigned long last_pfn)
769 {
770         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
771         struct dma_pte *first_pte, *pte;
772
773         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
774         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
775         BUG_ON(start_pfn > last_pfn);
776
777         /* we don't need lock here; nobody else touches the iova range */
778         do {
779                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
780                 if (!pte) {
781                         start_pfn = align_to_level(start_pfn + 1, 2);
782                         continue;
783                 }
784                 do { 
785                         dma_clear_pte(pte);
786                         start_pfn++;
787                         pte++;
788                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
789
790                 domain_flush_cache(domain, first_pte,
791                                    (void *)pte - (void *)first_pte);
792
793         } while (start_pfn && start_pfn <= last_pfn);
794 }
795
796 /* free page table pages. last level pte should already be cleared */
797 static void dma_pte_free_pagetable(struct dmar_domain *domain,
798                                    unsigned long start_pfn,
799                                    unsigned long last_pfn)
800 {
801         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
802         struct dma_pte *first_pte, *pte;
803         int total = agaw_to_level(domain->agaw);
804         int level;
805         unsigned long tmp;
806
807         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
808         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
809         BUG_ON(start_pfn > last_pfn);
810
811         /* We don't need lock here; nobody else touches the iova range */
812         level = 2;
813         while (level <= total) {
814                 tmp = align_to_level(start_pfn, level);
815
816                 /* If we can't even clear one PTE at this level, we're done */
817                 if (tmp + level_size(level) - 1 > last_pfn)
818                         return;
819
820                 do {
821                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
822                         if (!pte) {
823                                 tmp = align_to_level(tmp + 1, level + 1);
824                                 continue;
825                         }
826                         do {
827                                 if (dma_pte_present(pte)) {
828                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
829                                         dma_clear_pte(pte);
830                                 }
831                                 pte++;
832                                 tmp += level_size(level);
833                         } while (!first_pte_in_page(pte) &&
834                                  tmp + level_size(level) - 1 <= last_pfn);
835
836                         domain_flush_cache(domain, first_pte,
837                                            (void *)pte - (void *)first_pte);
838                         
839                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
840                 level++;
841         }
842         /* free pgd */
843         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
844                 free_pgtable_page(domain->pgd);
845                 domain->pgd = NULL;
846         }
847 }
848
849 /* iommu handling */
850 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
851 {
852         struct root_entry *root;
853         unsigned long flags;
854
855         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
856         if (!root)
857                 return -ENOMEM;
858
859         __iommu_flush_cache(iommu, root, ROOT_SIZE);
860
861         spin_lock_irqsave(&iommu->lock, flags);
862         iommu->root_entry = root;
863         spin_unlock_irqrestore(&iommu->lock, flags);
864
865         return 0;
866 }
867
868 static void iommu_set_root_entry(struct intel_iommu *iommu)
869 {
870         void *addr;
871         u32 sts;
872         unsigned long flag;
873
874         addr = iommu->root_entry;
875
876         spin_lock_irqsave(&iommu->register_lock, flag);
877         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
878
879         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
880
881         /* Make sure hardware completes it */
882         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
883                       readl, (sts & DMA_GSTS_RTPS), sts);
884
885         spin_unlock_irqrestore(&iommu->register_lock, flag);
886 }
887
888 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
889 {
890         u32 val;
891         unsigned long flag;
892
893         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
894                 return;
895
896         spin_lock_irqsave(&iommu->register_lock, flag);
897         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
898
899         /* Make sure hardware completes it */
900         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
901                       readl, (!(val & DMA_GSTS_WBFS)), val);
902
903         spin_unlock_irqrestore(&iommu->register_lock, flag);
904 }
905
906 /* return value determines if we need a write buffer flush */
907 static void __iommu_flush_context(struct intel_iommu *iommu,
908                                   u16 did, u16 source_id, u8 function_mask,
909                                   u64 type)
910 {
911         u64 val = 0;
912         unsigned long flag;
913
914         switch (type) {
915         case DMA_CCMD_GLOBAL_INVL:
916                 val = DMA_CCMD_GLOBAL_INVL;
917                 break;
918         case DMA_CCMD_DOMAIN_INVL:
919                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
920                 break;
921         case DMA_CCMD_DEVICE_INVL:
922                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
923                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
924                 break;
925         default:
926                 BUG();
927         }
928         val |= DMA_CCMD_ICC;
929
930         spin_lock_irqsave(&iommu->register_lock, flag);
931         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
932
933         /* Make sure hardware completes it */
934         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
935                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
936
937         spin_unlock_irqrestore(&iommu->register_lock, flag);
938 }
939
940 /* return value determines if we need a write buffer flush */
941 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
942                                 u64 addr, unsigned int size_order, u64 type)
943 {
944         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
945         u64 val = 0, val_iva = 0;
946         unsigned long flag;
947
948         switch (type) {
949         case DMA_TLB_GLOBAL_FLUSH:
950                 /* global flush doesn't need to set IVA_REG */
951                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
952                 break;
953         case DMA_TLB_DSI_FLUSH:
954                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
955                 break;
956         case DMA_TLB_PSI_FLUSH:
957                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
958                 /* Note: always flush non-leaf currently */
959                 val_iva = size_order | addr;
960                 break;
961         default:
962                 BUG();
963         }
964         /* Note: set drain read/write */
965 #if 0
966         /*
967          * This is probably meant to be extra safe. It looks like we can
968          * ignore it without any impact.
969          */
970         if (cap_read_drain(iommu->cap))
971                 val |= DMA_TLB_READ_DRAIN;
972 #endif
973         if (cap_write_drain(iommu->cap))
974                 val |= DMA_TLB_WRITE_DRAIN;
975
976         spin_lock_irqsave(&iommu->register_lock, flag);
977         /* Note: Only uses first TLB reg currently */
978         if (val_iva)
979                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
980         dmar_writeq(iommu->reg + tlb_offset + 8, val);
981
982         /* Make sure hardware completes it */
983         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
984                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
985
986         spin_unlock_irqrestore(&iommu->register_lock, flag);
987
988         /* check IOTLB invalidation granularity */
989         if (DMA_TLB_IAIG(val) == 0)
990                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
991         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
992                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
993                         (unsigned long long)DMA_TLB_IIRG(type),
994                         (unsigned long long)DMA_TLB_IAIG(val));
995 }
996
997 static struct device_domain_info *iommu_support_dev_iotlb(
998         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
999 {
1000         int found = 0;
1001         unsigned long flags;
1002         struct device_domain_info *info;
1003         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1004
1005         if (!ecap_dev_iotlb_support(iommu->ecap))
1006                 return NULL;
1007
1008         if (!iommu->qi)
1009                 return NULL;
1010
1011         spin_lock_irqsave(&device_domain_lock, flags);
1012         list_for_each_entry(info, &domain->devices, link)
1013                 if (info->bus == bus && info->devfn == devfn) {
1014                         found = 1;
1015                         break;
1016                 }
1017         spin_unlock_irqrestore(&device_domain_lock, flags);
1018
1019         if (!found || !info->dev)
1020                 return NULL;
1021
1022         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1023                 return NULL;
1024
1025         if (!dmar_find_matched_atsr_unit(info->dev))
1026                 return NULL;
1027
1028         info->iommu = iommu;
1029
1030         return info;
1031 }
1032
1033 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1034 {
1035         if (!info)
1036                 return;
1037
1038         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1039 }
1040
1041 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1042 {
1043         if (!info->dev || !pci_ats_enabled(info->dev))
1044                 return;
1045
1046         pci_disable_ats(info->dev);
1047 }
1048
1049 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1050                                   u64 addr, unsigned mask)
1051 {
1052         u16 sid, qdep;
1053         unsigned long flags;
1054         struct device_domain_info *info;
1055
1056         spin_lock_irqsave(&device_domain_lock, flags);
1057         list_for_each_entry(info, &domain->devices, link) {
1058                 if (!info->dev || !pci_ats_enabled(info->dev))
1059                         continue;
1060
1061                 sid = info->bus << 8 | info->devfn;
1062                 qdep = pci_ats_queue_depth(info->dev);
1063                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1064         }
1065         spin_unlock_irqrestore(&device_domain_lock, flags);
1066 }
1067
1068 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1069                                   unsigned long pfn, unsigned int pages, int map)
1070 {
1071         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1072         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1073
1074         BUG_ON(pages == 0);
1075
1076         /*
1077          * Fall back to domain-selective flush if there is no PSI support or
1078          * the size is too big.
1079          * PSI requires the page size to be 2 ^ x, and the base address to be
1080          * naturally aligned to that size.
1081          */
1082         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1083                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1084                                                 DMA_TLB_DSI_FLUSH);
1085         else
1086                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1087                                                 DMA_TLB_PSI_FLUSH);
1088
1089         /*
1090          * In caching mode, changes of pages from non-present to present require
1091          * flush. However, device IOTLB doesn't need to be flushed in this case.
1092          */
1093         if (!cap_caching_mode(iommu->cap) || !map)
1094                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1095 }
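/*
 * Editor's worked example (illustration only): flushing 5 pages starting at
 * pfn 0x100 gives mask = ilog2(__roundup_pow_of_two(5)) = 3, i.e. a PSI
 * covering 8 VT-d pages (32KiB) at addr 0x100000, provided the hardware
 * supports page-selective invalidation and mask <= cap_max_amask_val();
 * otherwise the code above falls back to a domain-selective flush.
 */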
1096
1097 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1098 {
1099         u32 pmen;
1100         unsigned long flags;
1101
1102         spin_lock_irqsave(&iommu->register_lock, flags);
1103         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1104         pmen &= ~DMA_PMEN_EPM;
1105         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1106
1107         /* wait for the protected region status bit to clear */
1108         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1109                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1110
1111         spin_unlock_irqrestore(&iommu->register_lock, flags);
1112 }
1113
1114 static int iommu_enable_translation(struct intel_iommu *iommu)
1115 {
1116         u32 sts;
1117         unsigned long flags;
1118
1119         spin_lock_irqsave(&iommu->register_lock, flags);
1120         iommu->gcmd |= DMA_GCMD_TE;
1121         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1122
1123         /* Make sure hardware completes it */
1124         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1125                       readl, (sts & DMA_GSTS_TES), sts);
1126
1127         spin_unlock_irqrestore(&iommu->register_lock, flags);
1128         return 0;
1129 }
1130
1131 static int iommu_disable_translation(struct intel_iommu *iommu)
1132 {
1133         u32 sts;
1134         unsigned long flag;
1135
1136         spin_lock_irqsave(&iommu->register_lock, flag);
1137         iommu->gcmd &= ~DMA_GCMD_TE;
1138         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1139
1140         /* Make sure hardware completes it */
1141         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1142                       readl, (!(sts & DMA_GSTS_TES)), sts);
1143
1144         spin_unlock_irqrestore(&iommu->register_lock, flag);
1145         return 0;
1146 }
1147
1148
1149 static int iommu_init_domains(struct intel_iommu *iommu)
1150 {
1151         unsigned long ndomains;
1152         unsigned long nlongs;
1153
1154         ndomains = cap_ndoms(iommu->cap);
1155         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1156                         ndomains);
1157         nlongs = BITS_TO_LONGS(ndomains);
1158
1159         spin_lock_init(&iommu->lock);
1160
1161         /* TBD: there might be 64K domains,
1162          * consider other allocation for future chip
1163          */
1164         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1165         if (!iommu->domain_ids) {
1166                 printk(KERN_ERR "Allocating domain id array failed\n");
1167                 return -ENOMEM;
1168         }
1169         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1170                         GFP_KERNEL);
1171         if (!iommu->domains) {
1172                 printk(KERN_ERR "Allocating domain array failed\n");
1173                 return -ENOMEM;
1174         }
1175
1176         /*
1177          * If caching mode is set, then invalid translations are tagged
1178          * with domain id 0. Hence we need to pre-allocate it.
1179          */
1180         if (cap_caching_mode(iommu->cap))
1181                 set_bit(0, iommu->domain_ids);
1182         return 0;
1183 }
1184
1185
1186 static void domain_exit(struct dmar_domain *domain);
1187 static void vm_domain_exit(struct dmar_domain *domain);
1188
1189 void free_dmar_iommu(struct intel_iommu *iommu)
1190 {
1191         struct dmar_domain *domain;
1192         int i;
1193         unsigned long flags;
1194
1195         if ((iommu->domains) && (iommu->domain_ids)) {
1196                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1197                         domain = iommu->domains[i];
1198                         clear_bit(i, iommu->domain_ids);
1199
1200                         spin_lock_irqsave(&domain->iommu_lock, flags);
1201                         if (--domain->iommu_count == 0) {
1202                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1203                                         vm_domain_exit(domain);
1204                                 else
1205                                         domain_exit(domain);
1206                         }
1207                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1208                 }
1209         }
1210
1211         if (iommu->gcmd & DMA_GCMD_TE)
1212                 iommu_disable_translation(iommu);
1213
1214         if (iommu->irq) {
1215                 irq_set_handler_data(iommu->irq, NULL);
1216                 /* This will mask the irq */
1217                 free_irq(iommu->irq, iommu);
1218                 destroy_irq(iommu->irq);
1219         }
1220
1221         kfree(iommu->domains);
1222         kfree(iommu->domain_ids);
1223
1224         g_iommus[iommu->seq_id] = NULL;
1225
1226         /* if all iommus are freed, free g_iommus */
1227         for (i = 0; i < g_num_of_iommus; i++) {
1228                 if (g_iommus[i])
1229                         break;
1230         }
1231
1232         if (i == g_num_of_iommus)
1233                 kfree(g_iommus);
1234
1235         /* free context mapping */
1236         free_context_table(iommu);
1237 }
1238
1239 static struct dmar_domain *alloc_domain(void)
1240 {
1241         struct dmar_domain *domain;
1242
1243         domain = alloc_domain_mem();
1244         if (!domain)
1245                 return NULL;
1246
1247         domain->nid = -1;
1248         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1249         domain->flags = 0;
1250
1251         return domain;
1252 }
1253
1254 static int iommu_attach_domain(struct dmar_domain *domain,
1255                                struct intel_iommu *iommu)
1256 {
1257         int num;
1258         unsigned long ndomains;
1259         unsigned long flags;
1260
1261         ndomains = cap_ndoms(iommu->cap);
1262
1263         spin_lock_irqsave(&iommu->lock, flags);
1264
1265         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1266         if (num >= ndomains) {
1267                 spin_unlock_irqrestore(&iommu->lock, flags);
1268                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1269                 return -ENOMEM;
1270         }
1271
1272         domain->id = num;
1273         set_bit(num, iommu->domain_ids);
1274         set_bit(iommu->seq_id, &domain->iommu_bmp);
1275         iommu->domains[num] = domain;
1276         spin_unlock_irqrestore(&iommu->lock, flags);
1277
1278         return 0;
1279 }
1280
1281 static void iommu_detach_domain(struct dmar_domain *domain,
1282                                 struct intel_iommu *iommu)
1283 {
1284         unsigned long flags;
1285         int num, ndomains;
1286         int found = 0;
1287
1288         spin_lock_irqsave(&iommu->lock, flags);
1289         ndomains = cap_ndoms(iommu->cap);
1290         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1291                 if (iommu->domains[num] == domain) {
1292                         found = 1;
1293                         break;
1294                 }
1295         }
1296
1297         if (found) {
1298                 clear_bit(num, iommu->domain_ids);
1299                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1300                 iommu->domains[num] = NULL;
1301         }
1302         spin_unlock_irqrestore(&iommu->lock, flags);
1303 }
1304
1305 static struct iova_domain reserved_iova_list;
1306 static struct lock_class_key reserved_rbtree_key;
1307
1308 static int dmar_init_reserved_ranges(void)
1309 {
1310         struct pci_dev *pdev = NULL;
1311         struct iova *iova;
1312         int i;
1313
1314         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1315
1316         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1317                 &reserved_rbtree_key);
1318
1319         /* IOAPIC ranges shouldn't be accessed by DMA */
1320         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1321                 IOVA_PFN(IOAPIC_RANGE_END));
1322         if (!iova) {
1323                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1324                 return -ENODEV;
1325         }
1326
1327         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1328         for_each_pci_dev(pdev) {
1329                 struct resource *r;
1330
1331                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1332                         r = &pdev->resource[i];
1333                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1334                                 continue;
1335                         iova = reserve_iova(&reserved_iova_list,
1336                                             IOVA_PFN(r->start),
1337                                             IOVA_PFN(r->end));
1338                         if (!iova) {
1339                                 printk(KERN_ERR "Reserve iova failed\n");
1340                                 return -ENODEV;
1341                         }
1342                 }
1343         }
1344         return 0;
1345 }
1346
1347 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1348 {
1349         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1350 }
1351
1352 static inline int guestwidth_to_adjustwidth(int gaw)
1353 {
1354         int agaw;
1355         int r = (gaw - 12) % 9;
1356
1357         if (r == 0)
1358                 agaw = gaw;
1359         else
1360                 agaw = gaw + 9 - r;
1361         if (agaw > 64)
1362                 agaw = 64;
1363         return agaw;
1364 }
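/*
 * Editor's worked example (illustration only): the adjusted width is the
 * guest width rounded up to a value the page-table levels can express
 * (12 + a multiple of 9), capped at 64:
 *
 *	guestwidth_to_adjustwidth(48) == 48	(already 12 + 4*9)
 *	guestwidth_to_adjustwidth(36) == 39	(rounded up to 12 + 3*9)
 */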
1365
1366 static int domain_init(struct dmar_domain *domain, int guest_width)
1367 {
1368         struct intel_iommu *iommu;
1369         int adjust_width, agaw;
1370         unsigned long sagaw;
1371
1372         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1373         spin_lock_init(&domain->iommu_lock);
1374
1375         domain_reserve_special_ranges(domain);
1376
1377         /* calculate AGAW */
1378         iommu = domain_get_iommu(domain);
1379         if (guest_width > cap_mgaw(iommu->cap))
1380                 guest_width = cap_mgaw(iommu->cap);
1381         domain->gaw = guest_width;
1382         adjust_width = guestwidth_to_adjustwidth(guest_width);
1383         agaw = width_to_agaw(adjust_width);
1384         sagaw = cap_sagaw(iommu->cap);
1385         if (!test_bit(agaw, &sagaw)) {
1386                 /* hardware doesn't support it, choose a bigger one */
1387                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1388                 agaw = find_next_bit(&sagaw, 5, agaw);
1389                 if (agaw >= 5)
1390                         return -ENODEV;
1391         }
1392         domain->agaw = agaw;
1393         INIT_LIST_HEAD(&domain->devices);
1394
1395         if (ecap_coherent(iommu->ecap))
1396                 domain->iommu_coherency = 1;
1397         else
1398                 domain->iommu_coherency = 0;
1399
1400         if (ecap_sc_support(iommu->ecap))
1401                 domain->iommu_snooping = 1;
1402         else
1403                 domain->iommu_snooping = 0;
1404
1405         domain->iommu_count = 1;
1406         domain->nid = iommu->node;
1407
1408         /* always allocate the top pgd */
1409         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1410         if (!domain->pgd)
1411                 return -ENOMEM;
1412         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1413         return 0;
1414 }
1415
1416 static void domain_exit(struct dmar_domain *domain)
1417 {
1418         struct dmar_drhd_unit *drhd;
1419         struct intel_iommu *iommu;
1420
1421         /* Domain 0 is reserved, so don't process it */
1422         if (!domain)
1423                 return;
1424
1425         domain_remove_dev_info(domain);
1426         /* destroy iovas */
1427         put_iova_domain(&domain->iovad);
1428
1429         /* clear ptes */
1430         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1431
1432         /* free page tables */
1433         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1434
1435         for_each_active_iommu(iommu, drhd)
1436                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1437                         iommu_detach_domain(domain, iommu);
1438
1439         free_domain_mem(domain);
1440 }
1441
1442 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1443                                  u8 bus, u8 devfn, int translation)
1444 {
1445         struct context_entry *context;
1446         unsigned long flags;
1447         struct intel_iommu *iommu;
1448         struct dma_pte *pgd;
1449         unsigned long num;
1450         unsigned long ndomains;
1451         int id;
1452         int agaw;
1453         struct device_domain_info *info = NULL;
1454
1455         pr_debug("Set context mapping for %02x:%02x.%d\n",
1456                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1457
1458         BUG_ON(!domain->pgd);
1459         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1460                translation != CONTEXT_TT_MULTI_LEVEL);
1461
1462         iommu = device_to_iommu(segment, bus, devfn);
1463         if (!iommu)
1464                 return -ENODEV;
1465
1466         context = device_to_context_entry(iommu, bus, devfn);
1467         if (!context)
1468                 return -ENOMEM;
1469         spin_lock_irqsave(&iommu->lock, flags);
1470         if (context_present(context)) {
1471                 spin_unlock_irqrestore(&iommu->lock, flags);
1472                 return 0;
1473         }
1474
1475         id = domain->id;
1476         pgd = domain->pgd;
1477
1478         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1479             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1480                 int found = 0;
1481
1482                 /* find an available domain id for this device in iommu */
1483                 ndomains = cap_ndoms(iommu->cap);
1484                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1485                         if (iommu->domains[num] == domain) {
1486                                 id = num;
1487                                 found = 1;
1488                                 break;
1489                         }
1490                 }
1491
1492                 if (found == 0) {
1493                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1494                         if (num >= ndomains) {
1495                                 spin_unlock_irqrestore(&iommu->lock, flags);
1496                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1497                                 return -EFAULT;
1498                         }
1499
1500                         set_bit(num, iommu->domain_ids);
1501                         iommu->domains[num] = domain;
1502                         id = num;
1503                 }
1504
1505                 /* Skip top levels of page tables for
1506                  * iommus which have a smaller agaw than the default.
1507                  * Unnecessary for PT mode.
1508                  */
1509                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1510                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1511                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1512                                 if (!dma_pte_present(pgd)) {
1513                                         spin_unlock_irqrestore(&iommu->lock, flags);
1514                                         return -ENOMEM;
1515                                 }
1516                         }
1517                 }
1518         }
1519
1520         context_set_domain_id(context, id);
1521
1522         if (translation != CONTEXT_TT_PASS_THROUGH) {
1523                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1524                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1525                                      CONTEXT_TT_MULTI_LEVEL;
1526         }
1527         /*
1528          * In pass through mode, AW must be programmed to indicate the largest
1529          * AGAW value supported by hardware. And ASR is ignored by hardware.
1530          */
1531         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1532                 context_set_address_width(context, iommu->msagaw);
1533         else {
1534                 context_set_address_root(context, virt_to_phys(pgd));
1535                 context_set_address_width(context, iommu->agaw);
1536         }
1537
1538         context_set_translation_type(context, translation);
1539         context_set_fault_enable(context);
1540         context_set_present(context);
1541         domain_flush_cache(domain, context, sizeof(*context));
1542
1543         /*
1544          * It's a non-present to present mapping. If hardware doesn't cache
1545          * non-present entries, we only need to flush the write-buffer. If it
1546          * _does_ cache non-present entries, then it does so in the special
1547          * domain #0, which we have to flush:
1548          */
1549         if (cap_caching_mode(iommu->cap)) {
1550                 iommu->flush.flush_context(iommu, 0,
1551                                            (((u16)bus) << 8) | devfn,
1552                                            DMA_CCMD_MASK_NOBIT,
1553                                            DMA_CCMD_DEVICE_INVL);
1554                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1555         } else {
1556                 iommu_flush_write_buffer(iommu);
1557         }
1558         iommu_enable_dev_iotlb(info);
1559         spin_unlock_irqrestore(&iommu->lock, flags);
1560
1561         spin_lock_irqsave(&domain->iommu_lock, flags);
1562         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1563                 domain->iommu_count++;
1564                 if (domain->iommu_count == 1)
1565                         domain->nid = iommu->node;
1566                 domain_update_iommu_cap(domain);
1567         }
1568         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1569         return 0;
1570 }
1571
1572 static int
1573 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1574                         int translation)
1575 {
1576         int ret;
1577         struct pci_dev *tmp, *parent;
1578
1579         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1580                                          pdev->bus->number, pdev->devfn,
1581                                          translation);
1582         if (ret)
1583                 return ret;
1584
1585         /* dependent device mapping */
1586         tmp = pci_find_upstream_pcie_bridge(pdev);
1587         if (!tmp)
1588                 return 0;
1589         /* Secondary interface's bus number and devfn 0 */
1590         parent = pdev->bus->self;
1591         while (parent != tmp) {
1592                 ret = domain_context_mapping_one(domain,
1593                                                  pci_domain_nr(parent->bus),
1594                                                  parent->bus->number,
1595                                                  parent->devfn, translation);
1596                 if (ret)
1597                         return ret;
1598                 parent = parent->bus->self;
1599         }
1600         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1601                 return domain_context_mapping_one(domain,
1602                                         pci_domain_nr(tmp->subordinate),
1603                                         tmp->subordinate->number, 0,
1604                                         translation);
1605         else /* this is a legacy PCI bridge */
1606                 return domain_context_mapping_one(domain,
1607                                                   pci_domain_nr(tmp->bus),
1608                                                   tmp->bus->number,
1609                                                   tmp->devfn,
1610                                                   translation);
1611 }
1612
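/*
 * Check whether context entries already exist for @pdev and for every
 * bridge between it and its IOMMU; returns 0 as soon as one is missing.
 */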
1613 static int domain_context_mapped(struct pci_dev *pdev)
1614 {
1615         int ret;
1616         struct pci_dev *tmp, *parent;
1617         struct intel_iommu *iommu;
1618
1619         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1620                                 pdev->devfn);
1621         if (!iommu)
1622                 return -ENODEV;
1623
1624         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1625         if (!ret)
1626                 return ret;
1627         /* dependent device mapping */
1628         tmp = pci_find_upstream_pcie_bridge(pdev);
1629         if (!tmp)
1630                 return ret;
1631         /* Secondary interface's bus number and devfn 0 */
1632         parent = pdev->bus->self;
1633         while (parent != tmp) {
1634                 ret = device_context_mapped(iommu, parent->bus->number,
1635                                             parent->devfn);
1636                 if (!ret)
1637                         return ret;
1638                 parent = parent->bus->self;
1639         }
1640         if (pci_is_pcie(tmp))
1641                 return device_context_mapped(iommu, tmp->subordinate->number,
1642                                              0);
1643         else
1644                 return device_context_mapped(iommu, tmp->bus->number,
1645                                              tmp->devfn);
1646 }
1647
1648 /* Returns a number of VTD pages, but aligned to MM page size */
1649 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1650                                             size_t size)
1651 {
1652         host_addr &= ~PAGE_MASK;
1653         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1654 }
1655
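/*
 * Fill in PTEs for @nr_pages starting at IOVA page @iov_pfn.  The physical
 * pages come either from @sg (scatterlist mapping) or from the contiguous
 * range starting at @phys_pfn.  The CPU cache is flushed (when required)
 * for each completed page-table page.
 */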
1656 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1657                             struct scatterlist *sg, unsigned long phys_pfn,
1658                             unsigned long nr_pages, int prot)
1659 {
1660         struct dma_pte *first_pte = NULL, *pte = NULL;
1661         phys_addr_t uninitialized_var(pteval);
1662         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1663         unsigned long sg_res;
1664
1665         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1666
1667         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1668                 return -EINVAL;
1669
1670         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1671
1672         if (sg)
1673                 sg_res = 0;
1674         else {
1675                 sg_res = nr_pages + 1;
1676                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1677         }
1678
1679         while (nr_pages--) {
1680                 uint64_t tmp;
1681
1682                 if (!sg_res) {
1683                         sg_res = aligned_nrpages(sg->offset, sg->length);
1684                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1685                         sg->dma_length = sg->length;
1686                         pteval = page_to_phys(sg_page(sg)) | prot;
1687                 }
1688                 if (!pte) {
1689                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1690                         if (!pte)
1691                                 return -ENOMEM;
1692                 }
1693                 /* We don't need a lock here; nobody else
1694                  * touches this iova range.
1695                  */
1696                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1697                 if (tmp) {
1698                         static int dumps = 5;
1699                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1700                                iov_pfn, tmp, (unsigned long long)pteval);
1701                         if (dumps) {
1702                                 dumps--;
1703                                 debug_dma_dump_mappings(NULL);
1704                         }
1705                         WARN_ON(1);
1706                 }
1707                 pte++;
1708                 if (!nr_pages || first_pte_in_page(pte)) {
1709                         domain_flush_cache(domain, first_pte,
1710                                            (void *)pte - (void *)first_pte);
1711                         pte = NULL;
1712                 }
1713                 iov_pfn++;
1714                 pteval += VTD_PAGE_SIZE;
1715                 sg_res--;
1716                 if (!sg_res)
1717                         sg = sg_next(sg);
1718         }
1719         return 0;
1720 }
1721
1722 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1723                                     struct scatterlist *sg, unsigned long nr_pages,
1724                                     int prot)
1725 {
1726         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1727 }
1728
1729 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1730                                      unsigned long phys_pfn, unsigned long nr_pages,
1731                                      int prot)
1732 {
1733         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1734 }
1735
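/*
 * Clear the context entry for (@bus, @devfn) and invalidate the context
 * cache and IOTLB globally on @iommu.
 */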
1736 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1737 {
1738         if (!iommu)
1739                 return;
1740
1741         clear_context_table(iommu, bus, devfn);
1742         iommu->flush.flush_context(iommu, 0, 0, 0,
1743                                            DMA_CCMD_GLOBAL_INVL);
1744         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1745 }
1746
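/*
 * Detach every device from @domain: unlink its device_domain_info, clear
 * the per-device archdata pointer, disable its device-IOTLB and tear down
 * its context entry.
 */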
1747 static void domain_remove_dev_info(struct dmar_domain *domain)
1748 {
1749         struct device_domain_info *info;
1750         unsigned long flags;
1751         struct intel_iommu *iommu;
1752
1753         spin_lock_irqsave(&device_domain_lock, flags);
1754         while (!list_empty(&domain->devices)) {
1755                 info = list_entry(domain->devices.next,
1756                         struct device_domain_info, link);
1757                 list_del(&info->link);
1758                 list_del(&info->global);
1759                 if (info->dev)
1760                         info->dev->dev.archdata.iommu = NULL;
1761                 spin_unlock_irqrestore(&device_domain_lock, flags);
1762
1763                 iommu_disable_dev_iotlb(info);
1764                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1765                 iommu_detach_dev(iommu, info->bus, info->devfn);
1766                 free_devinfo_mem(info);
1767
1768                 spin_lock_irqsave(&device_domain_lock, flags);
1769         }
1770         spin_unlock_irqrestore(&device_domain_lock, flags);
1771 }
1772
1773 /*
1774  * find_domain
1775  * Note: struct pci_dev->dev.archdata.iommu is used to store the domain info
1776  */
1777 static struct dmar_domain *
1778 find_domain(struct pci_dev *pdev)
1779 {
1780         struct device_domain_info *info;
1781
1782         /* No lock here, assumes no domain exit in normal case */
1783         info = pdev->dev.archdata.iommu;
1784         if (info)
1785                 return info->domain;
1786         return NULL;
1787 }
1788
1789 /* Find the domain for a device, allocating and initializing one if necessary */
1790 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1791 {
1792         struct dmar_domain *domain, *found = NULL;
1793         struct intel_iommu *iommu;
1794         struct dmar_drhd_unit *drhd;
1795         struct device_domain_info *info, *tmp;
1796         struct pci_dev *dev_tmp;
1797         unsigned long flags;
1798         int bus = 0, devfn = 0;
1799         int segment;
1800         int ret;
1801
1802         domain = find_domain(pdev);
1803         if (domain)
1804                 return domain;
1805
1806         segment = pci_domain_nr(pdev->bus);
1807
1808         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1809         if (dev_tmp) {
1810                 if (pci_is_pcie(dev_tmp)) {
1811                         bus = dev_tmp->subordinate->number;
1812                         devfn = 0;
1813                 } else {
1814                         bus = dev_tmp->bus->number;
1815                         devfn = dev_tmp->devfn;
1816                 }
1817                 spin_lock_irqsave(&device_domain_lock, flags);
1818                 list_for_each_entry(info, &device_domain_list, global) {
1819                         if (info->segment == segment &&
1820                             info->bus == bus && info->devfn == devfn) {
1821                                 found = info->domain;
1822                                 break;
1823                         }
1824                 }
1825                 spin_unlock_irqrestore(&device_domain_lock, flags);
1826                 /* pcie-pci bridge already has a domain, use it */
1827                 if (found) {
1828                         domain = found;
1829                         goto found_domain;
1830                 }
1831         }
1832
1833         domain = alloc_domain();
1834         if (!domain)
1835                 goto error;
1836
1837         /* Allocate new domain for the device */
1838         drhd = dmar_find_matched_drhd_unit(pdev);
1839         if (!drhd) {
1840                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1841                         pci_name(pdev));
1842                 return NULL;
1843         }
1844         iommu = drhd->iommu;
1845
1846         ret = iommu_attach_domain(domain, iommu);
1847         if (ret) {
1848                 free_domain_mem(domain);
1849                 goto error;
1850         }
1851
1852         if (domain_init(domain, gaw)) {
1853                 domain_exit(domain);
1854                 goto error;
1855         }
1856
1857         /* register pcie-to-pci device */
1858         if (dev_tmp) {
1859                 info = alloc_devinfo_mem();
1860                 if (!info) {
1861                         domain_exit(domain);
1862                         goto error;
1863                 }
1864                 info->segment = segment;
1865                 info->bus = bus;
1866                 info->devfn = devfn;
1867                 info->dev = NULL;
1868                 info->domain = domain;
1869                 /* This domain is shared by devices under p2p bridge */
1870                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1871
1872                 /* pcie-to-pci bridge already has a domain, use it */
1873                 found = NULL;
1874                 spin_lock_irqsave(&device_domain_lock, flags);
1875                 list_for_each_entry(tmp, &device_domain_list, global) {
1876                         if (tmp->segment == segment &&
1877                             tmp->bus == bus && tmp->devfn == devfn) {
1878                                 found = tmp->domain;
1879                                 break;
1880                         }
1881                 }
1882                 if (found) {
1883                         spin_unlock_irqrestore(&device_domain_lock, flags);
1884                         free_devinfo_mem(info);
1885                         domain_exit(domain);
1886                         domain = found;
1887                 } else {
1888                         list_add(&info->link, &domain->devices);
1889                         list_add(&info->global, &device_domain_list);
1890                         spin_unlock_irqrestore(&device_domain_lock, flags);
1891                 }
1892         }
1893
1894 found_domain:
1895         info = alloc_devinfo_mem();
1896         if (!info)
1897                 goto error;
1898         info->segment = segment;
1899         info->bus = pdev->bus->number;
1900         info->devfn = pdev->devfn;
1901         info->dev = pdev;
1902         info->domain = domain;
1903         spin_lock_irqsave(&device_domain_lock, flags);
1904         /* somebody else beat us to it */
1905         found = find_domain(pdev);
1906         if (found != NULL) {
1907                 spin_unlock_irqrestore(&device_domain_lock, flags);
1908                 if (found != domain) {
1909                         domain_exit(domain);
1910                         domain = found;
1911                 }
1912                 free_devinfo_mem(info);
1913                 return domain;
1914         }
1915         list_add(&info->link, &domain->devices);
1916         list_add(&info->global, &device_domain_list);
1917         pdev->dev.archdata.iommu = info;
1918         spin_unlock_irqrestore(&device_domain_lock, flags);
1919         return domain;
1920 error:
1921         /* recheck it here, maybe others set it */
1922         return find_domain(pdev);
1923 }
1924
1925 static int iommu_identity_mapping;
1926 #define IDENTMAP_ALL            1
1927 #define IDENTMAP_GFX            2
1928 #define IDENTMAP_AZALIA         4
1929
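/*
 * Reserve the IOVA range covering [@start, @end] in @domain and install a
 * 1:1 (identity) mapping for it, clearing any PTEs that already cover the
 * range first.
 */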
1930 static int iommu_domain_identity_map(struct dmar_domain *domain,
1931                                      unsigned long long start,
1932                                      unsigned long long end)
1933 {
1934         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1935         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1936
1937         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1938                           dma_to_mm_pfn(last_vpfn))) {
1939                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1940                 return -ENOMEM;
1941         }
1942
1943         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1944                  start, end, domain->id);
1945         /*
1946          * RMRR range might have overlap with physical memory range,
1947          * clear it first
1948          */
1949         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1950
1951         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1952                                   last_vpfn - first_vpfn + 1,
1953                                   DMA_PTE_READ|DMA_PTE_WRITE);
1954 }
1955
1956 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1957                                       unsigned long long start,
1958                                       unsigned long long end)
1959 {
1960         struct dmar_domain *domain;
1961         int ret;
1962
1963         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1964         if (!domain)
1965                 return -ENOMEM;
1966
1967         /* For _hardware_ passthrough, don't bother. But for software
1968            passthrough, we do it anyway -- it may indicate a memory
1969            range which is reserved in E820, and so didn't get set
1970            up to start with in si_domain */
1971         if (domain == si_domain && hw_pass_through) {
1972                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1973                        pci_name(pdev), start, end);
1974                 return 0;
1975         }
1976
1977         printk(KERN_INFO
1978                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1979                pci_name(pdev), start, end);
1980
1981         if (end < start) {
1982                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1983                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1984                         dmi_get_system_info(DMI_BIOS_VENDOR),
1985                         dmi_get_system_info(DMI_BIOS_VERSION),
1986                      dmi_get_system_info(DMI_PRODUCT_VERSION));
1987                 ret = -EIO;
1988                 goto error;
1989         }
1990
1991         if (end >> agaw_to_width(domain->agaw)) {
1992                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1993                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1994                      agaw_to_width(domain->agaw),
1995                      dmi_get_system_info(DMI_BIOS_VENDOR),
1996                      dmi_get_system_info(DMI_BIOS_VERSION),
1997                      dmi_get_system_info(DMI_PRODUCT_VERSION));
1998                 ret = -EIO;
1999                 goto error;
2000         }
2001
2002         ret = iommu_domain_identity_map(domain, start, end);
2003         if (ret)
2004                 goto error;
2005
2006         /* context entry init */
2007         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2008         if (ret)
2009                 goto error;
2010
2011         return 0;
2012
2013  error:
2014         domain_exit(domain);
2015         return ret;
2016 }
2017
2018 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2019         struct pci_dev *pdev)
2020 {
2021         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2022                 return 0;
2023         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2024                 rmrr->end_address + 1);
2025 }
2026
2027 #ifdef CONFIG_DMAR_FLOPPY_WA
2028 static inline void iommu_prepare_isa(void)
2029 {
2030         struct pci_dev *pdev;
2031         int ret;
2032
2033         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2034         if (!pdev)
2035                 return;
2036
2037         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2038         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2039
2040         if (ret)
2041                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2042                        "floppy might not work\n");
2043
2044 }
2045 #else
2046 static inline void iommu_prepare_isa(void)
2047 {
2048         return;
2049 }
2050 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2051
2052 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2053
2054 static int __init si_domain_work_fn(unsigned long start_pfn,
2055                                     unsigned long end_pfn, void *datax)
2056 {
2057         int *ret = datax;
2058
2059         *ret = iommu_domain_identity_map(si_domain,
2060                                          (uint64_t)start_pfn << PAGE_SHIFT,
2061                                          (uint64_t)end_pfn << PAGE_SHIFT);
2062         return *ret;
2063
2064 }
2065
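/*
 * Set up the static identity domain (si_domain): attach it to every active
 * IOMMU and, unless hardware pass-through is in use, install 1:1 mappings
 * for all usable memory regions on every online node.
 */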
2066 static int __init si_domain_init(int hw)
2067 {
2068         struct dmar_drhd_unit *drhd;
2069         struct intel_iommu *iommu;
2070         int nid, ret = 0;
2071
2072         si_domain = alloc_domain();
2073         if (!si_domain)
2074                 return -EFAULT;
2075
2076         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2077
2078         for_each_active_iommu(iommu, drhd) {
2079                 ret = iommu_attach_domain(si_domain, iommu);
2080                 if (ret) {
2081                         domain_exit(si_domain);
2082                         return -EFAULT;
2083                 }
2084         }
2085
2086         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2087                 domain_exit(si_domain);
2088                 return -EFAULT;
2089         }
2090
2091         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2092
2093         if (hw)
2094                 return 0;
2095
2096         for_each_online_node(nid) {
2097                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2098                 if (ret)
2099                         return ret;
2100         }
2101
2102         return 0;
2103 }
2104
2105 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2106                                           struct pci_dev *pdev);
2107 static int identity_mapping(struct pci_dev *pdev)
2108 {
2109         struct device_domain_info *info;
2110
2111         if (likely(!iommu_identity_mapping))
2112                 return 0;
2113
2114
2115         list_for_each_entry(info, &si_domain->devices, link)
2116                 if (info->dev == pdev)
2117                         return 1;
2118         return 0;
2119 }
2120
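/*
 * Attach @pdev to @domain: program its context entry with the requested
 * @translation type and add the device to the domain's device list.
 */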
2121 static int domain_add_dev_info(struct dmar_domain *domain,
2122                                struct pci_dev *pdev,
2123                                int translation)
2124 {
2125         struct device_domain_info *info;
2126         unsigned long flags;
2127         int ret;
2128
2129         info = alloc_devinfo_mem();
2130         if (!info)
2131                 return -ENOMEM;
2132
2133         ret = domain_context_mapping(domain, pdev, translation);
2134         if (ret) {
2135                 free_devinfo_mem(info);
2136                 return ret;
2137         }
2138
2139         info->segment = pci_domain_nr(pdev->bus);
2140         info->bus = pdev->bus->number;
2141         info->devfn = pdev->devfn;
2142         info->dev = pdev;
2143         info->domain = domain;
2144
2145         spin_lock_irqsave(&device_domain_lock, flags);
2146         list_add(&info->link, &domain->devices);
2147         list_add(&info->global, &device_domain_list);
2148         pdev->dev.archdata.iommu = info;
2149         spin_unlock_irqrestore(&device_domain_lock, flags);
2150
2151         return 0;
2152 }
2153
2154 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2155 {
2156         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2157                 return 1;
2158
2159         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2160                 return 1;
2161
2162         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2163                 return 0;
2164
2165         /*
2166          * We want to start off with all devices in the 1:1 domain, and
2167          * take them out later if we find they can't access all of memory.
2168          *
2169          * However, we can't do this for PCI devices behind bridges,
2170          * because all PCI devices behind the same bridge will end up
2171          * with the same source-id on their transactions.
2172          *
2173          * Practically speaking, we can't change things around for these
2174          * devices at run-time, because we can't be sure there'll be no
2175          * DMA transactions in flight for any of their siblings.
2176          * 
2177          * So PCI devices (unless they're on the root bus) as well as
2178          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2179          * the 1:1 domain, just in _case_ one of their siblings turns out
2180          * not to be able to map all of memory.
2181          */
2182         if (!pci_is_pcie(pdev)) {
2183                 if (!pci_is_root_bus(pdev->bus))
2184                         return 0;
2185                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2186                         return 0;
2187         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2188                 return 0;
2189
2190         /* 
2191          * At boot time, we don't yet know if devices will be 64-bit capable.
2192          * Assume that they will -- if they turn out not to be, then we can 
2193          * take them out of the 1:1 domain later.
2194          */
2195         if (!startup)
2196                 return pdev->dma_mask > DMA_BIT_MASK(32);
2197
2198         return 1;
2199 }
2200
2201 static int __init iommu_prepare_static_identity_mapping(int hw)
2202 {
2203         struct pci_dev *pdev = NULL;
2204         int ret;
2205
2206         ret = si_domain_init(hw);
2207         if (ret)
2208                 return -EFAULT;
2209
2210         for_each_pci_dev(pdev) {
2211                 if (iommu_should_identity_map(pdev, 1)) {
2212                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2213                                hw ? "hardware" : "software", pci_name(pdev));
2214
2215                         ret = domain_add_dev_info(si_domain, pdev,
2216                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2217                                                      CONTEXT_TT_MULTI_LEVEL);
2218                         if (ret)
2219                                 return ret;
2220                 }
2221         }
2222
2223         return 0;
2224 }
2225
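/*
 * One-time DMAR initialization: allocate per-IOMMU state, pick the
 * invalidation method, set up identity, RMRR and ISA mappings, then enable
 * fault reporting and translation on every active unit.
 */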
2226 static int __init init_dmars(void)
2227 {
2228         struct dmar_drhd_unit *drhd;
2229         struct dmar_rmrr_unit *rmrr;
2230         struct pci_dev *pdev;
2231         struct intel_iommu *iommu;
2232         int i, ret;
2233
2234         /*
2235          * for each drhd
2236          *    allocate root
2237          *    initialize and program root entry to not present
2238          * endfor
2239          */
2240         for_each_drhd_unit(drhd) {
2241                 g_num_of_iommus++;
2242                 /*
2243                  * lock not needed as this is only incremented in the
2244                  * single-threaded kernel __init code path; all other
2245                  * accesses are read-only
2246                  */
2247         }
2248
2249         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2250                         GFP_KERNEL);
2251         if (!g_iommus) {
2252                 printk(KERN_ERR "Allocating global iommu array failed\n");
2253                 ret = -ENOMEM;
2254                 goto error;
2255         }
2256
2257         deferred_flush = kzalloc(g_num_of_iommus *
2258                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2259         if (!deferred_flush) {
2260                 ret = -ENOMEM;
2261                 goto error;
2262         }
2263
2264         for_each_drhd_unit(drhd) {
2265                 if (drhd->ignored)
2266                         continue;
2267
2268                 iommu = drhd->iommu;
2269                 g_iommus[iommu->seq_id] = iommu;
2270
2271                 ret = iommu_init_domains(iommu);
2272                 if (ret)
2273                         goto error;
2274
2275                 /*
2276                  * TBD:
2277                  * we could share the same root & context tables
2278                  * among all IOMMUs; need to split this out later.
2279                  */
2280                 ret = iommu_alloc_root_entry(iommu);
2281                 if (ret) {
2282                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2283                         goto error;
2284                 }
2285                 if (!ecap_pass_through(iommu->ecap))
2286                         hw_pass_through = 0;
2287         }
2288
2289         /*
2290          * Start from a sane iommu hardware state.
2291          */
2292         for_each_drhd_unit(drhd) {
2293                 if (drhd->ignored)
2294                         continue;
2295
2296                 iommu = drhd->iommu;
2297
2298                 /*
2299                  * If queued invalidation was already initialized by us
2300                  * (for example, while enabling interrupt-remapping), then
2301                  * things are already rolling from a sane state.
2302                  */
2303                 if (iommu->qi)
2304                         continue;
2305
2306                 /*
2307                  * Clear any previous faults.
2308                  */
2309                 dmar_fault(-1, iommu);
2310                 /*
2311                  * Disable queued invalidation if supported and already enabled
2312                  * before OS handover.
2313                  */
2314                 dmar_disable_qi(iommu);
2315         }
2316
2317         for_each_drhd_unit(drhd) {
2318                 if (drhd->ignored)
2319                         continue;
2320
2321                 iommu = drhd->iommu;
2322
2323                 if (dmar_enable_qi(iommu)) {
2324                         /*
2325                          * Queued Invalidate not enabled, use Register Based
2326                          * Invalidate
2327                          */
2328                         iommu->flush.flush_context = __iommu_flush_context;
2329                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2330                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2331                                "invalidation\n",
2332                                 iommu->seq_id,
2333                                (unsigned long long)drhd->reg_base_addr);
2334                 } else {
2335                         iommu->flush.flush_context = qi_flush_context;
2336                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2337                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2338                                "invalidation\n",
2339                                 iommu->seq_id,
2340                                (unsigned long long)drhd->reg_base_addr);
2341                 }
2342         }
2343
2344         if (iommu_pass_through)
2345                 iommu_identity_mapping |= IDENTMAP_ALL;
2346
2347 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2348         iommu_identity_mapping |= IDENTMAP_GFX;
2349 #endif
2350
2351         check_tylersburg_isoch();
2352
2353         /*
2354          * If any identity mapping was requested (IDENTMAP_ALL, IDENTMAP_GFX
2355          * or IDENTMAP_AZALIA), set up the static identity (si) domain now
2356          * and add the qualifying devices to it, using hardware pass-through
2357          * where available.
2358          */
2358         if (iommu_identity_mapping) {
2359                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2360                 if (ret) {
2361                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2362                         goto error;
2363                 }
2364         }
2365         /*
2366          * For each rmrr
2367          *   for each dev attached to rmrr
2368          *   do
2369          *     locate drhd for dev, alloc domain for dev
2370          *     allocate free domain
2371          *     allocate page table entries for rmrr
2372          *     if context not allocated for bus
2373          *           allocate and init context
2374          *           set present in root table for this bus
2375          *     init context with domain, translation etc
2376          *    endfor
2377          * endfor
2378          */
2379         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2380         for_each_rmrr_units(rmrr) {
2381                 for (i = 0; i < rmrr->devices_cnt; i++) {
2382                         pdev = rmrr->devices[i];
2383                         /*
2384                          * some BIOSes list non-existent devices in the
2385                          * DMAR table.
2386                          */
2387                         if (!pdev)
2388                                 continue;
2389                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2390                         if (ret)
2391                                 printk(KERN_ERR
2392                                        "IOMMU: mapping reserved region failed\n");
2393                 }
2394         }
2395
2396         iommu_prepare_isa();
2397
2398         /*
2399          * for each drhd
2400          *   enable fault log
2401          *   global invalidate context cache
2402          *   global invalidate iotlb
2403          *   enable translation
2404          */
2405         for_each_drhd_unit(drhd) {
2406                 if (drhd->ignored) {
2407                         /*
2408                          * we always have to disable PMRs or DMA may fail on
2409                          * this device
2410                          */
2411                         if (force_on)
2412                                 iommu_disable_protect_mem_regions(drhd->iommu);
2413                         continue;
2414                 }
2415                 iommu = drhd->iommu;
2416
2417                 iommu_flush_write_buffer(iommu);
2418
2419                 ret = dmar_set_interrupt(iommu);
2420                 if (ret)
2421                         goto error;
2422
2423                 iommu_set_root_entry(iommu);
2424
2425                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2426                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2427
2428                 ret = iommu_enable_translation(iommu);
2429                 if (ret)
2430                         goto error;
2431
2432                 iommu_disable_protect_mem_regions(iommu);
2433         }
2434
2435         return 0;
2436 error:
2437         for_each_drhd_unit(drhd) {
2438                 if (drhd->ignored)
2439                         continue;
2440                 iommu = drhd->iommu;
2441                 free_iommu(iommu);
2442         }
2443         kfree(g_iommus);
2444         return ret;
2445 }
2446
2447 /* This takes a number of _MM_ pages, not VTD pages */
2448 static struct iova *intel_alloc_iova(struct device *dev,
2449                                      struct dmar_domain *domain,
2450                                      unsigned long nrpages, uint64_t dma_mask)
2451 {
2452         struct pci_dev *pdev = to_pci_dev(dev);
2453         struct iova *iova = NULL;
2454
2455         /* Restrict dma_mask to the width that the iommu can handle */
2456         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2457
2458         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2459                 /*
2460                  * First try to allocate an io virtual address in
2461                  * DMA_BIT_MASK(32) and if that fails then try allocating
2462                  * from higher range
2463                  */
2464                 iova = alloc_iova(&domain->iovad, nrpages,
2465                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2466                 if (iova)
2467                         return iova;
2468         }
2469         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2470         if (unlikely(!iova)) {
2471                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2472                        nrpages, pci_name(pdev));
2473                 return NULL;
2474         }
2475
2476         return iova;
2477 }
2478
2479 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2480 {
2481         struct dmar_domain *domain;
2482         int ret;
2483
2484         domain = get_domain_for_dev(pdev,
2485                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2486         if (!domain) {
2487                 printk(KERN_ERR
2488                         "Allocating domain for %s failed\n", pci_name(pdev));
2489                 return NULL;
2490         }
2491
2492         /* make sure context mapping is ok */
2493         if (unlikely(!domain_context_mapped(pdev))) {
2494                 ret = domain_context_mapping(domain, pdev,
2495                                              CONTEXT_TT_MULTI_LEVEL);
2496                 if (ret) {
2497                         printk(KERN_ERR
2498                                 "Domain context map for %s failed\n",
2499                                 pci_name(pdev));
2500                         return NULL;
2501                 }
2502         }
2503
2504         return domain;
2505 }
2506
2507 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2508 {
2509         struct device_domain_info *info;
2510
2511         /* No lock here, assumes no domain exit in normal case */
2512         info = dev->dev.archdata.iommu;
2513         if (likely(info))
2514                 return info->domain;
2515
2516         return __get_valid_domain_for_dev(dev);
2517 }
2518
2519 static int iommu_dummy(struct pci_dev *pdev)
2520 {
2521         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2522 }
2523
2524 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2525 static int iommu_no_mapping(struct device *dev)
2526 {
2527         struct pci_dev *pdev;
2528         int found;
2529
2530         if (unlikely(dev->bus != &pci_bus_type))
2531                 return 1;
2532
2533         pdev = to_pci_dev(dev);
2534         if (iommu_dummy(pdev))
2535                 return 1;
2536
2537         if (!iommu_identity_mapping)
2538                 return 0;
2539
2540         found = identity_mapping(pdev);
2541         if (found) {
2542                 if (iommu_should_identity_map(pdev, 0))
2543                         return 1;
2544                 else {
2545                         /*
2546                          * The 32-bit DMA device is removed from si_domain
2547                          * and falls back to non-identity mapping.
2548                          */
2549                         domain_remove_one_dev_info(si_domain, pdev);
2550                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2551                                pci_name(pdev));
2552                         return 0;
2553                 }
2554         } else {
2555                 /*
2556                  * When a 64-bit DMA device is detached from a VM, the
2557                  * device is put back into si_domain for identity mapping.
2558                  */
2559                 if (iommu_should_identity_map(pdev, 0)) {
2560                         int ret;
2561                         ret = domain_add_dev_info(si_domain, pdev,
2562                                                   hw_pass_through ?
2563                                                   CONTEXT_TT_PASS_THROUGH :
2564                                                   CONTEXT_TT_MULTI_LEVEL);
2565                         if (!ret) {
2566                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2567                                        pci_name(pdev));
2568                                 return 1;
2569                         }
2570                 }
2571         }
2572
2573         return 0;
2574 }
2575
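/*
 * Map @size bytes at physical address @paddr for @hwdev: allocate an IOVA
 * range below @dma_mask, install the page-table entries and flush the
 * IOTLB (in caching mode) or the write buffer.  Returns the DMA address,
 * or 0 on failure.
 */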
2576 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2577                                      size_t size, int dir, u64 dma_mask)
2578 {
2579         struct pci_dev *pdev = to_pci_dev(hwdev);
2580         struct dmar_domain *domain;
2581         phys_addr_t start_paddr;
2582         struct iova *iova;
2583         int prot = 0;
2584         int ret;
2585         struct intel_iommu *iommu;
2586         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2587
2588         BUG_ON(dir == DMA_NONE);
2589
2590         if (iommu_no_mapping(hwdev))
2591                 return paddr;
2592
2593         domain = get_valid_domain_for_dev(pdev);
2594         if (!domain)
2595                 return 0;
2596
2597         iommu = domain_get_iommu(domain);
2598         size = aligned_nrpages(paddr, size);
2599
2600         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2601                                 pdev->dma_mask);
2602         if (!iova)
2603                 goto error;
2604
2605         /*
2606          * Check if DMAR supports zero-length reads on write-only
2607          * mappings.
2608          */
2609         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2610                         !cap_zlr(iommu->cap))
2611                 prot |= DMA_PTE_READ;
2612         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2613                 prot |= DMA_PTE_WRITE;
2614         /*
2615          * paddr to (paddr + size) might cover only part of a page, but we
2616          * map the whole page.  Note: if two parts of one page are mapped
2617          * separately, we might have two guest addresses mapping to the same
2618          * host paddr, but this is not a big problem
2619          */
2620         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2621                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2622         if (ret)
2623                 goto error;
2624
2625         /* it's a non-present to present mapping. Only flush if caching mode */
2626         if (cap_caching_mode(iommu->cap))
2627                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2628         else
2629                 iommu_flush_write_buffer(iommu);
2630
2631         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2632         start_paddr += paddr & ~PAGE_MASK;
2633         return start_paddr;
2634
2635 error:
2636         if (iova)
2637                 __free_iova(&domain->iovad, iova);
2638         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2639                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2640         return 0;
2641 }
2642
2643 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2644                                  unsigned long offset, size_t size,
2645                                  enum dma_data_direction dir,
2646                                  struct dma_attrs *attrs)
2647 {
2648         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2649                                   dir, to_pci_dev(dev)->dma_mask);
2650 }
2651
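/*
 * Flush all pending deferred unmaps: invalidate the IOTLB for each batched
 * IOVA range and hand the ranges back to their domains.  Called with
 * async_umap_flush_lock held.
 */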
2652 static void flush_unmaps(void)
2653 {
2654         int i, j;
2655
2656         timer_on = 0;
2657
2658         /* just flush them all */
2659         for (i = 0; i < g_num_of_iommus; i++) {
2660                 struct intel_iommu *iommu = g_iommus[i];
2661                 if (!iommu)
2662                         continue;
2663
2664                 if (!deferred_flush[i].next)
2665                         continue;
2666
2667                 /* In caching mode, global flushes make emulation expensive */
2668                 if (!cap_caching_mode(iommu->cap))
2669                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2670                                          DMA_TLB_GLOBAL_FLUSH);
2671                 for (j = 0; j < deferred_flush[i].next; j++) {
2672                         unsigned long mask;
2673                         struct iova *iova = deferred_flush[i].iova[j];
2674                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2675
2676                         /* On real hardware multiple invalidations are expensive */
2677                         if (cap_caching_mode(iommu->cap))
2678                                 iommu_flush_iotlb_psi(iommu, domain->id,
2679                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2680                         else {
2681                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2682                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2683                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2684                         }
2685                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2686                 }
2687                 deferred_flush[i].next = 0;
2688         }
2689
2690         list_size = 0;
2691 }
2692
2693 static void flush_unmaps_timeout(unsigned long data)
2694 {
2695         unsigned long flags;
2696
2697         spin_lock_irqsave(&async_umap_flush_lock, flags);
2698         flush_unmaps();
2699         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2700 }
2701
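/*
 * Queue an IOVA range for deferred freeing.  The batch is flushed from a
 * timer, or immediately once HIGH_WATER_MARK entries have accumulated.
 */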
2702 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2703 {
2704         unsigned long flags;
2705         int next, iommu_id;
2706         struct intel_iommu *iommu;
2707
2708         spin_lock_irqsave(&async_umap_flush_lock, flags);
2709         if (list_size == HIGH_WATER_MARK)
2710                 flush_unmaps();
2711
2712         iommu = domain_get_iommu(dom);
2713         iommu_id = iommu->seq_id;
2714
2715         next = deferred_flush[iommu_id].next;
2716         deferred_flush[iommu_id].domain[next] = dom;
2717         deferred_flush[iommu_id].iova[next] = iova;
2718         deferred_flush[iommu_id].next++;
2719
2720         if (!timer_on) {
2721                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2722                 timer_on = 1;
2723         }
2724         list_size++;
2725         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2726 }
2727
2728 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2729                              size_t size, enum dma_data_direction dir,
2730                              struct dma_attrs *attrs)
2731 {
2732         struct pci_dev *pdev = to_pci_dev(dev);
2733         struct dmar_domain *domain;
2734         unsigned long start_pfn, last_pfn;
2735         struct iova *iova;
2736         struct intel_iommu *iommu;
2737
2738         if (iommu_no_mapping(dev))
2739                 return;
2740
2741         domain = find_domain(pdev);
2742         BUG_ON(!domain);
2743
2744         iommu = domain_get_iommu(domain);
2745
2746         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2747         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2748                       (unsigned long long)dev_addr))
2749                 return;
2750
2751         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2752         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2753
2754         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2755                  pci_name(pdev), start_pfn, last_pfn);
2756
2757         /*  clear the whole page */
2758         dma_pte_clear_range(domain, start_pfn, last_pfn);
2759
2760         /* free page tables */
2761         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2762
2763         if (intel_iommu_strict) {
2764                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2765                                       last_pfn - start_pfn + 1, 0);
2766                 /* free iova */
2767                 __free_iova(&domain->iovad, iova);
2768         } else {
2769                 add_unmap(domain, iova);
2770                 /*
2771                  * queue up the release of the unmap to save roughly 1/6th
2772                  * of the cpu used up by the iotlb flush operation...
2773                  */
2774         }
2775 }
2776
2777 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2778                                   dma_addr_t *dma_handle, gfp_t flags)
2779 {
2780         void *vaddr;
2781         int order;
2782
2783         size = PAGE_ALIGN(size);
2784         order = get_order(size);
2785
2786         if (!iommu_no_mapping(hwdev))
2787                 flags &= ~(GFP_DMA | GFP_DMA32);
2788         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2789                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2790                         flags |= GFP_DMA;
2791                 else
2792                         flags |= GFP_DMA32;
2793         }
2794
2795         vaddr = (void *)__get_free_pages(flags, order);
2796         if (!vaddr)
2797                 return NULL;
2798         memset(vaddr, 0, size);
2799
2800         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2801                                          DMA_BIDIRECTIONAL,
2802                                          hwdev->coherent_dma_mask);
2803         if (*dma_handle)
2804                 return vaddr;
2805         free_pages((unsigned long)vaddr, order);
2806         return NULL;
2807 }
2808
2809 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2810                                 dma_addr_t dma_handle)
2811 {
2812         int order;
2813
2814         size = PAGE_ALIGN(size);
2815         order = get_order(size);
2816
2817         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2818         free_pages((unsigned long)vaddr, order);
2819 }
2820
2821 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2822                            int nelems, enum dma_data_direction dir,
2823                            struct dma_attrs *attrs)
2824 {
2825         struct pci_dev *pdev = to_pci_dev(hwdev);
2826         struct dmar_domain *domain;
2827         unsigned long start_pfn, last_pfn;
2828         struct iova *iova;
2829         struct intel_iommu *iommu;
2830
2831         if (iommu_no_mapping(hwdev))
2832                 return;
2833
2834         domain = find_domain(pdev);
2835         BUG_ON(!domain);
2836
2837         iommu = domain_get_iommu(domain);
2838
2839         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2840         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2841                       (unsigned long long)sglist[0].dma_address))
2842                 return;
2843
2844         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2845         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2846
2847         /*  clear the whole page */
2848         dma_pte_clear_range(domain, start_pfn, last_pfn);
2849
2850         /* free page tables */
2851         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2852
2853         if (intel_iommu_strict) {
2854                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2855                                       last_pfn - start_pfn + 1, 0);
2856                 /* free iova */
2857                 __free_iova(&domain->iovad, iova);
2858         } else {
2859                 add_unmap(domain, iova);
2860                 /*
2861                  * queue up the release of the unmap to save roughly 1/6th
2862                  * of the cpu used up by the iotlb flush operation...
2863                  */
2864         }
2865 }
2866
2867 static int intel_nontranslate_map_sg(struct device *hddev,
2868         struct scatterlist *sglist, int nelems, int dir)
2869 {
2870         int i;
2871         struct scatterlist *sg;
2872
2873         for_each_sg(sglist, sg, nelems, i) {
2874                 BUG_ON(!sg_page(sg));
2875                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2876                 sg->dma_length = sg->length;
2877         }
2878         return nelems;
2879 }
2880
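/*
 * DMA API map_sg: allocate a single IOVA range large enough for the whole
 * scatterlist and map every segment into it, falling back to a plain 1:1
 * physical mapping when the device bypasses the IOMMU.
 */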
2881 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2882                         enum dma_data_direction dir, struct dma_attrs *attrs)
2883 {
2884         int i;
2885         struct pci_dev *pdev = to_pci_dev(hwdev);
2886         struct dmar_domain *domain;
2887         size_t size = 0;
2888         int prot = 0;
2889         struct iova *iova = NULL;
2890         int ret;
2891         struct scatterlist *sg;
2892         unsigned long start_vpfn;
2893         struct intel_iommu *iommu;
2894
2895         BUG_ON(dir == DMA_NONE);
2896         if (iommu_no_mapping(hwdev))
2897                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2898
2899         domain = get_valid_domain_for_dev(pdev);
2900         if (!domain)
2901                 return 0;
2902
2903         iommu = domain_get_iommu(domain);
2904
2905         for_each_sg(sglist, sg, nelems, i)
2906                 size += aligned_nrpages(sg->offset, sg->length);
2907
2908         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2909                                 pdev->dma_mask);
2910         if (!iova) {
2911                 sglist->dma_length = 0;
2912                 return 0;
2913         }
2914
2915         /*
2916          * Check if DMAR supports zero-length reads on write-only
2917          * mappings.
2918          */
2919         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2920                         !cap_zlr(iommu->cap))
2921                 prot |= DMA_PTE_READ;
2922         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2923                 prot |= DMA_PTE_WRITE;
2924
2925         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2926
2927         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2928         if (unlikely(ret)) {
2929                 /*  clear the page */
2930                 dma_pte_clear_range(domain, start_vpfn,
2931                                     start_vpfn + size - 1);
2932                 /* free page tables */
2933                 dma_pte_free_pagetable(domain, start_vpfn,
2934                                        start_vpfn + size - 1);
2935                 /* free iova */
2936                 __free_iova(&domain->iovad, iova);
2937                 return 0;
2938         }
2939
2940         /* it's a non-present to present mapping. Only flush if caching mode */
2941         if (cap_caching_mode(iommu->cap))
2942                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
2943         else
2944                 iommu_flush_write_buffer(iommu);
2945
2946         return nelems;
2947 }
2948
2949 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2950 {
2951         return !dma_addr;
2952 }
2953
2954 struct dma_map_ops intel_dma_ops = {
2955         .alloc_coherent = intel_alloc_coherent,
2956         .free_coherent = intel_free_coherent,
2957         .map_sg = intel_map_sg,
2958         .unmap_sg = intel_unmap_sg,
2959         .map_page = intel_map_page,
2960         .unmap_page = intel_unmap_page,
2961         .mapping_error = intel_mapping_error,
2962 };
2963
2964 static inline int iommu_domain_cache_init(void)
2965 {
2966         int ret = 0;
2967
2968         iommu_domain_cache = kmem_cache_create("iommu_domain",
2969                                          sizeof(struct dmar_domain),
2970                                          0,
2971                                          SLAB_HWCACHE_ALIGN,
2973                                          NULL);
2974         if (!iommu_domain_cache) {
2975                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2976                 ret = -ENOMEM;
2977         }
2978
2979         return ret;
2980 }
2981
2982 static inline int iommu_devinfo_cache_init(void)
2983 {
2984         int ret = 0;
2985
2986         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2987                                          sizeof(struct device_domain_info),
2988                                          0,
2989                                          SLAB_HWCACHE_ALIGN,
2990                                          NULL);
2991         if (!iommu_devinfo_cache) {
2992                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2993                 ret = -ENOMEM;
2994         }
2995
2996         return ret;
2997 }
2998
2999 static inline int iommu_iova_cache_init(void)
3000 {
3001         int ret = 0;
3002
3003         iommu_iova_cache = kmem_cache_create("iommu_iova",
3004                                          sizeof(struct iova),
3005                                          0,
3006                                          SLAB_HWCACHE_ALIGN,
3007                                          NULL);
3008         if (!iommu_iova_cache) {
3009                 printk(KERN_ERR "Couldn't create iova cache\n");
3010                 ret = -ENOMEM;
3011         }
3012
3013         return ret;
3014 }
3015
3016 static int __init iommu_init_mempool(void)
3017 {
3018         int ret;
3019         ret = iommu_iova_cache_init();
3020         if (ret)
3021                 return ret;
3022
3023         ret = iommu_domain_cache_init();
3024         if (ret)
3025                 goto domain_error;
3026
3027         ret = iommu_devinfo_cache_init();
3028         if (!ret)
3029                 return ret;
3030
3031         kmem_cache_destroy(iommu_domain_cache);
3032 domain_error:
3033         kmem_cache_destroy(iommu_iova_cache);
3034
3035         return -ENOMEM;
3036 }
3037
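/* Destroy the slab caches created by iommu_init_mempool(). */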
3038 static void __init iommu_exit_mempool(void)
3039 {
3040         kmem_cache_destroy(iommu_devinfo_cache);
3041         kmem_cache_destroy(iommu_domain_cache);
3042         kmem_cache_destroy(iommu_iova_cache);
3043
3044 }
3045
3046 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3047 {
3048         struct dmar_drhd_unit *drhd;
3049         u32 vtbar;
3050         int rc;
3051
3052         /* We know that this device on this chipset has its own IOMMU.
3053          * If we find it under a different IOMMU, then the BIOS is lying
3054          * to us. Hope that the IOMMU for this device is actually
3055          * disabled, and it needs no translation...
3056          */
3057         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3058         if (rc) {
3059                 /* "can't" happen */
3060                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3061                 return;
3062         }
3063         vtbar &= 0xffff0000;
3064
3065         /* we know that this iommu should be at offset 0xa000 from vtbar */
3066         drhd = dmar_find_matched_drhd_unit(pdev);
3067         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3068                             TAINT_FIRMWARE_WORKAROUND,
3069                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3070                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3071 }
3072 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3073
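/*
 * Mark DMAR units that cover no PCI devices as ignored.  If graphics
 * mapping is disabled, also ignore units that only cover graphics
 * devices and give those devices the dummy domain info so they bypass
 * the IOMMU.
 */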
3074 static void __init init_no_remapping_devices(void)
3075 {
3076         struct dmar_drhd_unit *drhd;
3077
3078         for_each_drhd_unit(drhd) {
3079                 if (!drhd->include_all) {
3080                         int i;
3081                         for (i = 0; i < drhd->devices_cnt; i++)
3082                                 if (drhd->devices[i] != NULL)
3083                                         break;
3084                         /* ignore DMAR unit if no pci devices exist */
3085                         if (i == drhd->devices_cnt)
3086                                 drhd->ignored = 1;
3087                 }
3088         }
3089
3090         if (dmar_map_gfx)
3091                 return;
3092
3093         for_each_drhd_unit(drhd) {
3094                 int i;
3095                 if (drhd->ignored || drhd->include_all)
3096                         continue;
3097
3098                 for (i = 0; i < drhd->devices_cnt; i++)
3099                         if (drhd->devices[i] &&
3100                                 !IS_GFX_DEVICE(drhd->devices[i]))
3101                                 break;
3102
3103                 if (i < drhd->devices_cnt)
3104                         continue;
3105
3106                 /* bypass IOMMU if it is just for gfx devices */
3107                 drhd->ignored = 1;
3108                 for (i = 0; i < drhd->devices_cnt; i++) {
3109                         if (!drhd->devices[i])
3110                                 continue;
3111                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3112                 }
3113         }
3114 }
3115
3116 #ifdef CONFIG_SUSPEND
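/*
 * Reprogram the remapping hardware after resume: re-enable queued
 * invalidation, restore the root entry, perform global context-cache
 * and IOTLB flushes, re-enable translation and disable the protected
 * memory regions.
 */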
3117 static int init_iommu_hw(void)
3118 {
3119         struct dmar_drhd_unit *drhd;
3120         struct intel_iommu *iommu = NULL;
3121
3122         for_each_active_iommu(iommu, drhd)
3123                 if (iommu->qi)
3124                         dmar_reenable_qi(iommu);
3125
3126         for_each_iommu(iommu, drhd) {
3127                 if (drhd->ignored) {
3128                         /*
3129                          * we always have to disable PMRs or DMA may fail on
3130                          * this device
3131                          */
3132                         if (force_on)
3133                                 iommu_disable_protect_mem_regions(iommu);
3134                         continue;
3135                 }
3136
3137                 iommu_flush_write_buffer(iommu);
3138
3139                 iommu_set_root_entry(iommu);
3140
3141                 iommu->flush.flush_context(iommu, 0, 0, 0,
3142                                            DMA_CCMD_GLOBAL_INVL);
3143                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3144                                          DMA_TLB_GLOBAL_FLUSH);
3145                 if (iommu_enable_translation(iommu))
3146                         return 1;
3147                 iommu_disable_protect_mem_regions(iommu);
3148         }
3149
3150         return 0;
3151 }
3152
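/* Globally invalidate the context cache and IOTLB of every active IOMMU. */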
3153 static void iommu_flush_all(void)
3154 {
3155         struct dmar_drhd_unit *drhd;
3156         struct intel_iommu *iommu;
3157
3158         for_each_active_iommu(iommu, drhd) {
3159                 iommu->flush.flush_context(iommu, 0, 0, 0,
3160                                            DMA_CCMD_GLOBAL_INVL);
3161                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3162                                          DMA_TLB_GLOBAL_FLUSH);
3163         }
3164 }
3165
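/*
 * Syscore suspend hook: flush all caches, disable translation, and save
 * each active IOMMU's fault-event registers so they can be restored on
 * resume.
 */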
3166 static int iommu_suspend(void)
3167 {
3168         struct dmar_drhd_unit *drhd;
3169         struct intel_iommu *iommu = NULL;
3170         unsigned long flag;
3171
3172         for_each_active_iommu(iommu, drhd) {
3173                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3174                                                  GFP_ATOMIC);
3175                 if (!iommu->iommu_state)
3176                         goto nomem;
3177         }
3178
3179         iommu_flush_all();
3180
3181         for_each_active_iommu(iommu, drhd) {
3182                 iommu_disable_translation(iommu);
3183
3184                 spin_lock_irqsave(&iommu->register_lock, flag);
3185
3186                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3187                         readl(iommu->reg + DMAR_FECTL_REG);
3188                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3189                         readl(iommu->reg + DMAR_FEDATA_REG);
3190                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3191                         readl(iommu->reg + DMAR_FEADDR_REG);
3192                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3193                         readl(iommu->reg + DMAR_FEUADDR_REG);
3194
3195                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3196         }
3197         return 0;
3198
3199 nomem:
3200         for_each_active_iommu(iommu, drhd)
3201                 kfree(iommu->iommu_state);
3202
3203         return -ENOMEM;
3204 }
3205
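/*
 * Syscore resume hook: bring the remapping hardware back up via
 * init_iommu_hw() and restore the fault-event registers saved at
 * suspend time.
 */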
3206 static void iommu_resume(void)
3207 {
3208         struct dmar_drhd_unit *drhd;
3209         struct intel_iommu *iommu = NULL;
3210         unsigned long flag;
3211
3212         if (init_iommu_hw()) {
3213                 if (force_on)
3214                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3215                 else
3216                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3217                 return;
3218         }
3219
3220         for_each_active_iommu(iommu, drhd) {
3221
3222                 spin_lock_irqsave(&iommu->register_lock, flag);
3223
3224                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3225                         iommu->reg + DMAR_FECTL_REG);
3226                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3227                         iommu->reg + DMAR_FEDATA_REG);
3228                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3229                         iommu->reg + DMAR_FEADDR_REG);
3230                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3231                         iommu->reg + DMAR_FEUADDR_REG);
3232
3233                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3234         }
3235
3236         for_each_active_iommu(iommu, drhd)
3237                 kfree(iommu->iommu_state);
3238 }
3239
3240 static struct syscore_ops iommu_syscore_ops = {
3241         .resume         = iommu_resume,
3242         .suspend        = iommu_suspend,
3243 };
3244
3245 static void __init init_iommu_pm_ops(void)
3246 {
3247         register_syscore_ops(&iommu_syscore_ops);
3248 }
3249
3250 #else
3251 static inline void init_iommu_pm_ops(void) {}
3252 #endif  /* CONFIG_SUSPEND */
3253
3254 /*
3255  * Here we only respond to a device being unbound from its driver.
3256  *
3257  * A newly added device is not attached to its DMAR domain here yet; that
3258  * happens when the device is first mapped to an iova.
3259  */
3260 static int device_notifier(struct notifier_block *nb,
3261                                   unsigned long action, void *data)
3262 {
3263         struct device *dev = data;
3264         struct pci_dev *pdev = to_pci_dev(dev);
3265         struct dmar_domain *domain;
3266
3267         if (iommu_no_mapping(dev))
3268                 return 0;
3269
3270         domain = find_domain(pdev);
3271         if (!domain)
3272                 return 0;
3273
3274         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3275                 domain_remove_one_dev_info(domain, pdev);
3276
3277                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3278                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3279                     list_empty(&domain->devices))
3280                         domain_exit(domain);
3281         }
3282
3283         return 0;
3284 }
3285
3286 static struct notifier_block device_nb = {
3287         .notifier_call = device_notifier,
3288 };
3289
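/*
 * Main VT-d initialization entry point: parse the DMAR table and device
 * scopes, set up the remapping hardware and domains via init_dmars(),
 * then install intel_dma_ops and register the IOMMU API operations and
 * the PCI bus notifier.
 */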
3290 int __init intel_iommu_init(void)
3291 {
3292         int ret = 0;
3293
3294         /* VT-d is required for a TXT/tboot launch, so enforce that */
3295         force_on = tboot_force_iommu();
3296
3297         if (dmar_table_init()) {
3298                 if (force_on)
3299                         panic("tboot: Failed to initialize DMAR table\n");
3300                 return  -ENODEV;
3301         }
3302
3303         if (dmar_dev_scope_init()) {
3304                 if (force_on)
3305                         panic("tboot: Failed to initialize DMAR device scope\n");
3306                 return  -ENODEV;
3307         }
3308
3309         /*
3310          * Check the need for DMA-remapping initialization now.
3311          * The initialization above is also used by interrupt remapping.
3312          */
3313         if (no_iommu || dmar_disabled)
3314                 return -ENODEV;
3315
3316         if (iommu_init_mempool()) {
3317                 if (force_on)
3318                         panic("tboot: Failed to initialize iommu memory\n");
3319                 return  -ENODEV;
3320         }
3321
3322         if (dmar_init_reserved_ranges()) {
3323                 if (force_on)
3324                         panic("tboot: Failed to reserve iommu ranges\n");
3325                 return  -ENODEV;
3326         }
3327
3328         init_no_remapping_devices();
3329
3330         ret = init_dmars();
3331         if (ret) {
3332                 if (force_on)
3333                         panic("tboot: Failed to initialize DMARs\n");
3334                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3335                 put_iova_domain(&reserved_iova_list);
3336                 iommu_exit_mempool();
3337                 return ret;
3338         }
3339         printk(KERN_INFO
3340         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3341
3342         init_timer(&unmap_timer);
3343 #ifdef CONFIG_SWIOTLB
3344         swiotlb = 0;
3345 #endif
3346         dma_ops = &intel_dma_ops;
3347
3348         init_iommu_pm_ops();
3349
3350         register_iommu(&intel_iommu_ops);
3351
3352         bus_register_notifier(&pci_bus_type, &device_nb);
3353
3354         return 0;
3355 }
3356
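/*
 * Requests from a device behind a PCIe-to-PCI bridge may carry the
 * bridge's source-id, so context entries were also set up for the
 * bridges on the path; clear those as well when detaching the device.
 */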
3357 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3358                                            struct pci_dev *pdev)
3359 {
3360         struct pci_dev *tmp, *parent;
3361
3362         if (!iommu || !pdev)
3363                 return;
3364
3365         /* dependent device detach */
3366         tmp = pci_find_upstream_pcie_bridge(pdev);
3367         /* Secondary interface's bus number and devfn 0 */
3368         if (tmp) {
3369                 parent = pdev->bus->self;
3370                 while (parent != tmp) {
3371                         iommu_detach_dev(iommu, parent->bus->number,
3372                                          parent->devfn);
3373                         parent = parent->bus->self;
3374                 }
3375                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3376                         iommu_detach_dev(iommu,
3377                                 tmp->subordinate->number, 0);
3378                 else /* this is a legacy PCI bridge */
3379                         iommu_detach_dev(iommu, tmp->bus->number,
3380                                          tmp->devfn);
3381         }
3382 }
3383
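/*
 * Detach a single device from a domain: unlink its device_domain_info,
 * clear its context entry (and those of dependent bridges), and if no
 * other device on the same IOMMU remains in the domain, release that
 * IOMMU's domain id and recompute the domain's capabilities.
 */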
3384 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3385                                           struct pci_dev *pdev)
3386 {
3387         struct device_domain_info *info;
3388         struct intel_iommu *iommu;
3389         unsigned long flags;
3390         int found = 0;
3391         struct list_head *entry, *tmp;
3392
3393         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3394                                 pdev->devfn);
3395         if (!iommu)
3396                 return;
3397
3398         spin_lock_irqsave(&device_domain_lock, flags);
3399         list_for_each_safe(entry, tmp, &domain->devices) {
3400                 info = list_entry(entry, struct device_domain_info, link);
3401                 /* No need to compare PCI domain; it has to be the same */
3402                 if (info->bus == pdev->bus->number &&
3403                     info->devfn == pdev->devfn) {
3404                         list_del(&info->link);
3405                         list_del(&info->global);
3406                         if (info->dev)
3407                                 info->dev->dev.archdata.iommu = NULL;
3408                         spin_unlock_irqrestore(&device_domain_lock, flags);
3409
3410                         iommu_disable_dev_iotlb(info);
3411                         iommu_detach_dev(iommu, info->bus, info->devfn);
3412                         iommu_detach_dependent_devices(iommu, pdev);
3413                         free_devinfo_mem(info);
3414
3415                         spin_lock_irqsave(&device_domain_lock, flags);
3416
3417                         if (found)
3418                                 break;
3419                         else
3420                                 continue;
3421                 }
3422
3423                 /* if there are no other devices under the same iommu
3424                  * owned by this domain, clear this iommu in iommu_bmp,
3425                  * and update the iommu count and coherency
3426                  */
3427                 if (iommu == device_to_iommu(info->segment, info->bus,
3428                                             info->devfn))
3429                         found = 1;
3430         }
3431
3432         if (found == 0) {
3433                 unsigned long tmp_flags;
3434                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3435                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3436                 domain->iommu_count--;
3437                 domain_update_iommu_cap(domain);
3438                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3439
3440                 spin_lock_irqsave(&iommu->lock, tmp_flags);
3441                 clear_bit(domain->id, iommu->domain_ids);
3442                 iommu->domains[domain->id] = NULL;
3443                 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3444         }
3445
3446         spin_unlock_irqrestore(&device_domain_lock, flags);
3447 }
3448
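/*
 * Detach every device from a virtual-machine domain, updating the
 * domain's iommu bitmap, count and capabilities as devices go away.
 */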
3449 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3450 {
3451         struct device_domain_info *info;
3452         struct intel_iommu *iommu;
3453         unsigned long flags1, flags2;
3454
3455         spin_lock_irqsave(&device_domain_lock, flags1);
3456         while (!list_empty(&domain->devices)) {
3457                 info = list_entry(domain->devices.next,
3458                         struct device_domain_info, link);
3459                 list_del(&info->link);
3460                 list_del(&info->global);
3461                 if (info->dev)
3462                         info->dev->dev.archdata.iommu = NULL;
3463
3464                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3465
3466                 iommu_disable_dev_iotlb(info);
3467                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3468                 iommu_detach_dev(iommu, info->bus, info->devfn);
3469                 iommu_detach_dependent_devices(iommu, info->dev);
3470
3471                 /* clear this iommu in iommu_bmp, update iommu count
3472                  * and capabilities
3473                  */
3474                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3475                 if (test_and_clear_bit(iommu->seq_id,
3476                                        &domain->iommu_bmp)) {
3477                         domain->iommu_count--;
3478                         domain_update_iommu_cap(domain);
3479                 }
3480                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3481
3482                 free_devinfo_mem(info);
3483                 spin_lock_irqsave(&device_domain_lock, flags1);
3484         }
3485         spin_unlock_irqrestore(&device_domain_lock, flags1);
3486 }
3487
3488 /* domain id for virtual machine; it won't be set in a context entry */
3489 static unsigned long vm_domid;
3490
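/*
 * Allocate a bare virtual-machine domain: it gets an id from the
 * separate vm_domid space and has no IOMMU attached until a device is
 * added to it.
 */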
3491 static struct dmar_domain *iommu_alloc_vm_domain(void)
3492 {
3493         struct dmar_domain *domain;
3494
3495         domain = alloc_domain_mem();
3496         if (!domain)
3497                 return NULL;
3498
3499         domain->id = vm_domid++;
3500         domain->nid = -1;
3501         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3502         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3503
3504         return domain;
3505 }
3506
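/*
 * Initialize a virtual-machine domain for the given guest address
 * width: set up its iova allocator and reserved ranges, derive the
 * adjusted guest address width (agaw) and allocate the top-level page
 * directory.
 */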
3507 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3508 {
3509         int adjust_width;
3510
3511         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3512         spin_lock_init(&domain->iommu_lock);
3513
3514         domain_reserve_special_ranges(domain);
3515
3516         /* calculate AGAW */
3517         domain->gaw = guest_width;
3518         adjust_width = guestwidth_to_adjustwidth(guest_width);
3519         domain->agaw = width_to_agaw(adjust_width);
3520
3521         INIT_LIST_HEAD(&domain->devices);
3522
3523         domain->iommu_count = 0;
3524         domain->iommu_coherency = 0;
3525         domain->iommu_snooping = 0;
3526         domain->max_addr = 0;
3527         domain->nid = -1;
3528
3529         /* always allocate the top pgd */
3530         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3531         if (!domain->pgd)
3532                 return -ENOMEM;
3533         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3534         return 0;
3535 }
3536
3537 static void iommu_free_vm_domain(struct dmar_domain *domain)
3538 {
3539         unsigned long flags;
3540         struct dmar_drhd_unit *drhd;
3541         struct intel_iommu *iommu;
3542         unsigned long i;
3543         unsigned long ndomains;
3544
3545         for_each_drhd_unit(drhd) {
3546                 if (drhd->ignored)
3547                         continue;
3548                 iommu = drhd->iommu;
3549
3550                 ndomains = cap_ndoms(iommu->cap);
3551                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3552                         if (iommu->domains[i] == domain) {
3553                                 spin_lock_irqsave(&iommu->lock, flags);
3554                                 clear_bit(i, iommu->domain_ids);
3555                                 iommu->domains[i] = NULL;
3556                                 spin_unlock_irqrestore(&iommu->lock, flags);
3557                                 break;
3558                         }
3559                 }
3560         }
3561 }
3562
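/*
 * Tear down a virtual-machine domain: remove all devices, destroy its
 * iovas, clear and free its page tables and release its ids on every
 * IOMMU.
 */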
3563 static void vm_domain_exit(struct dmar_domain *domain)
3564 {
3565         /* Domain 0 is reserved, so don't process it */
3566         if (!domain)
3567                 return;
3568
3569         vm_domain_remove_all_dev_info(domain);
3570         /* destroy iovas */
3571         put_iova_domain(&domain->iovad);
3572
3573         /* clear ptes */
3574         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3575
3576         /* free page tables */
3577         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3578
3579         iommu_free_vm_domain(domain);
3580         free_domain_mem(domain);
3581 }
3582
3583 static int intel_iommu_domain_init(struct iommu_domain *domain)
3584 {
3585         struct dmar_domain *dmar_domain;
3586
3587         dmar_domain = iommu_alloc_vm_domain();
3588         if (!dmar_domain) {
3589                 printk(KERN_ERR
3590                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3591                 return -ENOMEM;
3592         }
3593         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3594                 printk(KERN_ERR
3595                         "intel_iommu_domain_init() failed\n");
3596                 vm_domain_exit(dmar_domain);
3597                 return -ENOMEM;
3598         }
3599         domain->priv = dmar_domain;
3600
3601         return 0;
3602 }
3603
3604 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3605 {
3606         struct dmar_domain *dmar_domain = domain->priv;
3607
3608         domain->priv = NULL;
3609         vm_domain_exit(dmar_domain);
3610 }
3611
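/*
 * IOMMU API attach: if the device already has a context mapping, detach
 * it from its old domain first.  Then verify that this IOMMU's address
 * width covers the domain's highest mapped address, trimming unused
 * upper page-table levels if the domain was created wider than the
 * IOMMU supports, before adding the device to the domain.
 */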
3612 static int intel_iommu_attach_device(struct iommu_domain *domain,
3613                                      struct device *dev)
3614 {
3615         struct dmar_domain *dmar_domain = domain->priv;
3616         struct pci_dev *pdev = to_pci_dev(dev);
3617         struct intel_iommu *iommu;
3618         int addr_width;
3619
3620         /* normally pdev is not mapped */
3621         if (unlikely(domain_context_mapped(pdev))) {
3622                 struct dmar_domain *old_domain;
3623
3624                 old_domain = find_domain(pdev);
3625                 if (old_domain) {
3626                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3627                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3628                                 domain_remove_one_dev_info(old_domain, pdev);
3629                         else
3630                                 domain_remove_dev_info(old_domain);
3631                 }
3632         }
3633
3634         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3635                                 pdev->devfn);
3636         if (!iommu)
3637                 return -ENODEV;
3638
3639         /* check if this iommu agaw is sufficient for max mapped address */
3640         addr_width = agaw_to_width(iommu->agaw);
3641         if (addr_width > cap_mgaw(iommu->cap))
3642                 addr_width = cap_mgaw(iommu->cap);
3643
3644         if (dmar_domain->max_addr > (1LL << addr_width)) {
3645                 printk(KERN_ERR "%s: iommu width (%d) is not "
3646                        "sufficient for the mapped address (%llx)\n",
3647                        __func__, addr_width, dmar_domain->max_addr);
3648                 return -EFAULT;
3649         }
3650         dmar_domain->gaw = addr_width;
3651
3652         /*
3653          * Knock out extra levels of page tables if necessary
3654          */
3655         while (iommu->agaw < dmar_domain->agaw) {
3656                 struct dma_pte *pte;
3657
3658                 pte = dmar_domain->pgd;
3659                 if (dma_pte_present(pte)) {
3660                         dmar_domain->pgd = (struct dma_pte *)
3661                                 phys_to_virt(dma_pte_addr(pte));
3662                         free_pgtable_page(pte);
3663                 }
3664                 dmar_domain->agaw--;
3665         }
3666
3667         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3668 }
3669
3670 static void intel_iommu_detach_device(struct iommu_domain *domain,
3671                                       struct device *dev)
3672 {
3673         struct dmar_domain *dmar_domain = domain->priv;
3674         struct pci_dev *pdev = to_pci_dev(dev);
3675
3676         domain_remove_one_dev_info(dmar_domain, pdev);
3677 }
3678
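/*
 * IOMMU API map: convert IOMMU_READ/WRITE/CACHE into VT-d PTE bits,
 * check that the mapping fits within the domain's guest address width,
 * and install PTEs for the (iova, hpa) range.
 */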
3679 static int intel_iommu_map(struct iommu_domain *domain,
3680                            unsigned long iova, phys_addr_t hpa,
3681                            int gfp_order, int iommu_prot)
3682 {
3683         struct dmar_domain *dmar_domain = domain->priv;
3684         u64 max_addr;
3685         int prot = 0;
3686         size_t size;
3687         int ret;
3688
3689         if (iommu_prot & IOMMU_READ)
3690                 prot |= DMA_PTE_READ;
3691         if (iommu_prot & IOMMU_WRITE)
3692                 prot |= DMA_PTE_WRITE;
3693         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3694                 prot |= DMA_PTE_SNP;
3695
3696         size     = PAGE_SIZE << gfp_order;
3697         max_addr = iova + size;
3698         if (dmar_domain->max_addr < max_addr) {
3699                 u64 end;
3700
3701                 /* check if minimum agaw is sufficient for mapped address */
3702                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3703                 if (end < max_addr) {
3704                         printk(KERN_ERR "%s: iommu width (%d) is not "
3705                                "sufficient for the mapped address (%llx)\n",
3706                                __func__, dmar_domain->gaw, max_addr);
3707                         return -EFAULT;
3708                 }
3709                 dmar_domain->max_addr = max_addr;
3710         }
3711         /* Round up size to next multiple of PAGE_SIZE, if it and
3712            the low bits of hpa would take us onto the next page */
3713         size = aligned_nrpages(hpa, size);
3714         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3715                                  hpa >> VTD_PAGE_SHIFT, size, prot);
3716         return ret;
3717 }
3718
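/*
 * IOMMU API unmap: clear the PTEs covering the requested range and pull
 * max_addr back down if the range was at the top of the domain.
 */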
3719 static int intel_iommu_unmap(struct iommu_domain *domain,
3720                              unsigned long iova, int gfp_order)
3721 {
3722         struct dmar_domain *dmar_domain = domain->priv;
3723         size_t size = PAGE_SIZE << gfp_order;
3724
3725         dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3726                             (iova + size - 1) >> VTD_PAGE_SHIFT);
3727
3728         if (dmar_domain->max_addr == iova + size)
3729                 dmar_domain->max_addr = iova;
3730
3731         return gfp_order;
3732 }
3733
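/* IOMMU API: return the physical address an iova maps to, or 0 if unmapped. */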
3734 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3735                                             unsigned long iova)
3736 {
3737         struct dmar_domain *dmar_domain = domain->priv;
3738         struct dma_pte *pte;
3739         u64 phys = 0;
3740
3741         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3742         if (pte)
3743                 phys = dma_pte_addr(pte);
3744
3745         return phys;
3746 }
3747
3748 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3749                                       unsigned long cap)
3750 {
3751         struct dmar_domain *dmar_domain = domain->priv;
3752
3753         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3754                 return dmar_domain->iommu_snooping;
3755         if (cap == IOMMU_CAP_INTR_REMAP)
3756                 return intr_remapping_enabled;
3757
3758         return 0;
3759 }
3760
3761 static struct iommu_ops intel_iommu_ops = {
3762         .domain_init    = intel_iommu_domain_init,
3763         .domain_destroy = intel_iommu_domain_destroy,
3764         .attach_dev     = intel_iommu_attach_device,
3765         .detach_dev     = intel_iommu_detach_device,
3766         .map            = intel_iommu_map,
3767         .unmap          = intel_iommu_unmap,
3768         .iova_to_phys   = intel_iommu_iova_to_phys,
3769         .domain_has_cap = intel_iommu_domain_has_cap,
3770 };
3771
3772 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3773 {
3774         /*
3775          * Mobile 4 Series Chipset neglects to set RWBF capability,
3776          * but needs it:
3777          */
3778         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3779         rwbf_quirk = 1;
3780
3781         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
3782         if (dev->revision == 0x07) {
3783                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
3784                 dmar_map_gfx = 0;
3785         }
3786 }
3787
3788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3789
3790 #define GGC 0x52
3791 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
3792 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
3793 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
3794 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
3795 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
3796 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
3797 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
3798 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
3799
3800 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
3801 {
3802         unsigned short ggc;
3803
3804         if (pci_read_config_word(dev, GGC, &ggc))
3805                 return;
3806
3807         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
3808                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
3809                 dmar_map_gfx = 0;
3810         }
3811 }
3812 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
3813 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
3814 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
3815 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
3816
3817 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3818    ISOCH DMAR unit for the Azalia sound device, but not give it any
3819    TLB entries, which causes it to deadlock. Check for that.  We do
3820    this in a function called from init_dmars(), instead of in a PCI
3821    quirk, because we don't want to print the obnoxious "BIOS broken"
3822    message if VT-d is actually disabled.
3823 */
3824 static void __init check_tylersburg_isoch(void)
3825 {
3826         struct pci_dev *pdev;
3827         uint32_t vtisochctrl;
3828
3829         /* If there's no Azalia in the system anyway, forget it. */
3830         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3831         if (!pdev)
3832                 return;
3833         pci_dev_put(pdev);
3834
3835         /* System Management Registers. Might be hidden, in which case
3836            we can't do the sanity check. But that's OK, because the
3837            known-broken BIOSes _don't_ actually hide it, so far. */
3838         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3839         if (!pdev)
3840                 return;
3841
3842         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3843                 pci_dev_put(pdev);
3844                 return;
3845         }
3846
3847         pci_dev_put(pdev);
3848
3849         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3850         if (vtisochctrl & 1)
3851                 return;
3852
3853         /* Drop all bits other than the number of TLB entries */
3854         vtisochctrl &= 0x1c;
3855
3856         /* If we have the recommended number of TLB entries (16), fine. */
3857         if (vtisochctrl == 0x10)
3858                 return;
3859
3860         /* Zero TLB entries? You get to ride the short bus to school. */
3861         if (!vtisochctrl) {
3862                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3863                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3864                      dmi_get_system_info(DMI_BIOS_VENDOR),
3865                      dmi_get_system_info(DMI_BIOS_VERSION),
3866                      dmi_get_system_info(DMI_PRODUCT_VERSION));
3867                 iommu_identity_mapping |= IDENTMAP_AZALIA;
3868                 return;
3869         }
3870
3871         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3872                vtisochctrl);
3873 }