[linux-2.6.git] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
45
46 #define ROOT_SIZE               VTD_PAGE_SIZE
47 #define CONTEXT_SIZE            VTD_PAGE_SIZE
48
49 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
50 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
51 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
52
53 #define IOAPIC_RANGE_START      (0xfee00000)
54 #define IOAPIC_RANGE_END        (0xfeefffff)
55 #define IOVA_START_ADDR         (0x1000)
56
57 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
58
59 #define MAX_AGAW_WIDTH 64
60
61 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
62 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
63
64 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
65    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
66 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
67                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
68 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
69
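/*
 * Worked example (editor's note, not in the original source): with the
 * default 48-bit guest address width and 4KiB VT-d pages,
 * __DOMAIN_MAX_PFN(48) = 2^(48-12) - 1 = 0xFFFFFFFFF and
 * DOMAIN_MAX_ADDR(48) = 0xFFFFFFFFF000.  On a 32-bit kernel,
 * DOMAIN_MAX_PFN() clamps the pfn to ULONG_MAX so it still fits in an
 * unsigned long.
 */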
70 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
71 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
72 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
73
74
75 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
76    are never going to work. */
77 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
78 {
79         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
80 }
81
82 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
83 {
84         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
85 }
86 static inline unsigned long page_to_dma_pfn(struct page *pg)
87 {
88         return mm_to_dma_pfn(page_to_pfn(pg));
89 }
90 static inline unsigned long virt_to_dma_pfn(void *p)
91 {
92         return page_to_dma_pfn(virt_to_page(p));
93 }
94
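/*
 * Worked example (editor's note): VTD_PAGE_SHIFT is 12, so on x86 with
 * 4KiB MM pages the shift (PAGE_SHIFT - VTD_PAGE_SHIFT) is zero and DMA
 * pfns equal MM pfns.  On a configuration with 16KiB MM pages one MM pfn
 * would span four 4KiB VT-d pfns, which is why VT-d pages must never be
 * larger than MM pages.
 */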
95 /* global iommu list, set NULL for ignored DMAR units */
96 static struct intel_iommu **g_iommus;
97
98 static void __init check_tylersburg_isoch(void);
99 static int rwbf_quirk;
100
101 /*
102  * 0: Present
103  * 1-11: Reserved
104  * 12-63: Context Ptr (12 - (haw-1))
105  * 64-127: Reserved
106  */
107 struct root_entry {
108         u64     val;
109         u64     rsvd1;
110 };
111 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
112 static inline bool root_present(struct root_entry *root)
113 {
114         return (root->val & 1);
115 }
116 static inline void set_root_present(struct root_entry *root)
117 {
118         root->val |= 1;
119 }
120 static inline void set_root_value(struct root_entry *root, unsigned long value)
121 {
122         root->val |= value & VTD_PAGE_MASK;
123 }
124
125 static inline struct context_entry *
126 get_context_addr_from_root(struct root_entry *root)
127 {
128         return (struct context_entry *)
129                 (root_present(root)?phys_to_virt(
130                 root->val & VTD_PAGE_MASK) :
131                 NULL);
132 }
133
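/*
 * Worked example (editor's note): ROOT_ENTRY_NR = 4096 / 16 = 256, one
 * root entry per PCI bus.  device_to_context_entry() below indexes the
 * root table by bus number and the 4KiB context table it points to by
 * devfn, so translating a source-id to a context entry takes two table
 * lookups.
 */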
134 /*
135  * low 64 bits:
136  * 0: present
137  * 1: fault processing disable
138  * 2-3: translation type
139  * 12-63: address space root
140  * high 64 bits:
141  * 0-2: address width
142  * 3-6: aval
143  * 8-23: domain id
144  */
145 struct context_entry {
146         u64 lo;
147         u64 hi;
148 };
149
150 static inline bool context_present(struct context_entry *context)
151 {
152         return (context->lo & 1);
153 }
154 static inline void context_set_present(struct context_entry *context)
155 {
156         context->lo |= 1;
157 }
158
159 static inline void context_set_fault_enable(struct context_entry *context)
160 {
161         context->lo &= (((u64)-1) << 2) | 1;
162 }
163
164 static inline void context_set_translation_type(struct context_entry *context,
165                                                 unsigned long value)
166 {
167         context->lo &= (((u64)-1) << 4) | 3;
168         context->lo |= (value & 3) << 2;
169 }
170
171 static inline void context_set_address_root(struct context_entry *context,
172                                             unsigned long value)
173 {
174         context->lo |= value & VTD_PAGE_MASK;
175 }
176
177 static inline void context_set_address_width(struct context_entry *context,
178                                              unsigned long value)
179 {
180         context->hi |= value & 7;
181 }
182
183 static inline void context_set_domain_id(struct context_entry *context,
184                                          unsigned long value)
185 {
186         context->hi |= (value & ((1 << 16) - 1)) << 8;
187 }
188
189 static inline void context_clear_entry(struct context_entry *context)
190 {
191         context->lo = 0;
192         context->hi = 0;
193 }
194
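/*
 * Editor's illustrative sketch (hypothetical, not part of the driver):
 * how the accessors above combine to build a context entry for a device
 * using multi-level translation.  domain_context_mapping_one() below is
 * the real code path; this is only a condensed illustration.
 */
static inline void example_build_context_entry(struct context_entry *ce,
                                               unsigned long pgtable_phys,
                                               int agaw, u16 domain_id)
{
        context_clear_entry(ce);
        context_set_domain_id(ce, domain_id);           /* hi bits 8-23  */
        context_set_address_width(ce, agaw);            /* hi bits 0-2   */
        context_set_address_root(ce, pgtable_phys);     /* lo bits 12-63 */
        context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
        context_set_fault_enable(ce);                   /* clear FPD bit */
        context_set_present(ce);                        /* lo bit 0      */
}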
195 /*
196  * 0: readable
197  * 1: writable
198  * 2-6: reserved
199  * 7: super page
200  * 8-10: available
201  * 11: snoop behavior
202  * 12-63: Host physical address
203  */
204 struct dma_pte {
205         u64 val;
206 };
207
208 static inline void dma_clear_pte(struct dma_pte *pte)
209 {
210         pte->val = 0;
211 }
212
213 static inline void dma_set_pte_readable(struct dma_pte *pte)
214 {
215         pte->val |= DMA_PTE_READ;
216 }
217
218 static inline void dma_set_pte_writable(struct dma_pte *pte)
219 {
220         pte->val |= DMA_PTE_WRITE;
221 }
222
223 static inline void dma_set_pte_snp(struct dma_pte *pte)
224 {
225         pte->val |= DMA_PTE_SNP;
226 }
227
228 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
229 {
230         pte->val = (pte->val & ~3) | (prot & 3);
231 }
232
233 static inline u64 dma_pte_addr(struct dma_pte *pte)
234 {
235 #ifdef CONFIG_64BIT
236         return pte->val & VTD_PAGE_MASK;
237 #else
238         /* Must have a full atomic 64-bit read */
239         return  __cmpxchg64(pte, 0ULL, 0ULL) & VTD_PAGE_MASK;
240 #endif
241 }
242
243 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
244 {
245         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
246 }
247
248 static inline bool dma_pte_present(struct dma_pte *pte)
249 {
250         return (pte->val & 3) != 0;
251 }
252
253 static inline int first_pte_in_page(struct dma_pte *pte)
254 {
255         return !((unsigned long)pte & ~VTD_PAGE_MASK);
256 }
257
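/*
 * Editor's illustrative sketch (hypothetical, not part of the driver):
 * composing a leaf PTE with the helpers above, mapping DMA pfn 'pfn'
 * read/write.  It only shows which bits each helper touches.
 */
static inline void example_make_rw_pte(struct dma_pte *pte, unsigned long pfn)
{
        dma_clear_pte(pte);
        dma_set_pte_readable(pte);      /* bit 0 */
        dma_set_pte_writable(pte);      /* bit 1 */
        /* bit 11; only meaningful when the domain supports snooping control */
        dma_set_pte_snp(pte);
        dma_set_pte_pfn(pte, pfn);      /* bits 12-63 */
}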
258 /*
259  * This domain is a statically identity mapping domain.
260  *      1. This domain creates a static 1:1 mapping to all usable memory.
261  *      2. It maps to each iommu if successful.
262  *      3. Each iommu maps to this domain if successful.
263  */
264 static struct dmar_domain *si_domain;
265 static int hw_pass_through = 1;
266
267 /* devices under the same p2p bridge are owned in one domain */
268 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
269
270 /* domain represents a virtual machine; more than one device
271  * across iommus may be owned in one domain, e.g. kvm guest.
272  */
273 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
274
275 /* si_domain contains multiple devices */
276 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
277
278 struct dmar_domain {
279         int     id;                     /* domain id */
280         int     nid;                    /* node id */
281         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
282
283         struct list_head devices;       /* all devices' list */
284         struct iova_domain iovad;       /* iova's that belong to this domain */
285
286         struct dma_pte  *pgd;           /* virtual address */
287         int             gaw;            /* max guest address width */
288
289         /* adjusted guest address width, 0 is level 2 30-bit */
290         int             agaw;
291
292         int             flags;          /* flags to find out type of domain */
293
294         int             iommu_coherency;/* indicate coherency of iommu access */
295         int             iommu_snooping; /* indicate snooping control feature*/
296         int             iommu_count;    /* reference count of iommu */
297         spinlock_t      iommu_lock;     /* protect iommu set in domain */
298         u64             max_addr;       /* maximum mapped address */
299 };
300
301 /* PCI domain-device relationship */
302 struct device_domain_info {
303         struct list_head link;  /* link to domain siblings */
304         struct list_head global; /* link to global list */
305         int segment;            /* PCI domain */
306         u8 bus;                 /* PCI bus number */
307         u8 devfn;               /* PCI devfn number */
308         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
309         struct intel_iommu *iommu; /* IOMMU used by this device */
310         struct dmar_domain *domain; /* pointer to domain */
311 };
312
313 static void flush_unmaps_timeout(unsigned long data);
314
315 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
316
317 #define HIGH_WATER_MARK 250
318 struct deferred_flush_tables {
319         int next;
320         struct iova *iova[HIGH_WATER_MARK];
321         struct dmar_domain *domain[HIGH_WATER_MARK];
322 };
323
324 static struct deferred_flush_tables *deferred_flush;
325
326 /* number of iommus, used to size g_iommus */
327 static int g_num_of_iommus;
328
329 static DEFINE_SPINLOCK(async_umap_flush_lock);
330 static LIST_HEAD(unmaps_to_do);
331
332 static int timer_on;
333 static long list_size;
334
335 static void domain_remove_dev_info(struct dmar_domain *domain);
336
337 #ifdef CONFIG_DMAR_DEFAULT_ON
338 int dmar_disabled = 0;
339 #else
340 int dmar_disabled = 1;
341 #endif /*CONFIG_DMAR_DEFAULT_ON*/
342
343 static int dmar_map_gfx = 1;
344 static int dmar_forcedac;
345 static int intel_iommu_strict;
346
347 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
348 static DEFINE_SPINLOCK(device_domain_lock);
349 static LIST_HEAD(device_domain_list);
350
351 static struct iommu_ops intel_iommu_ops;
352
353 static int __init intel_iommu_setup(char *str)
354 {
355         if (!str)
356                 return -EINVAL;
357         while (*str) {
358                 if (!strncmp(str, "on", 2)) {
359                         dmar_disabled = 0;
360                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
361                 } else if (!strncmp(str, "off", 3)) {
362                         dmar_disabled = 1;
363                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
364                 } else if (!strncmp(str, "igfx_off", 8)) {
365                         dmar_map_gfx = 0;
366                         printk(KERN_INFO
367                                 "Intel-IOMMU: disable GFX device mapping\n");
368                 } else if (!strncmp(str, "forcedac", 8)) {
369                         printk(KERN_INFO
370                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
371                         dmar_forcedac = 1;
372                 } else if (!strncmp(str, "strict", 6)) {
373                         printk(KERN_INFO
374                                 "Intel-IOMMU: disable batched IOTLB flush\n");
375                         intel_iommu_strict = 1;
376                 }
377
378                 str += strcspn(str, ",");
379                 while (*str == ',')
380                         str++;
381         }
382         return 0;
383 }
384 __setup("intel_iommu=", intel_iommu_setup);
385
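/*
 * Example (editor's note): booting with "intel_iommu=on,igfx_off,strict"
 * is parsed by the loop above one comma-separated token at a time and
 * leaves dmar_disabled = 0, dmar_map_gfx = 0 and intel_iommu_strict = 1.
 */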
386 static struct kmem_cache *iommu_domain_cache;
387 static struct kmem_cache *iommu_devinfo_cache;
388 static struct kmem_cache *iommu_iova_cache;
389
390 static inline void *alloc_pgtable_page(int node)
391 {
392         struct page *page;
393         void *vaddr = NULL;
394
395         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
396         if (page)
397                 vaddr = page_address(page);
398         return vaddr;
399 }
400
401 static inline void free_pgtable_page(void *vaddr)
402 {
403         free_page((unsigned long)vaddr);
404 }
405
406 static inline void *alloc_domain_mem(void)
407 {
408         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
409 }
410
411 static void free_domain_mem(void *vaddr)
412 {
413         kmem_cache_free(iommu_domain_cache, vaddr);
414 }
415
416 static inline void * alloc_devinfo_mem(void)
417 {
418         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
419 }
420
421 static inline void free_devinfo_mem(void *vaddr)
422 {
423         kmem_cache_free(iommu_devinfo_cache, vaddr);
424 }
425
426 struct iova *alloc_iova_mem(void)
427 {
428         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
429 }
430
431 void free_iova_mem(struct iova *iova)
432 {
433         kmem_cache_free(iommu_iova_cache, iova);
434 }
435
436
437 static inline int width_to_agaw(int width);
438
439 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
440 {
441         unsigned long sagaw;
442         int agaw = -1;
443
444         sagaw = cap_sagaw(iommu->cap);
445         for (agaw = width_to_agaw(max_gaw);
446              agaw >= 0; agaw--) {
447                 if (test_bit(agaw, &sagaw))
448                         break;
449         }
450
451         return agaw;
452 }
453
454 /*
455  * Calculate max SAGAW for each iommu.
456  */
457 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
458 {
459         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
460 }
461
462 /*
463  * calculate agaw for each iommu.
464  * "SAGAW" may be different across iommus, use a default agaw, and
465  * get a supported less agaw for iommus that don't support the default agaw.
466  */
467 int iommu_calculate_agaw(struct intel_iommu *iommu)
468 {
469         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
470 }
471
472 /* This function only returns a single iommu in a domain */
473 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
474 {
475         int iommu_id;
476
477         /* si_domain and vm domain should not get here. */
478         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
479         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
480
481         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
482         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
483                 return NULL;
484
485         return g_iommus[iommu_id];
486 }
487
488 static void domain_update_iommu_coherency(struct dmar_domain *domain)
489 {
490         int i;
491
492         domain->iommu_coherency = 1;
493
494         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
495                 if (!ecap_coherent(g_iommus[i]->ecap)) {
496                         domain->iommu_coherency = 0;
497                         break;
498                 }
499         }
500 }
501
502 static void domain_update_iommu_snooping(struct dmar_domain *domain)
503 {
504         int i;
505
506         domain->iommu_snooping = 1;
507
508         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
509                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
510                         domain->iommu_snooping = 0;
511                         break;
512                 }
513         }
514 }
515
516 /* Some capabilities may be different across iommus */
517 static void domain_update_iommu_cap(struct dmar_domain *domain)
518 {
519         domain_update_iommu_coherency(domain);
520         domain_update_iommu_snooping(domain);
521 }
522
523 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
524 {
525         struct dmar_drhd_unit *drhd = NULL;
526         int i;
527
528         for_each_drhd_unit(drhd) {
529                 if (drhd->ignored)
530                         continue;
531                 if (segment != drhd->segment)
532                         continue;
533
534                 for (i = 0; i < drhd->devices_cnt; i++) {
535                         if (drhd->devices[i] &&
536                             drhd->devices[i]->bus->number == bus &&
537                             drhd->devices[i]->devfn == devfn)
538                                 return drhd->iommu;
539                         if (drhd->devices[i] &&
540                             drhd->devices[i]->subordinate &&
541                             drhd->devices[i]->subordinate->number <= bus &&
542                             drhd->devices[i]->subordinate->subordinate >= bus)
543                                 return drhd->iommu;
544                 }
545
546                 if (drhd->include_all)
547                         return drhd->iommu;
548         }
549
550         return NULL;
551 }
552
553 static void domain_flush_cache(struct dmar_domain *domain,
554                                void *addr, int size)
555 {
556         if (!domain->iommu_coherency)
557                 clflush_cache_range(addr, size);
558 }
559
560 /* Gets context entry for a given bus and devfn */
561 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
562                 u8 bus, u8 devfn)
563 {
564         struct root_entry *root;
565         struct context_entry *context;
566         unsigned long phy_addr;
567         unsigned long flags;
568
569         spin_lock_irqsave(&iommu->lock, flags);
570         root = &iommu->root_entry[bus];
571         context = get_context_addr_from_root(root);
572         if (!context) {
573                 context = (struct context_entry *)
574                                 alloc_pgtable_page(iommu->node);
575                 if (!context) {
576                         spin_unlock_irqrestore(&iommu->lock, flags);
577                         return NULL;
578                 }
579                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
580                 phy_addr = virt_to_phys((void *)context);
581                 set_root_value(root, phy_addr);
582                 set_root_present(root);
583                 __iommu_flush_cache(iommu, root, sizeof(*root));
584         }
585         spin_unlock_irqrestore(&iommu->lock, flags);
586         return &context[devfn];
587 }
588
589 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
590 {
591         struct root_entry *root;
592         struct context_entry *context;
593         int ret;
594         unsigned long flags;
595
596         spin_lock_irqsave(&iommu->lock, flags);
597         root = &iommu->root_entry[bus];
598         context = get_context_addr_from_root(root);
599         if (!context) {
600                 ret = 0;
601                 goto out;
602         }
603         ret = context_present(&context[devfn]);
604 out:
605         spin_unlock_irqrestore(&iommu->lock, flags);
606         return ret;
607 }
608
609 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
610 {
611         struct root_entry *root;
612         struct context_entry *context;
613         unsigned long flags;
614
615         spin_lock_irqsave(&iommu->lock, flags);
616         root = &iommu->root_entry[bus];
617         context = get_context_addr_from_root(root);
618         if (context) {
619                 context_clear_entry(&context[devfn]);
620                 __iommu_flush_cache(iommu, &context[devfn], \
621                         sizeof(*context));
622         }
623         spin_unlock_irqrestore(&iommu->lock, flags);
624 }
625
626 static void free_context_table(struct intel_iommu *iommu)
627 {
628         struct root_entry *root;
629         int i;
630         unsigned long flags;
631         struct context_entry *context;
632
633         spin_lock_irqsave(&iommu->lock, flags);
634         if (!iommu->root_entry) {
635                 goto out;
636         }
637         for (i = 0; i < ROOT_ENTRY_NR; i++) {
638                 root = &iommu->root_entry[i];
639                 context = get_context_addr_from_root(root);
640                 if (context)
641                         free_pgtable_page(context);
642         }
643         free_pgtable_page(iommu->root_entry);
644         iommu->root_entry = NULL;
645 out:
646         spin_unlock_irqrestore(&iommu->lock, flags);
647 }
648
649 /* page table handling */
650 #define LEVEL_STRIDE            (9)
651 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
652
653 static inline int agaw_to_level(int agaw)
654 {
655         return agaw + 2;
656 }
657
658 static inline int agaw_to_width(int agaw)
659 {
660         return 30 + agaw * LEVEL_STRIDE;
661
662 }
663
664 static inline int width_to_agaw(int width)
665 {
666         return (width - 30) / LEVEL_STRIDE;
667 }
668
669 static inline unsigned int level_to_offset_bits(int level)
670 {
671         return (level - 1) * LEVEL_STRIDE;
672 }
673
674 static inline int pfn_level_offset(unsigned long pfn, int level)
675 {
676         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
677 }
678
679 static inline unsigned long level_mask(int level)
680 {
681         return -1UL << level_to_offset_bits(level);
682 }
683
684 static inline unsigned long level_size(int level)
685 {
686         return 1UL << level_to_offset_bits(level);
687 }
688
689 static inline unsigned long align_to_level(unsigned long pfn, int level)
690 {
691         return (pfn + level_size(level) - 1) & level_mask(level);
692 }
693
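/*
 * Worked example (editor's note): agaw 2 corresponds to a 48-bit address
 * width (30 + 2 * 9) and a 4-level page table (agaw + 2).  For DMA pfn
 * 0x12345, pfn_level_offset() at level 1 is 0x12345 & 0x1ff = 0x145, and
 * at level 2 it is (0x12345 >> 9) & 0x1ff = 0x91.
 */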
694 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
695                                       unsigned long pfn)
696 {
697         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
698         struct dma_pte *parent, *pte = NULL;
699         int level = agaw_to_level(domain->agaw);
700         int offset;
701
702         BUG_ON(!domain->pgd);
703         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
704         parent = domain->pgd;
705
706         while (level > 0) {
707                 void *tmp_page;
708
709                 offset = pfn_level_offset(pfn, level);
710                 pte = &parent[offset];
711                 if (level == 1)
712                         break;
713
714                 if (!dma_pte_present(pte)) {
715                         uint64_t pteval;
716
717                         tmp_page = alloc_pgtable_page(domain->nid);
718
719                         if (!tmp_page)
720                                 return NULL;
721
722                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
723                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
724                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
725                                 /* Someone else set it while we were thinking; use theirs. */
726                                 free_pgtable_page(tmp_page);
727                         } else {
728                                 dma_pte_addr(pte);
729                                 domain_flush_cache(domain, pte, sizeof(*pte));
730                         }
731                 }
732                 parent = phys_to_virt(dma_pte_addr(pte));
733                 level--;
734         }
735
736         return pte;
737 }
738
739 /* return the address's pte at a specific level */
740 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
741                                          unsigned long pfn,
742                                          int level)
743 {
744         struct dma_pte *parent, *pte = NULL;
745         int total = agaw_to_level(domain->agaw);
746         int offset;
747
748         parent = domain->pgd;
749         while (level <= total) {
750                 offset = pfn_level_offset(pfn, total);
751                 pte = &parent[offset];
752                 if (level == total)
753                         return pte;
754
755                 if (!dma_pte_present(pte))
756                         break;
757                 parent = phys_to_virt(dma_pte_addr(pte));
758                 total--;
759         }
760         return NULL;
761 }
762
763 /* clear last level pte; a tlb flush should follow */
764 static void dma_pte_clear_range(struct dmar_domain *domain,
765                                 unsigned long start_pfn,
766                                 unsigned long last_pfn)
767 {
768         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
769         struct dma_pte *first_pte, *pte;
770
771         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
772         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
773         BUG_ON(start_pfn > last_pfn);
774
775         /* we don't need lock here; nobody else touches the iova range */
776         do {
777                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
778                 if (!pte) {
779                         start_pfn = align_to_level(start_pfn + 1, 2);
780                         continue;
781                 }
782                 do { 
783                         dma_clear_pte(pte);
784                         start_pfn++;
785                         pte++;
786                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
787
788                 domain_flush_cache(domain, first_pte,
789                                    (void *)pte - (void *)first_pte);
790
791         } while (start_pfn && start_pfn <= last_pfn);
792 }
793
794 /* free page table pages. last level pte should already be cleared */
795 static void dma_pte_free_pagetable(struct dmar_domain *domain,
796                                    unsigned long start_pfn,
797                                    unsigned long last_pfn)
798 {
799         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
800         struct dma_pte *first_pte, *pte;
801         int total = agaw_to_level(domain->agaw);
802         int level;
803         unsigned long tmp;
804
805         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
806         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
807         BUG_ON(start_pfn > last_pfn);
808
809         /* We don't need lock here; nobody else touches the iova range */
810         level = 2;
811         while (level <= total) {
812                 tmp = align_to_level(start_pfn, level);
813
814                 /* If we can't even clear one PTE at this level, we're done */
815                 if (tmp + level_size(level) - 1 > last_pfn)
816                         return;
817
818                 do {
819                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
820                         if (!pte) {
821                                 tmp = align_to_level(tmp + 1, level + 1);
822                                 continue;
823                         }
824                         do {
825                                 if (dma_pte_present(pte)) {
826                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
827                                         dma_clear_pte(pte);
828                                 }
829                                 pte++;
830                                 tmp += level_size(level);
831                         } while (!first_pte_in_page(pte) &&
832                                  tmp + level_size(level) - 1 <= last_pfn);
833
834                         domain_flush_cache(domain, first_pte,
835                                            (void *)pte - (void *)first_pte);
836                         
837                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
838                 level++;
839         }
840         /* free pgd */
841         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
842                 free_pgtable_page(domain->pgd);
843                 domain->pgd = NULL;
844         }
845 }
846
847 /* iommu handling */
848 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
849 {
850         struct root_entry *root;
851         unsigned long flags;
852
853         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
854         if (!root)
855                 return -ENOMEM;
856
857         __iommu_flush_cache(iommu, root, ROOT_SIZE);
858
859         spin_lock_irqsave(&iommu->lock, flags);
860         iommu->root_entry = root;
861         spin_unlock_irqrestore(&iommu->lock, flags);
862
863         return 0;
864 }
865
866 static void iommu_set_root_entry(struct intel_iommu *iommu)
867 {
868         void *addr;
869         u32 sts;
870         unsigned long flag;
871
872         addr = iommu->root_entry;
873
874         spin_lock_irqsave(&iommu->register_lock, flag);
875         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
876
877         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
878
879         /* Make sure hardware completes it */
880         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
881                       readl, (sts & DMA_GSTS_RTPS), sts);
882
883         spin_unlock_irqrestore(&iommu->register_lock, flag);
884 }
885
886 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
887 {
888         u32 val;
889         unsigned long flag;
890
891         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
892                 return;
893
894         spin_lock_irqsave(&iommu->register_lock, flag);
895         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
896
897         /* Make sure hardware completes it */
898         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
899                       readl, (!(val & DMA_GSTS_WBFS)), val);
900
901         spin_unlock_irqrestore(&iommu->register_lock, flag);
902 }
903
904 /* return value determines if we need a write buffer flush */
905 static void __iommu_flush_context(struct intel_iommu *iommu,
906                                   u16 did, u16 source_id, u8 function_mask,
907                                   u64 type)
908 {
909         u64 val = 0;
910         unsigned long flag;
911
912         switch (type) {
913         case DMA_CCMD_GLOBAL_INVL:
914                 val = DMA_CCMD_GLOBAL_INVL;
915                 break;
916         case DMA_CCMD_DOMAIN_INVL:
917                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
918                 break;
919         case DMA_CCMD_DEVICE_INVL:
920                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
921                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
922                 break;
923         default:
924                 BUG();
925         }
926         val |= DMA_CCMD_ICC;
927
928         spin_lock_irqsave(&iommu->register_lock, flag);
929         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
930
931         /* Make sure hardware completes it */
932         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
933                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
934
935         spin_unlock_irqrestore(&iommu->register_lock, flag);
936 }
937
938 /* return value determines if we need a write buffer flush */
939 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
940                                 u64 addr, unsigned int size_order, u64 type)
941 {
942         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
943         u64 val = 0, val_iva = 0;
944         unsigned long flag;
945
946         switch (type) {
947         case DMA_TLB_GLOBAL_FLUSH:
948                 /* global flush doesn't need to set IVA_REG */
949                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
950                 break;
951         case DMA_TLB_DSI_FLUSH:
952                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
953                 break;
954         case DMA_TLB_PSI_FLUSH:
955                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
956                 /* Note: always flush non-leaf currently */
957                 val_iva = size_order | addr;
958                 break;
959         default:
960                 BUG();
961         }
962         /* Note: set drain read/write */
963 #if 0
964         /*
965          * This is probably meant to be extra safe.  It looks like we can
966          * ignore it without any impact.
967          */
968         if (cap_read_drain(iommu->cap))
969                 val |= DMA_TLB_READ_DRAIN;
970 #endif
971         if (cap_write_drain(iommu->cap))
972                 val |= DMA_TLB_WRITE_DRAIN;
973
974         spin_lock_irqsave(&iommu->register_lock, flag);
975         /* Note: Only uses first TLB reg currently */
976         if (val_iva)
977                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
978         dmar_writeq(iommu->reg + tlb_offset + 8, val);
979
980         /* Make sure hardware completes it */
981         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
982                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
983
984         spin_unlock_irqrestore(&iommu->register_lock, flag);
985
986         /* check IOTLB invalidation granularity */
987         if (DMA_TLB_IAIG(val) == 0)
988                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
989         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
990                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
991                         (unsigned long long)DMA_TLB_IIRG(type),
992                         (unsigned long long)DMA_TLB_IAIG(val));
993 }
994
995 static struct device_domain_info *iommu_support_dev_iotlb(
996         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
997 {
998         int found = 0;
999         unsigned long flags;
1000         struct device_domain_info *info;
1001         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1002
1003         if (!ecap_dev_iotlb_support(iommu->ecap))
1004                 return NULL;
1005
1006         if (!iommu->qi)
1007                 return NULL;
1008
1009         spin_lock_irqsave(&device_domain_lock, flags);
1010         list_for_each_entry(info, &domain->devices, link)
1011                 if (info->bus == bus && info->devfn == devfn) {
1012                         found = 1;
1013                         break;
1014                 }
1015         spin_unlock_irqrestore(&device_domain_lock, flags);
1016
1017         if (!found || !info->dev)
1018                 return NULL;
1019
1020         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1021                 return NULL;
1022
1023         if (!dmar_find_matched_atsr_unit(info->dev))
1024                 return NULL;
1025
1026         info->iommu = iommu;
1027
1028         return info;
1029 }
1030
1031 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1032 {
1033         if (!info)
1034                 return;
1035
1036         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1037 }
1038
1039 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1040 {
1041         if (!info->dev || !pci_ats_enabled(info->dev))
1042                 return;
1043
1044         pci_disable_ats(info->dev);
1045 }
1046
1047 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1048                                   u64 addr, unsigned mask)
1049 {
1050         u16 sid, qdep;
1051         unsigned long flags;
1052         struct device_domain_info *info;
1053
1054         spin_lock_irqsave(&device_domain_lock, flags);
1055         list_for_each_entry(info, &domain->devices, link) {
1056                 if (!info->dev || !pci_ats_enabled(info->dev))
1057                         continue;
1058
1059                 sid = info->bus << 8 | info->devfn;
1060                 qdep = pci_ats_queue_depth(info->dev);
1061                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1062         }
1063         spin_unlock_irqrestore(&device_domain_lock, flags);
1064 }
1065
1066 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1067                                   unsigned long pfn, unsigned int pages, int map)
1068 {
1069         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1070         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1071
1072         BUG_ON(pages == 0);
1073
1074         /*
1075          * Fall back to domain selective flush if no PSI support or the size is
1076          * too big.
1077          * PSI requires page size to be 2 ^ x, and the base address is naturally
1078          * aligned to the size
1079          */
1080         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1081                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1082                                                 DMA_TLB_DSI_FLUSH);
1083         else
1084                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1085                                                 DMA_TLB_PSI_FLUSH);
1086
1087         /*
1088          * In caching mode, changes of pages from non-present to present require
1089          * flush. However, device IOTLB doesn't need to be flushed in this case.
1090          */
1091         if (!cap_caching_mode(iommu->cap) || !map)
1092                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1093 }
1094
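/*
 * Worked example (editor's note): flushing 5 pages rounds up to 8, so
 * mask = ilog2(8) = 3 and the PSI invalidation above covers a naturally
 * aligned 8-page region; if the mask exceeded cap_max_amask_val() the
 * code would fall back to a domain-selective flush instead.
 */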
1095 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1096 {
1097         u32 pmen;
1098         unsigned long flags;
1099
1100         spin_lock_irqsave(&iommu->register_lock, flags);
1101         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1102         pmen &= ~DMA_PMEN_EPM;
1103         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1104
1105         /* wait for the protected region status bit to clear */
1106         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1107                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1108
1109         spin_unlock_irqrestore(&iommu->register_lock, flags);
1110 }
1111
1112 static int iommu_enable_translation(struct intel_iommu *iommu)
1113 {
1114         u32 sts;
1115         unsigned long flags;
1116
1117         spin_lock_irqsave(&iommu->register_lock, flags);
1118         iommu->gcmd |= DMA_GCMD_TE;
1119         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1120
1121         /* Make sure hardware completes it */
1122         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1123                       readl, (sts & DMA_GSTS_TES), sts);
1124
1125         spin_unlock_irqrestore(&iommu->register_lock, flags);
1126         return 0;
1127 }
1128
1129 static int iommu_disable_translation(struct intel_iommu *iommu)
1130 {
1131         u32 sts;
1132         unsigned long flag;
1133
1134         spin_lock_irqsave(&iommu->register_lock, flag);
1135         iommu->gcmd &= ~DMA_GCMD_TE;
1136         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1137
1138         /* Make sure hardware completes it */
1139         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1140                       readl, (!(sts & DMA_GSTS_TES)), sts);
1141
1142         spin_unlock_irqrestore(&iommu->register_lock, flag);
1143         return 0;
1144 }
1145
1146
1147 static int iommu_init_domains(struct intel_iommu *iommu)
1148 {
1149         unsigned long ndomains;
1150         unsigned long nlongs;
1151
1152         ndomains = cap_ndoms(iommu->cap);
1153         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1154                         ndomains);
1155         nlongs = BITS_TO_LONGS(ndomains);
1156
1157         spin_lock_init(&iommu->lock);
1158
1159         /* TBD: there might be 64K domains,
1160          * consider other allocation for future chip
1161          */
1162         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1163         if (!iommu->domain_ids) {
1164                 printk(KERN_ERR "Allocating domain id array failed\n");
1165                 return -ENOMEM;
1166         }
1167         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1168                         GFP_KERNEL);
1169         if (!iommu->domains) {
1170                 printk(KERN_ERR "Allocating domain array failed\n");
1171                 return -ENOMEM;
1172         }
1173
1174         /*
1175          * If caching mode is set, then invalid translations are tagged
1176          * with domain id 0. Hence we need to pre-allocate it.
1177          */
1178         if (cap_caching_mode(iommu->cap))
1179                 set_bit(0, iommu->domain_ids);
1180         return 0;
1181 }
1182
1183
1184 static void domain_exit(struct dmar_domain *domain);
1185 static void vm_domain_exit(struct dmar_domain *domain);
1186
1187 void free_dmar_iommu(struct intel_iommu *iommu)
1188 {
1189         struct dmar_domain *domain;
1190         int i;
1191         unsigned long flags;
1192
1193         if ((iommu->domains) && (iommu->domain_ids)) {
1194                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1195                         domain = iommu->domains[i];
1196                         clear_bit(i, iommu->domain_ids);
1197
1198                         spin_lock_irqsave(&domain->iommu_lock, flags);
1199                         if (--domain->iommu_count == 0) {
1200                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1201                                         vm_domain_exit(domain);
1202                                 else
1203                                         domain_exit(domain);
1204                         }
1205                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1206                 }
1207         }
1208
1209         if (iommu->gcmd & DMA_GCMD_TE)
1210                 iommu_disable_translation(iommu);
1211
1212         if (iommu->irq) {
1213                 set_irq_data(iommu->irq, NULL);
1214                 /* This will mask the irq */
1215                 free_irq(iommu->irq, iommu);
1216                 destroy_irq(iommu->irq);
1217         }
1218
1219         kfree(iommu->domains);
1220         kfree(iommu->domain_ids);
1221
1222         g_iommus[iommu->seq_id] = NULL;
1223
1224         /* if all iommus are freed, free g_iommus */
1225         for (i = 0; i < g_num_of_iommus; i++) {
1226                 if (g_iommus[i])
1227                         break;
1228         }
1229
1230         if (i == g_num_of_iommus)
1231                 kfree(g_iommus);
1232
1233         /* free context mapping */
1234         free_context_table(iommu);
1235 }
1236
1237 static struct dmar_domain *alloc_domain(void)
1238 {
1239         struct dmar_domain *domain;
1240
1241         domain = alloc_domain_mem();
1242         if (!domain)
1243                 return NULL;
1244
1245         domain->nid = -1;
1246         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1247         domain->flags = 0;
1248
1249         return domain;
1250 }
1251
1252 static int iommu_attach_domain(struct dmar_domain *domain,
1253                                struct intel_iommu *iommu)
1254 {
1255         int num;
1256         unsigned long ndomains;
1257         unsigned long flags;
1258
1259         ndomains = cap_ndoms(iommu->cap);
1260
1261         spin_lock_irqsave(&iommu->lock, flags);
1262
1263         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1264         if (num >= ndomains) {
1265                 spin_unlock_irqrestore(&iommu->lock, flags);
1266                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1267                 return -ENOMEM;
1268         }
1269
1270         domain->id = num;
1271         set_bit(num, iommu->domain_ids);
1272         set_bit(iommu->seq_id, &domain->iommu_bmp);
1273         iommu->domains[num] = domain;
1274         spin_unlock_irqrestore(&iommu->lock, flags);
1275
1276         return 0;
1277 }
1278
1279 static void iommu_detach_domain(struct dmar_domain *domain,
1280                                 struct intel_iommu *iommu)
1281 {
1282         unsigned long flags;
1283         int num, ndomains;
1284         int found = 0;
1285
1286         spin_lock_irqsave(&iommu->lock, flags);
1287         ndomains = cap_ndoms(iommu->cap);
1288         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1289                 if (iommu->domains[num] == domain) {
1290                         found = 1;
1291                         break;
1292                 }
1293         }
1294
1295         if (found) {
1296                 clear_bit(num, iommu->domain_ids);
1297                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1298                 iommu->domains[num] = NULL;
1299         }
1300         spin_unlock_irqrestore(&iommu->lock, flags);
1301 }
1302
1303 static struct iova_domain reserved_iova_list;
1304 static struct lock_class_key reserved_rbtree_key;
1305
1306 static void dmar_init_reserved_ranges(void)
1307 {
1308         struct pci_dev *pdev = NULL;
1309         struct iova *iova;
1310         int i;
1311
1312         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1313
1314         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1315                 &reserved_rbtree_key);
1316
1317         /* IOAPIC ranges shouldn't be accessed by DMA */
1318         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1319                 IOVA_PFN(IOAPIC_RANGE_END));
1320         if (!iova)
1321                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1322
1323         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1324         for_each_pci_dev(pdev) {
1325                 struct resource *r;
1326
1327                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1328                         r = &pdev->resource[i];
1329                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1330                                 continue;
1331                         iova = reserve_iova(&reserved_iova_list,
1332                                             IOVA_PFN(r->start),
1333                                             IOVA_PFN(r->end));
1334                         if (!iova)
1335                                 printk(KERN_ERR "Reserve iova failed\n");
1336                 }
1337         }
1338
1339 }
1340
1341 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1342 {
1343         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1344 }
1345
1346 static inline int guestwidth_to_adjustwidth(int gaw)
1347 {
1348         int agaw;
1349         int r = (gaw - 12) % 9;
1350
1351         if (r == 0)
1352                 agaw = gaw;
1353         else
1354                 agaw = gaw + 9 - r;
1355         if (agaw > 64)
1356                 agaw = 64;
1357         return agaw;
1358 }
1359
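/*
 * Worked example (editor's note): gaw = 48 divides evenly
 * ((48 - 12) % 9 == 0) so the adjusted width stays 48; gaw = 40 gives
 * r = 1 and is rounded up to 48; anything above 64 is clamped to 64.
 */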
1360 static int domain_init(struct dmar_domain *domain, int guest_width)
1361 {
1362         struct intel_iommu *iommu;
1363         int adjust_width, agaw;
1364         unsigned long sagaw;
1365
1366         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1367         spin_lock_init(&domain->iommu_lock);
1368
1369         domain_reserve_special_ranges(domain);
1370
1371         /* calculate AGAW */
1372         iommu = domain_get_iommu(domain);
1373         if (guest_width > cap_mgaw(iommu->cap))
1374                 guest_width = cap_mgaw(iommu->cap);
1375         domain->gaw = guest_width;
1376         adjust_width = guestwidth_to_adjustwidth(guest_width);
1377         agaw = width_to_agaw(adjust_width);
1378         sagaw = cap_sagaw(iommu->cap);
1379         if (!test_bit(agaw, &sagaw)) {
1380                 /* hardware doesn't support it, choose a bigger one */
1381                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1382                 agaw = find_next_bit(&sagaw, 5, agaw);
1383                 if (agaw >= 5)
1384                         return -ENODEV;
1385         }
1386         domain->agaw = agaw;
1387         INIT_LIST_HEAD(&domain->devices);
1388
1389         if (ecap_coherent(iommu->ecap))
1390                 domain->iommu_coherency = 1;
1391         else
1392                 domain->iommu_coherency = 0;
1393
1394         if (ecap_sc_support(iommu->ecap))
1395                 domain->iommu_snooping = 1;
1396         else
1397                 domain->iommu_snooping = 0;
1398
1399         domain->iommu_count = 1;
1400         domain->nid = iommu->node;
1401
1402         /* always allocate the top pgd */
1403         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1404         if (!domain->pgd)
1405                 return -ENOMEM;
1406         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1407         return 0;
1408 }
1409
1410 static void domain_exit(struct dmar_domain *domain)
1411 {
1412         struct dmar_drhd_unit *drhd;
1413         struct intel_iommu *iommu;
1414
1415         /* Domain 0 is reserved, so don't process it */
1416         if (!domain)
1417                 return;
1418
1419         domain_remove_dev_info(domain);
1420         /* destroy iovas */
1421         put_iova_domain(&domain->iovad);
1422
1423         /* clear ptes */
1424         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1425
1426         /* free page tables */
1427         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1428
1429         for_each_active_iommu(iommu, drhd)
1430                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1431                         iommu_detach_domain(domain, iommu);
1432
1433         free_domain_mem(domain);
1434 }
1435
1436 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1437                                  u8 bus, u8 devfn, int translation)
1438 {
1439         struct context_entry *context;
1440         unsigned long flags;
1441         struct intel_iommu *iommu;
1442         struct dma_pte *pgd;
1443         unsigned long num;
1444         unsigned long ndomains;
1445         int id;
1446         int agaw;
1447         struct device_domain_info *info = NULL;
1448
1449         pr_debug("Set context mapping for %02x:%02x.%d\n",
1450                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1451
1452         BUG_ON(!domain->pgd);
1453         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1454                translation != CONTEXT_TT_MULTI_LEVEL);
1455
1456         iommu = device_to_iommu(segment, bus, devfn);
1457         if (!iommu)
1458                 return -ENODEV;
1459
1460         context = device_to_context_entry(iommu, bus, devfn);
1461         if (!context)
1462                 return -ENOMEM;
1463         spin_lock_irqsave(&iommu->lock, flags);
1464         if (context_present(context)) {
1465                 spin_unlock_irqrestore(&iommu->lock, flags);
1466                 return 0;
1467         }
1468
1469         id = domain->id;
1470         pgd = domain->pgd;
1471
1472         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1473             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1474                 int found = 0;
1475
1476                 /* find an available domain id for this device in iommu */
1477                 ndomains = cap_ndoms(iommu->cap);
1478                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1479                         if (iommu->domains[num] == domain) {
1480                                 id = num;
1481                                 found = 1;
1482                                 break;
1483                         }
1484                 }
1485
1486                 if (found == 0) {
1487                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1488                         if (num >= ndomains) {
1489                                 spin_unlock_irqrestore(&iommu->lock, flags);
1490                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1491                                 return -EFAULT;
1492                         }
1493
1494                         set_bit(num, iommu->domain_ids);
1495                         iommu->domains[num] = domain;
1496                         id = num;
1497                 }
1498
1499                 /* Skip top levels of page tables for
1500                  * iommus which have a smaller agaw than the default.
1501                  * Unnecessary for PT mode.
1502                  */
1503                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1504                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1505                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1506                                 if (!dma_pte_present(pgd)) {
1507                                         spin_unlock_irqrestore(&iommu->lock, flags);
1508                                         return -ENOMEM;
1509                                 }
1510                         }
1511                 }
1512         }
1513
1514         context_set_domain_id(context, id);
1515
1516         if (translation != CONTEXT_TT_PASS_THROUGH) {
1517                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1518                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1519                                      CONTEXT_TT_MULTI_LEVEL;
1520         }
1521         /*
1522          * In pass through mode, AW must be programmed to indicate the largest
1523          * AGAW value supported by hardware. And ASR is ignored by hardware.
1524          */
1525         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1526                 context_set_address_width(context, iommu->msagaw);
1527         else {
1528                 context_set_address_root(context, virt_to_phys(pgd));
1529                 context_set_address_width(context, iommu->agaw);
1530         }
1531
1532         context_set_translation_type(context, translation);
1533         context_set_fault_enable(context);
1534         context_set_present(context);
1535         domain_flush_cache(domain, context, sizeof(*context));
1536
1537         /*
1538          * It's a non-present to present mapping. If hardware doesn't cache
1539          * non-present entries we only need to flush the write-buffer. If it
1540          * _does_ cache non-present entries, then it does so in the special
1541          * domain #0, which we have to flush:
1542          */
1543         if (cap_caching_mode(iommu->cap)) {
1544                 iommu->flush.flush_context(iommu, 0,
1545                                            (((u16)bus) << 8) | devfn,
1546                                            DMA_CCMD_MASK_NOBIT,
1547                                            DMA_CCMD_DEVICE_INVL);
1548                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1549         } else {
1550                 iommu_flush_write_buffer(iommu);
1551         }
1552         iommu_enable_dev_iotlb(info);
1553         spin_unlock_irqrestore(&iommu->lock, flags);
1554
1555         spin_lock_irqsave(&domain->iommu_lock, flags);
1556         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1557                 domain->iommu_count++;
1558                 if (domain->iommu_count == 1)
1559                         domain->nid = iommu->node;
1560                 domain_update_iommu_cap(domain);
1561         }
1562         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1563         return 0;
1564 }
1565
1566 static int
1567 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1568                         int translation)
1569 {
1570         int ret;
1571         struct pci_dev *tmp, *parent;
1572
1573         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1574                                          pdev->bus->number, pdev->devfn,
1575                                          translation);
1576         if (ret)
1577                 return ret;
1578
1579         /* dependent device mapping */
1580         tmp = pci_find_upstream_pcie_bridge(pdev);
1581         if (!tmp)
1582                 return 0;
1583         /* Secondary interface's bus number and devfn 0 */
1584         parent = pdev->bus->self;
1585         while (parent != tmp) {
1586                 ret = domain_context_mapping_one(domain,
1587                                                  pci_domain_nr(parent->bus),
1588                                                  parent->bus->number,
1589                                                  parent->devfn, translation);
1590                 if (ret)
1591                         return ret;
1592                 parent = parent->bus->self;
1593         }
1594         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1595                 return domain_context_mapping_one(domain,
1596                                         pci_domain_nr(tmp->subordinate),
1597                                         tmp->subordinate->number, 0,
1598                                         translation);
1599         else /* this is a legacy PCI bridge */
1600                 return domain_context_mapping_one(domain,
1601                                                   pci_domain_nr(tmp->bus),
1602                                                   tmp->bus->number,
1603                                                   tmp->devfn,
1604                                                   translation);
1605 }
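/*
 * Annotation (not in the original source): for a device behind a
 * PCIe-to-PCI bridge, domain_context_mapping() above installs context
 * entries for the device itself, for every intermediate PCI-PCI bridge up
 * to the upstream bridge, and finally for the bridge's secondary bus at
 * devfn 0 (or for the bridge's own bus/devfn if it is a legacy PCI
 * bridge).  They all point at the same domain, because devices behind
 * such a bridge share the bridge's source-id on the bus.
 */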
1606
1607 static int domain_context_mapped(struct pci_dev *pdev)
1608 {
1609         int ret;
1610         struct pci_dev *tmp, *parent;
1611         struct intel_iommu *iommu;
1612
1613         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1614                                 pdev->devfn);
1615         if (!iommu)
1616                 return -ENODEV;
1617
1618         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1619         if (!ret)
1620                 return ret;
1621         /* dependent device mapping */
1622         tmp = pci_find_upstream_pcie_bridge(pdev);
1623         if (!tmp)
1624                 return ret;
1625         /* Secondary interface's bus number and devfn 0 */
1626         parent = pdev->bus->self;
1627         while (parent != tmp) {
1628                 ret = device_context_mapped(iommu, parent->bus->number,
1629                                             parent->devfn);
1630                 if (!ret)
1631                         return ret;
1632                 parent = parent->bus->self;
1633         }
1634         if (pci_is_pcie(tmp))
1635                 return device_context_mapped(iommu, tmp->subordinate->number,
1636                                              0);
1637         else
1638                 return device_context_mapped(iommu, tmp->bus->number,
1639                                              tmp->devfn);
1640 }
1641
1642 /* Returns the number of VT-d pages needed, rounded up to cover whole MM pages */
1643 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1644                                             size_t size)
1645 {
1646         host_addr &= ~PAGE_MASK;
1647         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1648 }
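/*
 * Worked example (annotation): with 4KiB MM and VT-d pages, a buffer that
 * starts at page offset 0x600 with size 0x1200 gives
 * PAGE_ALIGN(0x600 + 0x1200) = 0x2000, so aligned_nrpages() returns 2.
 * If PAGE_SIZE were larger than VTD_PAGE_SIZE, the result would still be
 * a whole number of MM pages expressed in VT-d page units.
 */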
1649
1650 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1651                             struct scatterlist *sg, unsigned long phys_pfn,
1652                             unsigned long nr_pages, int prot)
1653 {
1654         struct dma_pte *first_pte = NULL, *pte = NULL;
1655         phys_addr_t uninitialized_var(pteval);
1656         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1657         unsigned long sg_res;
1658
1659         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1660
1661         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1662                 return -EINVAL;
1663
1664         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1665
1666         if (sg)
1667                 sg_res = 0;
1668         else {
1669                 sg_res = nr_pages + 1;
1670                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1671         }
1672
1673         while (nr_pages--) {
1674                 uint64_t tmp;
1675
1676                 if (!sg_res) {
1677                         sg_res = aligned_nrpages(sg->offset, sg->length);
1678                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1679                         sg->dma_length = sg->length;
1680                         pteval = page_to_phys(sg_page(sg)) | prot;
1681                 }
1682                 if (!pte) {
1683                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1684                         if (!pte)
1685                                 return -ENOMEM;
1686                 }
1687                 /* We don't need a lock here; nobody else
1688                  * touches the iova range
1689                  */
1690                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1691                 if (tmp) {
1692                         static int dumps = 5;
1693                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1694                                iov_pfn, tmp, (unsigned long long)pteval);
1695                         if (dumps) {
1696                                 dumps--;
1697                                 debug_dma_dump_mappings(NULL);
1698                         }
1699                         WARN_ON(1);
1700                 }
1701                 pte++;
1702                 if (!nr_pages || first_pte_in_page(pte)) {
1703                         domain_flush_cache(domain, first_pte,
1704                                            (void *)pte - (void *)first_pte);
1705                         pte = NULL;
1706                 }
1707                 iov_pfn++;
1708                 pteval += VTD_PAGE_SIZE;
1709                 sg_res--;
1710                 if (!sg_res)
1711                         sg = sg_next(sg);
1712         }
1713         return 0;
1714 }
1715
1716 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1717                                     struct scatterlist *sg, unsigned long nr_pages,
1718                                     int prot)
1719 {
1720         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1721 }
1722
1723 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1724                                      unsigned long phys_pfn, unsigned long nr_pages,
1725                                      int prot)
1726 {
1727         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1728 }
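/*
 * Usage sketch (annotation): an identity mapping is just a pfn mapping
 * whose IOVA and physical pfns coincide, as iommu_domain_identity_map()
 * below does:
 *
 *	domain_pfn_mapping(domain, first_vpfn, first_vpfn,
 *			   last_vpfn - first_vpfn + 1,
 *			   DMA_PTE_READ|DMA_PTE_WRITE);
 *
 * Scatterlist callers use domain_sg_mapping() instead and let
 * __domain_mapping() take the physical addresses from the sg entries.
 */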
1729
1730 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1731 {
1732         if (!iommu)
1733                 return;
1734
1735         clear_context_table(iommu, bus, devfn);
1736         iommu->flush.flush_context(iommu, 0, 0, 0,
1737                                            DMA_CCMD_GLOBAL_INVL);
1738         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1739 }
1740
1741 static void domain_remove_dev_info(struct dmar_domain *domain)
1742 {
1743         struct device_domain_info *info;
1744         unsigned long flags;
1745         struct intel_iommu *iommu;
1746
1747         spin_lock_irqsave(&device_domain_lock, flags);
1748         while (!list_empty(&domain->devices)) {
1749                 info = list_entry(domain->devices.next,
1750                         struct device_domain_info, link);
1751                 list_del(&info->link);
1752                 list_del(&info->global);
1753                 if (info->dev)
1754                         info->dev->dev.archdata.iommu = NULL;
1755                 spin_unlock_irqrestore(&device_domain_lock, flags);
1756
1757                 iommu_disable_dev_iotlb(info);
1758                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1759                 iommu_detach_dev(iommu, info->bus, info->devfn);
1760                 free_devinfo_mem(info);
1761
1762                 spin_lock_irqsave(&device_domain_lock, flags);
1763         }
1764         spin_unlock_irqrestore(&device_domain_lock, flags);
1765 }
1766
1767 /*
1768  * find_domain
1769  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1770  */
1771 static struct dmar_domain *
1772 find_domain(struct pci_dev *pdev)
1773 {
1774         struct device_domain_info *info;
1775
1776         /* No lock here, assumes no domain exit in normal case */
1777         info = pdev->dev.archdata.iommu;
1778         if (info)
1779                 return info->domain;
1780         return NULL;
1781 }
1782
1783 /* domain is initialized */
1784 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1785 {
1786         struct dmar_domain *domain, *found = NULL;
1787         struct intel_iommu *iommu;
1788         struct dmar_drhd_unit *drhd;
1789         struct device_domain_info *info, *tmp;
1790         struct pci_dev *dev_tmp;
1791         unsigned long flags;
1792         int bus = 0, devfn = 0;
1793         int segment;
1794         int ret;
1795
1796         domain = find_domain(pdev);
1797         if (domain)
1798                 return domain;
1799
1800         segment = pci_domain_nr(pdev->bus);
1801
1802         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1803         if (dev_tmp) {
1804                 if (pci_is_pcie(dev_tmp)) {
1805                         bus = dev_tmp->subordinate->number;
1806                         devfn = 0;
1807                 } else {
1808                         bus = dev_tmp->bus->number;
1809                         devfn = dev_tmp->devfn;
1810                 }
1811                 spin_lock_irqsave(&device_domain_lock, flags);
1812                 list_for_each_entry(info, &device_domain_list, global) {
1813                         if (info->segment == segment &&
1814                             info->bus == bus && info->devfn == devfn) {
1815                                 found = info->domain;
1816                                 break;
1817                         }
1818                 }
1819                 spin_unlock_irqrestore(&device_domain_lock, flags);
1820                 /* pcie-to-pci bridge already has a domain, use it */
1821                 if (found) {
1822                         domain = found;
1823                         goto found_domain;
1824                 }
1825         }
1826
1827         domain = alloc_domain();
1828         if (!domain)
1829                 goto error;
1830
1831         /* Allocate new domain for the device */
1832         drhd = dmar_find_matched_drhd_unit(pdev);
1833         if (!drhd) {
1834                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1835                         pci_name(pdev));
1836                 return NULL;
1837         }
1838         iommu = drhd->iommu;
1839
1840         ret = iommu_attach_domain(domain, iommu);
1841         if (ret) {
1842                 domain_exit(domain);
1843                 goto error;
1844         }
1845
1846         if (domain_init(domain, gaw)) {
1847                 domain_exit(domain);
1848                 goto error;
1849         }
1850
1851         /* register the upstream pcie-to-pci bridge; devices behind it share this domain */
1852         if (dev_tmp) {
1853                 info = alloc_devinfo_mem();
1854                 if (!info) {
1855                         domain_exit(domain);
1856                         goto error;
1857                 }
1858                 info->segment = segment;
1859                 info->bus = bus;
1860                 info->devfn = devfn;
1861                 info->dev = NULL;
1862                 info->domain = domain;
1863                 /* This domain is shared by devices under p2p bridge */
1864                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1865
1866                 /* pcie-to-pci bridge already has a domain, use it */
1867                 found = NULL;
1868                 spin_lock_irqsave(&device_domain_lock, flags);
1869                 list_for_each_entry(tmp, &device_domain_list, global) {
1870                         if (tmp->segment == segment &&
1871                             tmp->bus == bus && tmp->devfn == devfn) {
1872                                 found = tmp->domain;
1873                                 break;
1874                         }
1875                 }
1876                 if (found) {
1877                         spin_unlock_irqrestore(&device_domain_lock, flags);
1878                         free_devinfo_mem(info);
1879                         domain_exit(domain);
1880                         domain = found;
1881                 } else {
1882                         list_add(&info->link, &domain->devices);
1883                         list_add(&info->global, &device_domain_list);
1884                         spin_unlock_irqrestore(&device_domain_lock, flags);
1885                 }
1886         }
1887
1888 found_domain:
1889         info = alloc_devinfo_mem();
1890         if (!info)
1891                 goto error;
1892         info->segment = segment;
1893         info->bus = pdev->bus->number;
1894         info->devfn = pdev->devfn;
1895         info->dev = pdev;
1896         info->domain = domain;
1897         spin_lock_irqsave(&device_domain_lock, flags);
1898         /* somebody else was faster and already set up a domain */
1899         found = find_domain(pdev);
1900         if (found != NULL) {
1901                 spin_unlock_irqrestore(&device_domain_lock, flags);
1902                 if (found != domain) {
1903                         domain_exit(domain);
1904                         domain = found;
1905                 }
1906                 free_devinfo_mem(info);
1907                 return domain;
1908         }
1909         list_add(&info->link, &domain->devices);
1910         list_add(&info->global, &device_domain_list);
1911         pdev->dev.archdata.iommu = info;
1912         spin_unlock_irqrestore(&device_domain_lock, flags);
1913         return domain;
1914 error:
1915         /* recheck it here, maybe somebody else set it in the meantime */
1916         return find_domain(pdev);
1917 }
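/*
 * Annotation: get_domain_for_dev() resolves a domain in three steps:
 * reuse the domain already cached in pdev->dev.archdata.iommu, otherwise
 * reuse the domain of the upstream PCIe-to-PCI bridge (so siblings share
 * one domain), otherwise allocate and initialise a new domain on the
 * matching DMAR unit.  The final recheck under device_domain_lock covers
 * the case where another CPU registered a domain for the same device
 * concurrently.
 */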
1918
1919 static int iommu_identity_mapping;
1920 #define IDENTMAP_ALL            1
1921 #define IDENTMAP_GFX            2
1922 #define IDENTMAP_AZALIA         4
1923
1924 static int iommu_domain_identity_map(struct dmar_domain *domain,
1925                                      unsigned long long start,
1926                                      unsigned long long end)
1927 {
1928         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1929         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1930
1931         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1932                           dma_to_mm_pfn(last_vpfn))) {
1933                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1934                 return -ENOMEM;
1935         }
1936
1937         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1938                  start, end, domain->id);
1939         /*
1940          * RMRR range might have overlap with physical memory range,
1941          * clear it first
1942          */
1943         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1944
1945         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1946                                   last_vpfn - first_vpfn + 1,
1947                                   DMA_PTE_READ|DMA_PTE_WRITE);
1948 }
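/*
 * Worked example (annotation, addresses purely illustrative): for an RMRR
 * covering 0xe0000 - 0xeffff, first_vpfn = 0xe0 and last_vpfn = 0xef, so
 * 16 VT-d pages are reserved in the iova allocator and then mapped 1:1
 * (iova pfn == physical pfn) with read/write permission.
 */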
1949
1950 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1951                                       unsigned long long start,
1952                                       unsigned long long end)
1953 {
1954         struct dmar_domain *domain;
1955         int ret;
1956
1957         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1958         if (!domain)
1959                 return -ENOMEM;
1960
1961         /* For _hardware_ passthrough, don't bother. But for software
1962            passthrough, we do it anyway -- it may indicate a memory
1963            range which is reserved in E820, and so didn't get set
1964            up to start with in si_domain */
1965         if (domain == si_domain && hw_pass_through) {
1966                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1967                        pci_name(pdev), start, end);
1968                 return 0;
1969         }
1970
1971         printk(KERN_INFO
1972                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1973                pci_name(pdev), start, end);
1974
1975         if (end < start) {
1976                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1977                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1978                         dmi_get_system_info(DMI_BIOS_VENDOR),
1979                         dmi_get_system_info(DMI_BIOS_VERSION),
1980                      dmi_get_system_info(DMI_PRODUCT_VERSION));
1981                 ret = -EIO;
1982                 goto error;
1983         }
1984
1985         if (end >> agaw_to_width(domain->agaw)) {
1986                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1987                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1988                      agaw_to_width(domain->agaw),
1989                      dmi_get_system_info(DMI_BIOS_VENDOR),
1990                      dmi_get_system_info(DMI_BIOS_VERSION),
1991                      dmi_get_system_info(DMI_PRODUCT_VERSION));
1992                 ret = -EIO;
1993                 goto error;
1994         }
1995
1996         ret = iommu_domain_identity_map(domain, start, end);
1997         if (ret)
1998                 goto error;
1999
2000         /* context entry init */
2001         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2002         if (ret)
2003                 goto error;
2004
2005         return 0;
2006
2007  error:
2008         domain_exit(domain);
2009         return ret;
2010 }
2011
2012 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2013         struct pci_dev *pdev)
2014 {
2015         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2016                 return 0;
2017         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2018                 rmrr->end_address + 1);
2019 }
2020
2021 #ifdef CONFIG_DMAR_FLOPPY_WA
2022 static inline void iommu_prepare_isa(void)
2023 {
2024         struct pci_dev *pdev;
2025         int ret;
2026
2027         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2028         if (!pdev)
2029                 return;
2030
2031         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2032         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2033
2034         if (ret)
2035                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2036                        "floppy might not work\n");
2037
2038 }
2039 #else
2040 static inline void iommu_prepare_isa(void)
2041 {
2042         return;
2043 }
2044 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2045
2046 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2047
2048 static int __init si_domain_work_fn(unsigned long start_pfn,
2049                                     unsigned long end_pfn, void *datax)
2050 {
2051         int *ret = datax;
2052
2053         *ret = iommu_domain_identity_map(si_domain,
2054                                          (uint64_t)start_pfn << PAGE_SHIFT,
2055                                          (uint64_t)end_pfn << PAGE_SHIFT);
2056         return *ret;
2057
2058 }
2059
2060 static int __init si_domain_init(int hw)
2061 {
2062         struct dmar_drhd_unit *drhd;
2063         struct intel_iommu *iommu;
2064         int nid, ret = 0;
2065
2066         si_domain = alloc_domain();
2067         if (!si_domain)
2068                 return -EFAULT;
2069
2070         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2071
2072         for_each_active_iommu(iommu, drhd) {
2073                 ret = iommu_attach_domain(si_domain, iommu);
2074                 if (ret) {
2075                         domain_exit(si_domain);
2076                         return -EFAULT;
2077                 }
2078         }
2079
2080         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2081                 domain_exit(si_domain);
2082                 return -EFAULT;
2083         }
2084
2085         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2086
2087         if (hw)
2088                 return 0;
2089
2090         for_each_online_node(nid) {
2091                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2092                 if (ret)
2093                         return ret;
2094         }
2095
2096         return 0;
2097 }
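/*
 * Annotation: si_domain is the single static identity (1:1) domain.  With
 * hardware pass-through (hw != 0) its page tables are never consulted, so
 * nothing is mapped; otherwise si_domain_init() identity maps every
 * active memory region of every online node via si_domain_work_fn().
 */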
2098
2099 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2100                                           struct pci_dev *pdev);
2101 static int identity_mapping(struct pci_dev *pdev)
2102 {
2103         struct device_domain_info *info;
2104
2105         if (likely(!iommu_identity_mapping))
2106                 return 0;
2107
2108
2109         list_for_each_entry(info, &si_domain->devices, link)
2110                 if (info->dev == pdev)
2111                         return 1;
2112         return 0;
2113 }
2114
2115 static int domain_add_dev_info(struct dmar_domain *domain,
2116                                struct pci_dev *pdev,
2117                                int translation)
2118 {
2119         struct device_domain_info *info;
2120         unsigned long flags;
2121         int ret;
2122
2123         info = alloc_devinfo_mem();
2124         if (!info)
2125                 return -ENOMEM;
2126
2127         ret = domain_context_mapping(domain, pdev, translation);
2128         if (ret) {
2129                 free_devinfo_mem(info);
2130                 return ret;
2131         }
2132
2133         info->segment = pci_domain_nr(pdev->bus);
2134         info->bus = pdev->bus->number;
2135         info->devfn = pdev->devfn;
2136         info->dev = pdev;
2137         info->domain = domain;
2138
2139         spin_lock_irqsave(&device_domain_lock, flags);
2140         list_add(&info->link, &domain->devices);
2141         list_add(&info->global, &device_domain_list);
2142         pdev->dev.archdata.iommu = info;
2143         spin_unlock_irqrestore(&device_domain_lock, flags);
2144
2145         return 0;
2146 }
2147
2148 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2149 {
2150         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2151                 return 1;
2152
2153         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2154                 return 1;
2155
2156         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2157                 return 0;
2158
2159         /*
2160          * We want to start off with all devices in the 1:1 domain, and
2161          * take them out later if we find they can't access all of memory.
2162          *
2163          * However, we can't do this for PCI devices behind bridges,
2164          * because all PCI devices behind the same bridge will end up
2165          * with the same source-id on their transactions.
2166          *
2167          * Practically speaking, we can't change things around for these
2168          * devices at run-time, because we can't be sure there'll be no
2169          * DMA transactions in flight for any of their siblings.
2170          * 
2171          * So PCI devices (unless they're on the root bus) as well as
2172          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2173          * the 1:1 domain, just in _case_ one of their siblings turns out
2174          * not to be able to map all of memory.
2175          */
2176         if (!pci_is_pcie(pdev)) {
2177                 if (!pci_is_root_bus(pdev->bus))
2178                         return 0;
2179                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2180                         return 0;
2181         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2182                 return 0;
2183
2184         /* 
2185          * At boot time, we don't yet know if devices will be 64-bit capable.
2186          * Assume that they will -- if they turn out not to be, then we can 
2187          * take them out of the 1:1 domain later.
2188          */
2189         if (!startup)
2190                 return pdev->dma_mask > DMA_BIT_MASK(32);
2191
2192         return 1;
2193 }
2194
2195 static int __init iommu_prepare_static_identity_mapping(int hw)
2196 {
2197         struct pci_dev *pdev = NULL;
2198         int ret;
2199
2200         ret = si_domain_init(hw);
2201         if (ret)
2202                 return -EFAULT;
2203
2204         for_each_pci_dev(pdev) {
2205                 if (iommu_should_identity_map(pdev, 1)) {
2206                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2207                                hw ? "hardware" : "software", pci_name(pdev));
2208
2209                         ret = domain_add_dev_info(si_domain, pdev,
2210                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2211                                                      CONTEXT_TT_MULTI_LEVEL);
2212                         if (ret)
2213                                 return ret;
2214                 }
2215         }
2216
2217         return 0;
2218 }
2219
2220 int __init init_dmars(void)
2221 {
2222         struct dmar_drhd_unit *drhd;
2223         struct dmar_rmrr_unit *rmrr;
2224         struct pci_dev *pdev;
2225         struct intel_iommu *iommu;
2226         int i, ret;
2227
2228         /*
2229          * for each drhd
2230          *    allocate root
2231          *    initialize and program root entry to not present
2232          * endfor
2233          */
2234         for_each_drhd_unit(drhd) {
2235                 g_num_of_iommus++;
2236                 /*
2237                  * lock not needed as this is only incremented in the single
2238                  * threaded kernel __init code path; all other accesses are
2239                  * read only
2240                  */
2241         }
2242
2243         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2244                         GFP_KERNEL);
2245         if (!g_iommus) {
2246                 printk(KERN_ERR "Allocating global iommu array failed\n");
2247                 ret = -ENOMEM;
2248                 goto error;
2249         }
2250
2251         deferred_flush = kzalloc(g_num_of_iommus *
2252                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2253         if (!deferred_flush) {
2254                 ret = -ENOMEM;
2255                 goto error;
2256         }
2257
2258         for_each_drhd_unit(drhd) {
2259                 if (drhd->ignored)
2260                         continue;
2261
2262                 iommu = drhd->iommu;
2263                 g_iommus[iommu->seq_id] = iommu;
2264
2265                 ret = iommu_init_domains(iommu);
2266                 if (ret)
2267                         goto error;
2268
2269                 /*
2270                  * TBD:
2271                  * we could share the same root & context tables
2272                  * among all IOMMUs. Need to split it later.
2273                  */
2274                 ret = iommu_alloc_root_entry(iommu);
2275                 if (ret) {
2276                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2277                         goto error;
2278                 }
2279                 if (!ecap_pass_through(iommu->ecap))
2280                         hw_pass_through = 0;
2281         }
2282
2283         /*
2284          * Start from a sane iommu hardware state.
2285          */
2286         for_each_drhd_unit(drhd) {
2287                 if (drhd->ignored)
2288                         continue;
2289
2290                 iommu = drhd->iommu;
2291
2292                 /*
2293                  * If the queued invalidation is already initialized by us
2294                  * (for example, while enabling interrupt-remapping) then
2295                  * we already have things rolling from a sane state.
2296                  */
2297                 if (iommu->qi)
2298                         continue;
2299
2300                 /*
2301                  * Clear any previous faults.
2302                  */
2303                 dmar_fault(-1, iommu);
2304                 /*
2305                  * Disable queued invalidation if supported and already enabled
2306                  * before OS handover.
2307                  */
2308                 dmar_disable_qi(iommu);
2309         }
2310
2311         for_each_drhd_unit(drhd) {
2312                 if (drhd->ignored)
2313                         continue;
2314
2315                 iommu = drhd->iommu;
2316
2317                 if (dmar_enable_qi(iommu)) {
2318                         /*
2319                          * Queued Invalidation could not be enabled; fall back
2320                          * to Register Based Invalidation
2321                          */
2322                         iommu->flush.flush_context = __iommu_flush_context;
2323                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2324                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2325                                "invalidation\n",
2326                                 iommu->seq_id,
2327                                (unsigned long long)drhd->reg_base_addr);
2328                 } else {
2329                         iommu->flush.flush_context = qi_flush_context;
2330                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2331                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2332                                "invalidation\n",
2333                                 iommu->seq_id,
2334                                (unsigned long long)drhd->reg_base_addr);
2335                 }
2336         }
2337
2338         if (iommu_pass_through)
2339                 iommu_identity_mapping |= IDENTMAP_ALL;
2340
2341 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2342         iommu_identity_mapping |= IDENTMAP_GFX;
2343 #endif
2344
2345         check_tylersburg_isoch();
2346
2347         /*
2348          * If pass through is not set or not enabled, setup context entries for
2349          * identity mappings for rmrr, gfx, and isa and may fall back to static
2350          * identity mapping if iommu_identity_mapping is set.
2351          */
2352         if (iommu_identity_mapping) {
2353                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2354                 if (ret) {
2355                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2356                         goto error;
2357                 }
2358         }
2359         /*
2360          * For each rmrr
2361          *   for each dev attached to rmrr
2362          *   do
2363          *     locate drhd for dev, alloc domain for dev
2364          *     allocate free domain
2365          *     allocate page table entries for rmrr
2366          *     if context not allocated for bus
2367          *           allocate and init context
2368          *           set present in root table for this bus
2369          *     init context with domain, translation etc
2370          *    endfor
2371          * endfor
2372          */
2373         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2374         for_each_rmrr_units(rmrr) {
2375                 for (i = 0; i < rmrr->devices_cnt; i++) {
2376                         pdev = rmrr->devices[i];
2377                         /*
2378                          * some BIOSes list non-existent devices in the
2379                          * DMAR table.
2380                          */
2381                         if (!pdev)
2382                                 continue;
2383                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2384                         if (ret)
2385                                 printk(KERN_ERR
2386                                        "IOMMU: mapping reserved region failed\n");
2387                 }
2388         }
2389
2390         iommu_prepare_isa();
2391
2392         /*
2393          * for each drhd
2394          *   enable fault log
2395          *   global invalidate context cache
2396          *   global invalidate iotlb
2397          *   enable translation
2398          */
2399         for_each_drhd_unit(drhd) {
2400                 if (drhd->ignored)
2401                         continue;
2402                 iommu = drhd->iommu;
2403
2404                 iommu_flush_write_buffer(iommu);
2405
2406                 ret = dmar_set_interrupt(iommu);
2407                 if (ret)
2408                         goto error;
2409
2410                 iommu_set_root_entry(iommu);
2411
2412                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2413                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2414
2415                 ret = iommu_enable_translation(iommu);
2416                 if (ret)
2417                         goto error;
2418
2419                 iommu_disable_protect_mem_regions(iommu);
2420         }
2421
2422         return 0;
2423 error:
2424         for_each_drhd_unit(drhd) {
2425                 if (drhd->ignored)
2426                         continue;
2427                 iommu = drhd->iommu;
2428                 free_iommu(iommu);
2429         }
2430         kfree(g_iommus);
2431         return ret;
2432 }
2433
2434 /* This takes a number of _MM_ pages, not VTD pages */
2435 static struct iova *intel_alloc_iova(struct device *dev,
2436                                      struct dmar_domain *domain,
2437                                      unsigned long nrpages, uint64_t dma_mask)
2438 {
2439         struct pci_dev *pdev = to_pci_dev(dev);
2440         struct iova *iova = NULL;
2441
2442         /* Restrict dma_mask to the width that the iommu can handle */
2443         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2444
2445         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2446                 /*
2447                  * First try to allocate an io virtual address in
2448                  * DMA_BIT_MASK(32) and if that fails then try allocating
2449                  * from higher range
2450                  */
2451                 iova = alloc_iova(&domain->iovad, nrpages,
2452                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2453                 if (iova)
2454                         return iova;
2455         }
2456         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2457         if (unlikely(!iova)) {
2458                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2459                        nrpages, pci_name(pdev));
2460                 return NULL;
2461         }
2462
2463         return iova;
2464 }
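/*
 * Annotation: callers pass the size in MM pages, e.g.
 *
 *	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
 *				pdev->dma_mask);
 *
 * and the allocator prefers addresses below 4GiB (unless forcedac is set)
 * before falling back to the device's full DMA mask, clamped above to the
 * domain's address width.
 */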
2465
2466 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2467 {
2468         struct dmar_domain *domain;
2469         int ret;
2470
2471         domain = get_domain_for_dev(pdev,
2472                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2473         if (!domain) {
2474                 printk(KERN_ERR
2475                         "Allocating domain for %s failed", pci_name(pdev));
2476                 return NULL;
2477         }
2478
2479         /* make sure context mapping is ok */
2480         if (unlikely(!domain_context_mapped(pdev))) {
2481                 ret = domain_context_mapping(domain, pdev,
2482                                              CONTEXT_TT_MULTI_LEVEL);
2483                 if (ret) {
2484                         printk(KERN_ERR
2485                                 "Domain context map for %s failed",
2486                                 pci_name(pdev));
2487                         return NULL;
2488                 }
2489         }
2490
2491         return domain;
2492 }
2493
2494 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2495 {
2496         struct device_domain_info *info;
2497
2498         /* No lock here, assumes no domain exit in normal case */
2499         info = dev->dev.archdata.iommu;
2500         if (likely(info))
2501                 return info->domain;
2502
2503         return __get_valid_domain_for_dev(dev);
2504 }
2505
2506 static int iommu_dummy(struct pci_dev *pdev)
2507 {
2508         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2509 }
2510
2511 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2512 static int iommu_no_mapping(struct device *dev)
2513 {
2514         struct pci_dev *pdev;
2515         int found;
2516
2517         if (unlikely(dev->bus != &pci_bus_type))
2518                 return 1;
2519
2520         pdev = to_pci_dev(dev);
2521         if (iommu_dummy(pdev))
2522                 return 1;
2523
2524         if (!iommu_identity_mapping)
2525                 return 0;
2526
2527         found = identity_mapping(pdev);
2528         if (found) {
2529                 if (iommu_should_identity_map(pdev, 0))
2530                         return 1;
2531                 else {
2532                         /*
2533                          * The device is only 32 bit DMA capable: remove it
2534                          * from si_domain and fall back to non-identity mapping.
2535                          */
2536                         domain_remove_one_dev_info(si_domain, pdev);
2537                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2538                                pci_name(pdev));
2539                         return 0;
2540                 }
2541         } else {
2542                 /*
2543                  * A 64 bit DMA capable device detached from a VM is put
2544                  * into si_domain for identity mapping.
2545                  */
2546                 if (iommu_should_identity_map(pdev, 0)) {
2547                         int ret;
2548                         ret = domain_add_dev_info(si_domain, pdev,
2549                                                   hw_pass_through ?
2550                                                   CONTEXT_TT_PASS_THROUGH :
2551                                                   CONTEXT_TT_MULTI_LEVEL);
2552                         if (!ret) {
2553                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2554                                        pci_name(pdev));
2555                                 return 1;
2556                         }
2557                 }
2558         }
2559
2560         return 0;
2561 }
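/*
 * Annotation: iommu_no_mapping() is consulted on every map and unmap.  A
 * device that should stay identity mapped gets its physical address back
 * unchanged, and a device can migrate between si_domain and a private
 * translated domain here at run time, e.g. once it turns out to be only
 * 32 bit DMA capable.
 */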
2562
2563 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2564                                      size_t size, int dir, u64 dma_mask)
2565 {
2566         struct pci_dev *pdev = to_pci_dev(hwdev);
2567         struct dmar_domain *domain;
2568         phys_addr_t start_paddr;
2569         struct iova *iova;
2570         int prot = 0;
2571         int ret;
2572         struct intel_iommu *iommu;
2573         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2574
2575         BUG_ON(dir == DMA_NONE);
2576
2577         if (iommu_no_mapping(hwdev))
2578                 return paddr;
2579
2580         domain = get_valid_domain_for_dev(pdev);
2581         if (!domain)
2582                 return 0;
2583
2584         iommu = domain_get_iommu(domain);
2585         size = aligned_nrpages(paddr, size);
2586
2587         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2588                                 pdev->dma_mask);
2589         if (!iova)
2590                 goto error;
2591
2592         /*
2593          * Check if DMAR supports zero-length reads on write only
2594          * mappings.
2595          */
2596         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2597                         !cap_zlr(iommu->cap))
2598                 prot |= DMA_PTE_READ;
2599         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2600                 prot |= DMA_PTE_WRITE;
2601         /*
2602          * The range paddr .. paddr + size might cover only part of a page;
2603          * we should map the whole page.  Note: if two parts of one page are
2604          * separately mapped, we might have two guest addresses mapping to the
2605          * same host paddr, but this is not a big problem
2606          */
2607         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2608                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2609         if (ret)
2610                 goto error;
2611
2612         /* it's a non-present to present mapping. Only flush if caching mode */
2613         if (cap_caching_mode(iommu->cap))
2614                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2615         else
2616                 iommu_flush_write_buffer(iommu);
2617
2618         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2619         start_paddr += paddr & ~PAGE_MASK;
2620         return start_paddr;
2621
2622 error:
2623         if (iova)
2624                 __free_iova(&domain->iovad, iova);
2625         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2626                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2627         return 0;
2628 }
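/*
 * Annotation: the DMA address handed back to the driver combines the
 * allocated iova with the sub-page offset of the original buffer:
 *
 *	dma_addr = ((phys_addr_t)iova->pfn_lo << PAGE_SHIFT)
 *			+ (paddr & ~PAGE_MASK);
 *
 * so whole pages are mapped in the page tables while the caller still
 * gets a byte-accurate address.  A return value of 0 means failure (see
 * intel_mapping_error() below).
 */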
2629
2630 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2631                                  unsigned long offset, size_t size,
2632                                  enum dma_data_direction dir,
2633                                  struct dma_attrs *attrs)
2634 {
2635         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2636                                   dir, to_pci_dev(dev)->dma_mask);
2637 }
2638
2639 static void flush_unmaps(void)
2640 {
2641         int i, j;
2642
2643         timer_on = 0;
2644
2645         /* just flush them all */
2646         for (i = 0; i < g_num_of_iommus; i++) {
2647                 struct intel_iommu *iommu = g_iommus[i];
2648                 if (!iommu)
2649                         continue;
2650
2651                 if (!deferred_flush[i].next)
2652                         continue;
2653
2654                 /* In caching mode, global flushes make emulation expensive */
2655                 if (!cap_caching_mode(iommu->cap))
2656                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2657                                          DMA_TLB_GLOBAL_FLUSH);
2658                 for (j = 0; j < deferred_flush[i].next; j++) {
2659                         unsigned long mask;
2660                         struct iova *iova = deferred_flush[i].iova[j];
2661                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2662
2663                         /* On real hardware multiple invalidations are expensive */
2664                         if (cap_caching_mode(iommu->cap))
2665                                 iommu_flush_iotlb_psi(iommu, domain->id,
2666                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2667                         else {
2668                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2669                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2670                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2671                         }
2672                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2673                 }
2674                 deferred_flush[i].next = 0;
2675         }
2676
2677         list_size = 0;
2678 }
2679
2680 static void flush_unmaps_timeout(unsigned long data)
2681 {
2682         unsigned long flags;
2683
2684         spin_lock_irqsave(&async_umap_flush_lock, flags);
2685         flush_unmaps();
2686         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2687 }
2688
2689 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2690 {
2691         unsigned long flags;
2692         int next, iommu_id;
2693         struct intel_iommu *iommu;
2694
2695         spin_lock_irqsave(&async_umap_flush_lock, flags);
2696         if (list_size == HIGH_WATER_MARK)
2697                 flush_unmaps();
2698
2699         iommu = domain_get_iommu(dom);
2700         iommu_id = iommu->seq_id;
2701
2702         next = deferred_flush[iommu_id].next;
2703         deferred_flush[iommu_id].domain[next] = dom;
2704         deferred_flush[iommu_id].iova[next] = iova;
2705         deferred_flush[iommu_id].next++;
2706
2707         if (!timer_on) {
2708                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2709                 timer_on = 1;
2710         }
2711         list_size++;
2712         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2713 }
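/*
 * Annotation: unmaps are batched per IOMMU in deferred_flush[], and the
 * corresponding IOTLB entries are only invalidated once the batch reaches
 * HIGH_WATER_MARK entries or the 10ms unmap_timer fires.  This trades a
 * short window of stale IOTLB entries for far fewer invalidations; booting
 * with intel_iommu=strict disables the batching (see intel_unmap_page()
 * below).
 */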
2714
2715 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2716                              size_t size, enum dma_data_direction dir,
2717                              struct dma_attrs *attrs)
2718 {
2719         struct pci_dev *pdev = to_pci_dev(dev);
2720         struct dmar_domain *domain;
2721         unsigned long start_pfn, last_pfn;
2722         struct iova *iova;
2723         struct intel_iommu *iommu;
2724
2725         if (iommu_no_mapping(dev))
2726                 return;
2727
2728         domain = find_domain(pdev);
2729         BUG_ON(!domain);
2730
2731         iommu = domain_get_iommu(domain);
2732
2733         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2734         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2735                       (unsigned long long)dev_addr))
2736                 return;
2737
2738         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2739         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2740
2741         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2742                  pci_name(pdev), start_pfn, last_pfn);
2743
2744         /* clear the PTEs covering the whole range */
2745         dma_pte_clear_range(domain, start_pfn, last_pfn);
2746
2747         /* free page tables */
2748         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2749
2750         if (intel_iommu_strict) {
2751                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2752                                       last_pfn - start_pfn + 1, 0);
2753                 /* free iova */
2754                 __free_iova(&domain->iovad, iova);
2755         } else {
2756                 add_unmap(domain, iova);
2757                 /*
2758                  * queue up the release of the unmap to save the 1/6th of the
2759                  * cpu used up by the iotlb flush operation...
2760                  */
2761         }
2762 }
2763
2764 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2765                                   dma_addr_t *dma_handle, gfp_t flags)
2766 {
2767         void *vaddr;
2768         int order;
2769
2770         size = PAGE_ALIGN(size);
2771         order = get_order(size);
2772
2773         if (!iommu_no_mapping(hwdev))
2774                 flags &= ~(GFP_DMA | GFP_DMA32);
2775         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2776                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2777                         flags |= GFP_DMA;
2778                 else
2779                         flags |= GFP_DMA32;
2780         }
2781
2782         vaddr = (void *)__get_free_pages(flags, order);
2783         if (!vaddr)
2784                 return NULL;
2785         memset(vaddr, 0, size);
2786
2787         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2788                                          DMA_BIDIRECTIONAL,
2789                                          hwdev->coherent_dma_mask);
2790         if (*dma_handle)
2791                 return vaddr;
2792         free_pages((unsigned long)vaddr, order);
2793         return NULL;
2794 }
2795
2796 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2797                                 dma_addr_t dma_handle)
2798 {
2799         int order;
2800
2801         size = PAGE_ALIGN(size);
2802         order = get_order(size);
2803
2804         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2805         free_pages((unsigned long)vaddr, order);
2806 }
2807
2808 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2809                            int nelems, enum dma_data_direction dir,
2810                            struct dma_attrs *attrs)
2811 {
2812         struct pci_dev *pdev = to_pci_dev(hwdev);
2813         struct dmar_domain *domain;
2814         unsigned long start_pfn, last_pfn;
2815         struct iova *iova;
2816         struct intel_iommu *iommu;
2817
2818         if (iommu_no_mapping(hwdev))
2819                 return;
2820
2821         domain = find_domain(pdev);
2822         BUG_ON(!domain);
2823
2824         iommu = domain_get_iommu(domain);
2825
2826         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2827         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2828                       (unsigned long long)sglist[0].dma_address))
2829                 return;
2830
2831         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2832         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2833
2834         /* clear the PTEs covering the whole range */
2835         dma_pte_clear_range(domain, start_pfn, last_pfn);
2836
2837         /* free page tables */
2838         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2839
2840         if (intel_iommu_strict) {
2841                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2842                                       last_pfn - start_pfn + 1, 0);
2843                 /* free iova */
2844                 __free_iova(&domain->iovad, iova);
2845         } else {
2846                 add_unmap(domain, iova);
2847                 /*
2848                  * queue up the release of the unmap to save the 1/6th of the
2849                  * cpu used up by the iotlb flush operation...
2850                  */
2851         }
2852 }
2853
2854 static int intel_nontranslate_map_sg(struct device *hddev,
2855         struct scatterlist *sglist, int nelems, int dir)
2856 {
2857         int i;
2858         struct scatterlist *sg;
2859
2860         for_each_sg(sglist, sg, nelems, i) {
2861                 BUG_ON(!sg_page(sg));
2862                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2863                 sg->dma_length = sg->length;
2864         }
2865         return nelems;
2866 }
2867
2868 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2869                         enum dma_data_direction dir, struct dma_attrs *attrs)
2870 {
2871         int i;
2872         struct pci_dev *pdev = to_pci_dev(hwdev);
2873         struct dmar_domain *domain;
2874         size_t size = 0;
2875         int prot = 0;
2876         struct iova *iova = NULL;
2877         int ret;
2878         struct scatterlist *sg;
2879         unsigned long start_vpfn;
2880         struct intel_iommu *iommu;
2881
2882         BUG_ON(dir == DMA_NONE);
2883         if (iommu_no_mapping(hwdev))
2884                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2885
2886         domain = get_valid_domain_for_dev(pdev);
2887         if (!domain)
2888                 return 0;
2889
2890         iommu = domain_get_iommu(domain);
2891
2892         for_each_sg(sglist, sg, nelems, i)
2893                 size += aligned_nrpages(sg->offset, sg->length);
2894
2895         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2896                                 pdev->dma_mask);
2897         if (!iova) {
2898                 sglist->dma_length = 0;
2899                 return 0;
2900         }
2901
2902         /*
2903          * Check if DMAR supports zero-length reads on write only
2904          * mappings.
2905          */
2906         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2907                         !cap_zlr(iommu->cap))
2908                 prot |= DMA_PTE_READ;
2909         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2910                 prot |= DMA_PTE_WRITE;
2911
2912         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2913
2914         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2915         if (unlikely(ret)) {
2916                 /* clear the PTEs covering the range */
2917                 dma_pte_clear_range(domain, start_vpfn,
2918                                     start_vpfn + size - 1);
2919                 /* free page tables */
2920                 dma_pte_free_pagetable(domain, start_vpfn,
2921                                        start_vpfn + size - 1);
2922                 /* free iova */
2923                 __free_iova(&domain->iovad, iova);
2924                 return 0;
2925         }
2926
2927         /* it's a non-present to present mapping. Only flush if caching mode */
2928         if (cap_caching_mode(iommu->cap))
2929                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
2930         else
2931                 iommu_flush_write_buffer(iommu);
2932
2933         return nelems;
2934 }
2935
2936 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2937 {
2938         return !dma_addr;
2939 }
2940
2941 struct dma_map_ops intel_dma_ops = {
2942         .alloc_coherent = intel_alloc_coherent,
2943         .free_coherent = intel_free_coherent,
2944         .map_sg = intel_map_sg,
2945         .unmap_sg = intel_unmap_sg,
2946         .map_page = intel_map_page,
2947         .unmap_page = intel_unmap_page,
2948         .mapping_error = intel_mapping_error,
2949 };
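/*
 * Usage sketch (annotation, hypothetical driver code; pdev, page, len and
 * err are assumed to exist in the caller): drivers do not call these ops
 * directly, they use the generic DMA API, which dispatches to
 * intel_dma_ops when VT-d translation is active for the device:
 *
 *	dma_addr_t handle;
 *
 *	handle = dma_map_page(&pdev->dev, page, 0, len, DMA_TO_DEVICE);
 *	if (dma_mapping_error(&pdev->dev, handle))
 *		goto err;
 *	...
 *	dma_unmap_page(&pdev->dev, handle, len, DMA_TO_DEVICE);
 *
 * dma_map_page() ends up in intel_map_page() above and dma_unmap_page()
 * in intel_unmap_page(), via the architecture's get_dma_ops() hook.
 */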
2950
2951 static inline int iommu_domain_cache_init(void)
2952 {
2953         int ret = 0;
2954
2955         iommu_domain_cache = kmem_cache_create("iommu_domain",
2956                                          sizeof(struct dmar_domain),
2957                                          0,
2958                                          SLAB_HWCACHE_ALIGN,
2959
2960                                          NULL);
2961         if (!iommu_domain_cache) {
2962                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2963                 ret = -ENOMEM;
2964         }
2965
2966         return ret;
2967 }
2968
2969 static inline int iommu_devinfo_cache_init(void)
2970 {
2971         int ret = 0;
2972
2973         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2974                                          sizeof(struct device_domain_info),
2975                                          0,
2976                                          SLAB_HWCACHE_ALIGN,
2977                                          NULL);
2978         if (!iommu_devinfo_cache) {
2979                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2980                 ret = -ENOMEM;
2981         }
2982
2983         return ret;
2984 }
2985
2986 static inline int iommu_iova_cache_init(void)
2987 {
2988         int ret = 0;
2989
2990         iommu_iova_cache = kmem_cache_create("iommu_iova",
2991                                          sizeof(struct iova),
2992                                          0,
2993                                          SLAB_HWCACHE_ALIGN,
2994                                          NULL);
2995         if (!iommu_iova_cache) {
2996                 printk(KERN_ERR "Couldn't create iova cache\n");
2997                 ret = -ENOMEM;
2998         }
2999
3000         return ret;
3001 }
3002
3003 static int __init iommu_init_mempool(void)
3004 {
3005         int ret;
3006         ret = iommu_iova_cache_init();
3007         if (ret)
3008                 return ret;
3009
3010         ret = iommu_domain_cache_init();
3011         if (ret)
3012                 goto domain_error;
3013
3014         ret = iommu_devinfo_cache_init();
3015         if (!ret)
3016                 return ret;
3017
3018         kmem_cache_destroy(iommu_domain_cache);
3019 domain_error:
3020         kmem_cache_destroy(iommu_iova_cache);
3021
3022         return -ENOMEM;
3023 }
3024
3025 static void __init iommu_exit_mempool(void)
3026 {
3027         kmem_cache_destroy(iommu_devinfo_cache);
3028         kmem_cache_destroy(iommu_domain_cache);
3029         kmem_cache_destroy(iommu_iova_cache);
3031 }
3032
3033 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3034 {
3035         struct dmar_drhd_unit *drhd;
3036         u32 vtbar;
3037         int rc;
3038
3039         /* We know that this device on this chipset has its own IOMMU.
3040          * If we find it under a different IOMMU, then the BIOS is lying
3041          * to us. Hope that the IOMMU for this device is actually
3042          * disabled, and it needs no translation...
3043          */
3044         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3045         if (rc) {
3046                 /* "can't" happen */
3047                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3048                 return;
3049         }
3050         vtbar &= 0xffff0000;
3051
3052         /* we know that this iommu should be at offset 0xa000 from vtbar */
3053         drhd = dmar_find_matched_drhd_unit(pdev);
3054         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3055                             TAINT_FIRMWARE_WORKAROUND,
3056                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3057                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3058 }
3059 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3060
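/*
 * Mark DMAR units that cover no PCI devices as ignored.  If graphics
 * mapping is disabled, also ignore units that cover only graphics devices
 * and give those devices the dummy (bypass) domain.
 */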
3061 static void __init init_no_remapping_devices(void)
3062 {
3063         struct dmar_drhd_unit *drhd;
3064
3065         for_each_drhd_unit(drhd) {
3066                 if (!drhd->include_all) {
3067                         int i;
3068                         for (i = 0; i < drhd->devices_cnt; i++)
3069                                 if (drhd->devices[i] != NULL)
3070                                         break;
3071                         /* ignore DMAR unit if no pci devices exist */
3072                         if (i == drhd->devices_cnt)
3073                                 drhd->ignored = 1;
3074                 }
3075         }
3076
3077         if (dmar_map_gfx)
3078                 return;
3079
3080         for_each_drhd_unit(drhd) {
3081                 int i;
3082                 if (drhd->ignored || drhd->include_all)
3083                         continue;
3084
3085                 for (i = 0; i < drhd->devices_cnt; i++)
3086                         if (drhd->devices[i] &&
3087                                 !IS_GFX_DEVICE(drhd->devices[i]))
3088                                 break;
3089
3090                 if (i < drhd->devices_cnt)
3091                         continue;
3092
3093                 /* bypass IOMMU if it is just for gfx devices */
3094                 drhd->ignored = 1;
3095                 for (i = 0; i < drhd->devices_cnt; i++) {
3096                         if (!drhd->devices[i])
3097                                 continue;
3098                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3099                 }
3100         }
3101 }
3102
3103 #ifdef CONFIG_SUSPEND
3104 static int init_iommu_hw(void)
3105 {
3106         struct dmar_drhd_unit *drhd;
3107         struct intel_iommu *iommu = NULL;
3108
3109         for_each_active_iommu(iommu, drhd)
3110                 if (iommu->qi)
3111                         dmar_reenable_qi(iommu);
3112
3113         for_each_active_iommu(iommu, drhd) {
3114                 iommu_flush_write_buffer(iommu);
3115
3116                 iommu_set_root_entry(iommu);
3117
3118                 iommu->flush.flush_context(iommu, 0, 0, 0,
3119                                            DMA_CCMD_GLOBAL_INVL);
3120                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3121                                          DMA_TLB_GLOBAL_FLUSH);
3122                 iommu_enable_translation(iommu);
3123                 iommu_disable_protect_mem_regions(iommu);
3124         }
3125
3126         return 0;
3127 }
3128
3129 static void iommu_flush_all(void)
3130 {
3131         struct dmar_drhd_unit *drhd;
3132         struct intel_iommu *iommu;
3133
3134         for_each_active_iommu(iommu, drhd) {
3135                 iommu->flush.flush_context(iommu, 0, 0, 0,
3136                                            DMA_CCMD_GLOBAL_INVL);
3137                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3138                                          DMA_TLB_GLOBAL_FLUSH);
3139         }
3140 }
3141
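/*
 * Flush and disable each IOMMU, saving its fault-event registers so that
 * iommu_resume() can restore them.
 */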
3142 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
3143 {
3144         struct dmar_drhd_unit *drhd;
3145         struct intel_iommu *iommu = NULL;
3146         unsigned long flag;
3147
3148         for_each_active_iommu(iommu, drhd) {
3149                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3150                                                  GFP_ATOMIC);
3151                 if (!iommu->iommu_state)
3152                         goto nomem;
3153         }
3154
3155         iommu_flush_all();
3156
3157         for_each_active_iommu(iommu, drhd) {
3158                 iommu_disable_translation(iommu);
3159
3160                 spin_lock_irqsave(&iommu->register_lock, flag);
3161
3162                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3163                         readl(iommu->reg + DMAR_FECTL_REG);
3164                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3165                         readl(iommu->reg + DMAR_FEDATA_REG);
3166                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3167                         readl(iommu->reg + DMAR_FEADDR_REG);
3168                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3169                         readl(iommu->reg + DMAR_FEUADDR_REG);
3170
3171                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3172         }
3173         return 0;
3174
3175 nomem:
3176         for_each_active_iommu(iommu, drhd)
3177                 kfree(iommu->iommu_state);
3178
3179         return -ENOMEM;
3180 }
3181
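/*
 * Bring the hardware back up (root entry, global invalidations, translation)
 * and restore the fault-event registers saved by iommu_suspend().
 */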
3182 static int iommu_resume(struct sys_device *dev)
3183 {
3184         struct dmar_drhd_unit *drhd;
3185         struct intel_iommu *iommu = NULL;
3186         unsigned long flag;
3187
3188         if (init_iommu_hw()) {
3189                 WARN(1, "IOMMU setup failed, DMAR cannot resume!\n");
3190                 return -EIO;
3191         }
3192
3193         for_each_active_iommu(iommu, drhd) {
3195                 spin_lock_irqsave(&iommu->register_lock, flag);
3196
3197                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3198                         iommu->reg + DMAR_FECTL_REG);
3199                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3200                         iommu->reg + DMAR_FEDATA_REG);
3201                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3202                         iommu->reg + DMAR_FEADDR_REG);
3203                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3204                         iommu->reg + DMAR_FEUADDR_REG);
3205
3206                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3207         }
3208
3209         for_each_active_iommu(iommu, drhd)
3210                 kfree(iommu->iommu_state);
3211
3212         return 0;
3213 }
3214
3215 static struct sysdev_class iommu_sysclass = {
3216         .name           = "iommu",
3217         .resume         = iommu_resume,
3218         .suspend        = iommu_suspend,
3219 };
3220
3221 static struct sys_device device_iommu = {
3222         .cls    = &iommu_sysclass,
3223 };
3224
3225 static int __init init_iommu_sysfs(void)
3226 {
3227         int error;
3228
3229         error = sysdev_class_register(&iommu_sysclass);
3230         if (error)
3231                 return error;
3232
3233         error = sysdev_register(&device_iommu);
3234         if (error)
3235                 sysdev_class_unregister(&iommu_sysclass);
3236
3237         return error;
3238 }
3239
3240 #else
3241 static int __init init_iommu_sysfs(void)
3242 {
3243         return 0;
3244 }
3245 #endif  /* CONFIG_SUSPEND */
3246
3247 /*
3248  * Here we only respond to the action of a device being unbound from its driver.
3249  *
3250  * A newly added device is not attached to its DMAR domain here yet; that
3251  * happens when the device is first mapped to an iova.
3252  */
3253 static int device_notifier(struct notifier_block *nb,
3254                                   unsigned long action, void *data)
3255 {
3256         struct device *dev = data;
3257         struct pci_dev *pdev = to_pci_dev(dev);
3258         struct dmar_domain *domain;
3259
3260         if (iommu_no_mapping(dev))
3261                 return 0;
3262
3263         domain = find_domain(pdev);
3264         if (!domain)
3265                 return 0;
3266
3267         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through)
3268                 domain_remove_one_dev_info(domain, pdev);
3269
3270         return 0;
3271 }
3272
3273 static struct notifier_block device_nb = {
3274         .notifier_call = device_notifier,
3275 };
3276
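/*
 * Main entry point: parse the DMAR tables, initialize the hardware and
 * install the Intel IOMMU DMA operations and IOMMU-API callbacks.
 */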
3277 int __init intel_iommu_init(void)
3278 {
3279         int ret = 0;
3280         int force_on = 0;
3281
3282         /* VT-d is required for a TXT/tboot launch, so enforce that */
3283         force_on = tboot_force_iommu();
3284
3285         if (dmar_table_init()) {
3286                 if (force_on)
3287                         panic("tboot: Failed to initialize DMAR table\n");
3288                 return  -ENODEV;
3289         }
3290
3291         if (dmar_dev_scope_init()) {
3292                 if (force_on)
3293                         panic("tboot: Failed to initialize DMAR device scope\n");
3294                 return  -ENODEV;
3295         }
3296
3297         /*
3298          * Check the need for DMA-remapping initialization now.
3299          * The initialization above is also used by interrupt remapping.
3300          */
3301         if (no_iommu || dmar_disabled)
3302                 return -ENODEV;
3303
3304         iommu_init_mempool();
3305         dmar_init_reserved_ranges();
3306
3307         init_no_remapping_devices();
3308
3309         ret = init_dmars();
3310         if (ret) {
3311                 if (force_on)
3312                         panic("tboot: Failed to initialize DMARs\n");
3313                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3314                 put_iova_domain(&reserved_iova_list);
3315                 iommu_exit_mempool();
3316                 return ret;
3317         }
3318         printk(KERN_INFO
3319         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3320
3321         init_timer(&unmap_timer);
3322 #ifdef CONFIG_SWIOTLB
3323         swiotlb = 0;
3324 #endif
3325         dma_ops = &intel_dma_ops;
3326
3327         init_iommu_sysfs();
3328
3329         register_iommu(&intel_iommu_ops);
3330
3331         bus_register_notifier(&pci_bus_type, &device_nb);
3332
3333         return 0;
3334 }
3335
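/*
 * Devices behind a PCIe-to-PCI bridge get context entries for the bridges
 * on the path above them as well; tear those down along with the device.
 */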
3336 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3337                                            struct pci_dev *pdev)
3338 {
3339         struct pci_dev *tmp, *parent;
3340
3341         if (!iommu || !pdev)
3342                 return;
3343
3344         /* dependent device detach */
3345         tmp = pci_find_upstream_pcie_bridge(pdev);
3346         /* Secondary interface's bus number and devfn 0 */
3347         if (tmp) {
3348                 parent = pdev->bus->self;
3349                 while (parent != tmp) {
3350                         iommu_detach_dev(iommu, parent->bus->number,
3351                                          parent->devfn);
3352                         parent = parent->bus->self;
3353                 }
3354                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3355                         iommu_detach_dev(iommu,
3356                                 tmp->subordinate->number, 0);
3357                 else /* this is a legacy PCI bridge */
3358                         iommu_detach_dev(iommu, tmp->bus->number,
3359                                          tmp->devfn);
3360         }
3361 }
3362
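/*
 * Detach one device from its domain; if it was the last device on its
 * IOMMU, drop that IOMMU from the domain's bitmap and update the domain's
 * capabilities.
 */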
3363 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3364                                           struct pci_dev *pdev)
3365 {
3366         struct device_domain_info *info;
3367         struct intel_iommu *iommu;
3368         unsigned long flags;
3369         int found = 0;
3370         struct list_head *entry, *tmp;
3371
3372         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3373                                 pdev->devfn);
3374         if (!iommu)
3375                 return;
3376
3377         spin_lock_irqsave(&device_domain_lock, flags);
3378         list_for_each_safe(entry, tmp, &domain->devices) {
3379                 info = list_entry(entry, struct device_domain_info, link);
3380                 /* No need to compare PCI domain; it has to be the same */
3381                 if (info->bus == pdev->bus->number &&
3382                     info->devfn == pdev->devfn) {
3383                         list_del(&info->link);
3384                         list_del(&info->global);
3385                         if (info->dev)
3386                                 info->dev->dev.archdata.iommu = NULL;
3387                         spin_unlock_irqrestore(&device_domain_lock, flags);
3388
3389                         iommu_disable_dev_iotlb(info);
3390                         iommu_detach_dev(iommu, info->bus, info->devfn);
3391                         iommu_detach_dependent_devices(iommu, pdev);
3392                         free_devinfo_mem(info);
3393
3394                         spin_lock_irqsave(&device_domain_lock, flags);
3395
3396                         if (found)
3397                                 break;
3398                         else
3399                                 continue;
3400                 }
3401
3402                 /* if there are no other devices under the same iommu
3403                  * owned by this domain, clear this iommu in iommu_bmp and
3404                  * update the iommu count and coherency
3405                  */
3406                 if (iommu == device_to_iommu(info->segment, info->bus,
3407                                             info->devfn))
3408                         found = 1;
3409         }
3410
3411         if (found == 0) {
3412                 unsigned long tmp_flags;
3413                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3414                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3415                 domain->iommu_count--;
3416                 domain_update_iommu_cap(domain);
3417                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3418         }
3419
3420         spin_unlock_irqrestore(&device_domain_lock, flags);
3421 }
3422
3423 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3424 {
3425         struct device_domain_info *info;
3426         struct intel_iommu *iommu;
3427         unsigned long flags1, flags2;
3428
3429         spin_lock_irqsave(&device_domain_lock, flags1);
3430         while (!list_empty(&domain->devices)) {
3431                 info = list_entry(domain->devices.next,
3432                         struct device_domain_info, link);
3433                 list_del(&info->link);
3434                 list_del(&info->global);
3435                 if (info->dev)
3436                         info->dev->dev.archdata.iommu = NULL;
3437
3438                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3439
3440                 iommu_disable_dev_iotlb(info);
3441                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3442                 iommu_detach_dev(iommu, info->bus, info->devfn);
3443                 iommu_detach_dependent_devices(iommu, info->dev);
3444
3445                 /* clear this iommu in iommu_bmp, update iommu count
3446                  * and capabilities
3447                  */
3448                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3449                 if (test_and_clear_bit(iommu->seq_id,
3450                                        &domain->iommu_bmp)) {
3451                         domain->iommu_count--;
3452                         domain_update_iommu_cap(domain);
3453                 }
3454                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3455
3456                 free_devinfo_mem(info);
3457                 spin_lock_irqsave(&device_domain_lock, flags1);
3458         }
3459         spin_unlock_irqrestore(&device_domain_lock, flags1);
3460 }
3461
3462 /* domain id for a virtual machine; it won't be set in context entries */
3463 static unsigned long vm_domid;
3464
3465 static struct dmar_domain *iommu_alloc_vm_domain(void)
3466 {
3467         struct dmar_domain *domain;
3468
3469         domain = alloc_domain_mem();
3470         if (!domain)
3471                 return NULL;
3472
3473         domain->id = vm_domid++;
3474         domain->nid = -1;
3475         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3476         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3477
3478         return domain;
3479 }
3480
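/*
 * Minimal setup for a domain created through the IOMMU API: iova allocator,
 * address widths and the top-level page table.
 */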
3481 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3482 {
3483         int adjust_width;
3484
3485         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3486         spin_lock_init(&domain->iommu_lock);
3487
3488         domain_reserve_special_ranges(domain);
3489
3490         /* calculate AGAW */
3491         domain->gaw = guest_width;
3492         adjust_width = guestwidth_to_adjustwidth(guest_width);
3493         domain->agaw = width_to_agaw(adjust_width);
3494
3495         INIT_LIST_HEAD(&domain->devices);
3496
3497         domain->iommu_count = 0;
3498         domain->iommu_coherency = 0;
3499         domain->iommu_snooping = 0;
3500         domain->max_addr = 0;
3501         domain->nid = -1;
3502
3503         /* always allocate the top pgd */
3504         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3505         if (!domain->pgd)
3506                 return -ENOMEM;
3507         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3508         return 0;
3509 }
3510
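/* Release the domain-id slot this VM domain occupies on each IOMMU. */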
3511 static void iommu_free_vm_domain(struct dmar_domain *domain)
3512 {
3513         unsigned long flags;
3514         struct dmar_drhd_unit *drhd;
3515         struct intel_iommu *iommu;
3516         unsigned long i;
3517         unsigned long ndomains;
3518
3519         for_each_drhd_unit(drhd) {
3520                 if (drhd->ignored)
3521                         continue;
3522                 iommu = drhd->iommu;
3523
3524                 ndomains = cap_ndoms(iommu->cap);
3525                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3526                         if (iommu->domains[i] == domain) {
3527                                 spin_lock_irqsave(&iommu->lock, flags);
3528                                 clear_bit(i, iommu->domain_ids);
3529                                 iommu->domains[i] = NULL;
3530                                 spin_unlock_irqrestore(&iommu->lock, flags);
3531                                 break;
3532                         }
3533                 }
3534         }
3535 }
3536
3537 static void vm_domain_exit(struct dmar_domain *domain)
3538 {
3539         /* Domain 0 is reserved, so don't process it */
3540         if (!domain)
3541                 return;
3542
3543         vm_domain_remove_all_dev_info(domain);
3544         /* destroy iovas */
3545         put_iova_domain(&domain->iovad);
3546
3547         /* clear ptes */
3548         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3549
3550         /* free page tables */
3551         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3552
3553         iommu_free_vm_domain(domain);
3554         free_domain_mem(domain);
3555 }
3556
3557 static int intel_iommu_domain_init(struct iommu_domain *domain)
3558 {
3559         struct dmar_domain *dmar_domain;
3560
3561         dmar_domain = iommu_alloc_vm_domain();
3562         if (!dmar_domain) {
3563                 printk(KERN_ERR
3564                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3565                 return -ENOMEM;
3566         }
3567         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3568                 printk(KERN_ERR
3569                         "intel_iommu_domain_init() failed\n");
3570                 vm_domain_exit(dmar_domain);
3571                 return -ENOMEM;
3572         }
3573         domain->priv = dmar_domain;
3574
3575         return 0;
3576 }
3577
3578 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3579 {
3580         struct dmar_domain *dmar_domain = domain->priv;
3581
3582         domain->priv = NULL;
3583         vm_domain_exit(dmar_domain);
3584 }
3585
3586 static int intel_iommu_attach_device(struct iommu_domain *domain,
3587                                      struct device *dev)
3588 {
3589         struct dmar_domain *dmar_domain = domain->priv;
3590         struct pci_dev *pdev = to_pci_dev(dev);
3591         struct intel_iommu *iommu;
3592         int addr_width;
3593
3594         /* normally pdev is not mapped */
3595         if (unlikely(domain_context_mapped(pdev))) {
3596                 struct dmar_domain *old_domain;
3597
3598                 old_domain = find_domain(pdev);
3599                 if (old_domain) {
3600                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3601                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3602                                 domain_remove_one_dev_info(old_domain, pdev);
3603                         else
3604                                 domain_remove_dev_info(old_domain);
3605                 }
3606         }
3607
3608         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3609                                 pdev->devfn);
3610         if (!iommu)
3611                 return -ENODEV;
3612
3613         /* check if this iommu agaw is sufficient for max mapped address */
3614         addr_width = agaw_to_width(iommu->agaw);
3615         if (addr_width > cap_mgaw(iommu->cap))
3616                 addr_width = cap_mgaw(iommu->cap);
3617
3618         if (dmar_domain->max_addr > (1LL << addr_width)) {
3619                 printk(KERN_ERR "%s: iommu width (%d) is not "
3620                        "sufficient for the mapped address (%llx)\n",
3621                        __func__, addr_width, dmar_domain->max_addr);
3622                 return -EFAULT;
3623         }
3624         dmar_domain->gaw = addr_width;
3625
3626         /*
3627          * Knock out extra levels of page tables if necessary
3628          */
3629         while (iommu->agaw < dmar_domain->agaw) {
3630                 struct dma_pte *pte;
3631
3632                 pte = dmar_domain->pgd;
3633                 if (dma_pte_present(pte)) {
3634                         free_pgtable_page(dmar_domain->pgd);
3635                         dmar_domain->pgd = (struct dma_pte *)
3636                                 phys_to_virt(dma_pte_addr(pte));
3637                 }
3638                 dmar_domain->agaw--;
3639         }
3640
3641         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3642 }
3643
3644 static void intel_iommu_detach_device(struct iommu_domain *domain,
3645                                       struct device *dev)
3646 {
3647         struct dmar_domain *dmar_domain = domain->priv;
3648         struct pci_dev *pdev = to_pci_dev(dev);
3649
3650         domain_remove_one_dev_info(dmar_domain, pdev);
3651 }
3652
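/*
 * IOMMU-API map callback: translate IOMMU_* protection flags into DMA PTE
 * bits and install the mapping, growing max_addr if the new mapping ends
 * above it.
 */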
3653 static int intel_iommu_map(struct iommu_domain *domain,
3654                            unsigned long iova, phys_addr_t hpa,
3655                            int gfp_order, int iommu_prot)
3656 {
3657         struct dmar_domain *dmar_domain = domain->priv;
3658         u64 max_addr;
3659         int prot = 0;
3660         size_t size;
3661         int ret;
3662
3663         if (iommu_prot & IOMMU_READ)
3664                 prot |= DMA_PTE_READ;
3665         if (iommu_prot & IOMMU_WRITE)
3666                 prot |= DMA_PTE_WRITE;
3667         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3668                 prot |= DMA_PTE_SNP;
3669
3670         size     = PAGE_SIZE << gfp_order;
3671         max_addr = iova + size;
3672         if (dmar_domain->max_addr < max_addr) {
3673                 u64 end;
3674
3675                 /* check if minimum agaw is sufficient for mapped address */
3676                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3677                 if (end < max_addr) {
3678                         printk(KERN_ERR "%s: iommu width (%d) is not "
3679                                "sufficient for the mapped address (%llx)\n",
3680                                __func__, dmar_domain->gaw, max_addr);
3681                         return -EFAULT;
3682                 }
3683                 dmar_domain->max_addr = max_addr;
3684         }
3685         /* Round up size to next multiple of PAGE_SIZE, if it and
3686            the low bits of hpa would take us onto the next page */
3687         size = aligned_nrpages(hpa, size);
3688         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3689                                  hpa >> VTD_PAGE_SHIFT, size, prot);
3690         return ret;
3691 }
3692
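/*
 * IOMMU-API unmap callback: clear the PTEs covering the range and pull
 * max_addr back down if the unmapped range was at the top of the domain.
 */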
3693 static int intel_iommu_unmap(struct iommu_domain *domain,
3694                              unsigned long iova, int gfp_order)
3695 {
3696         struct dmar_domain *dmar_domain = domain->priv;
3697         size_t size = PAGE_SIZE << gfp_order;
3698
3699         dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3700                             (iova + size - 1) >> VTD_PAGE_SHIFT);
3701
3702         if (dmar_domain->max_addr == iova + size)
3703                 dmar_domain->max_addr = iova;
3704
3705         return gfp_order;
3706 }
3707
3708 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3709                                             unsigned long iova)
3710 {
3711         struct dmar_domain *dmar_domain = domain->priv;
3712         struct dma_pte *pte;
3713         u64 phys = 0;
3714
3715         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3716         if (pte)
3717                 phys = dma_pte_addr(pte);
3718
3719         return phys;
3720 }
3721
3722 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3723                                       unsigned long cap)
3724 {
3725         struct dmar_domain *dmar_domain = domain->priv;
3726
3727         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3728                 return dmar_domain->iommu_snooping;
3729         if (cap == IOMMU_CAP_INTR_REMAP)
3730                 return intr_remapping_enabled;
3731
3732         return 0;
3733 }
3734
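/*
 * Callbacks registered with the generic IOMMU layer via register_iommu().
 * Users such as KVM device assignment reach them through the generic API,
 * e.g. iommu_domain_alloc() -> iommu_attach_device() -> iommu_map().
 */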
3735 static struct iommu_ops intel_iommu_ops = {
3736         .domain_init    = intel_iommu_domain_init,
3737         .domain_destroy = intel_iommu_domain_destroy,
3738         .attach_dev     = intel_iommu_attach_device,
3739         .detach_dev     = intel_iommu_detach_device,
3740         .map            = intel_iommu_map,
3741         .unmap          = intel_iommu_unmap,
3742         .iova_to_phys   = intel_iommu_iova_to_phys,
3743         .domain_has_cap = intel_iommu_domain_has_cap,
3744 };
3745
3746 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3747 {
3748         /*
3749          * Mobile 4 Series Chipset neglects to set RWBF capability,
3750          * but needs it:
3751          */
3752         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3753         rwbf_quirk = 1;
3754
3755         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
3756         if (dev->revision == 0x07) {
3757                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
3758                 dmar_map_gfx = 0;
3759         }
3760 }
3761
3762 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3763
3764 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3765    ISOCH DMAR unit for the Azalia sound device, but not give it any
3766    TLB entries, which causes it to deadlock. Check for that.  We do
3767    this in a function called from init_dmars(), instead of in a PCI
3768    quirk, because we don't want to print the obnoxious "BIOS broken"
3769    message if VT-d is actually disabled.
3770 */
3771 static void __init check_tylersburg_isoch(void)
3772 {
3773         struct pci_dev *pdev;
3774         uint32_t vtisochctrl;
3775
3776         /* If there's no Azalia in the system anyway, forget it. */
3777         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3778         if (!pdev)
3779                 return;
3780         pci_dev_put(pdev);
3781
3782         /* System Management Registers. Might be hidden, in which case
3783            we can't do the sanity check. But that's OK, because the
3784            known-broken BIOSes _don't_ actually hide it, so far. */
3785         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3786         if (!pdev)
3787                 return;
3788
3789         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3790                 pci_dev_put(pdev);
3791                 return;
3792         }
3793
3794         pci_dev_put(pdev);
3795
3796         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3797         if (vtisochctrl & 1)
3798                 return;
3799
3800         /* Drop all bits other than the number of TLB entries */
3801         vtisochctrl &= 0x1c;
3802
3803         /* If we have the recommended number of TLB entries (16), fine. */
3804         if (vtisochctrl == 0x10)
3805                 return;
3806
3807         /* Zero TLB entries? You get to ride the short bus to school. */
3808         if (!vtisochctrl) {
3809                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3810                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3811                      dmi_get_system_info(DMI_BIOS_VENDOR),
3812                      dmi_get_system_info(DMI_BIOS_VERSION),
3813                      dmi_get_system_info(DMI_PRODUCT_VERSION));
3814                 iommu_identity_mapping |= IDENTMAP_AZALIA;
3815                 return;
3816         }
3817
3818         printk(KERN_WARNING "DMAR: Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
3819                vtisochctrl);
3820 }