iommu: Export intel_iommu_enabled to signal when iommu is in use
[linux-2.6.git] / drivers / iommu / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
46
47 #define ROOT_SIZE               VTD_PAGE_SIZE
48 #define CONTEXT_SIZE            VTD_PAGE_SIZE
49
/*
 * PCI device classification helpers.  'pdev' is dereferenced for its
 * ->class, ->vendor and ->device members.  The argument is parenthesized in
 * every expansion so that any pointer-valued expression can be passed.
 */
#define IS_BRIDGE_HOST_DEVICE(pdev) \
                            (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_HOST)
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63
/*
 * Largest page frame number / byte address representable with a 'gaw'-bit
 * address width.  'gaw' is parenthesized so that compound expressions expand
 * with the intended precedence.
 */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
72
73 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
74 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
75 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
76
/* page table handling */

/* Number of PFN bits consumed by each page-table level. */
#define LEVEL_STRIDE            (9)
/* Mask selecting one level's index out of a shifted PFN. */
#define LEVEL_MASK              ((1ULL << LEVEL_STRIDE) - 1)

/* Convert an adjusted guest address width (agaw) index to a level count. */
static inline int agaw_to_level(int agaw)
{
        return agaw + 2;
}

/* Convert an agaw index to an address width in bits (agaw 0 == 30 bits). */
static inline int agaw_to_width(int agaw)
{
        return 30 + agaw * LEVEL_STRIDE;
}

/* Inverse of agaw_to_width(); rounds down to a whole number of levels. */
static inline int width_to_agaw(int width)
{
        return (width - 30) / LEVEL_STRIDE;
}

/* Bit position, within a PFN, at which the index for 'level' starts. */
static inline unsigned int level_to_offset_bits(int level)
{
        return (level - 1) * LEVEL_STRIDE;
}

/* Index into the page table at 'level' that covers 'pfn'. */
static inline int pfn_level_offset(unsigned long pfn, int level)
{
        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

/* Mask that clears the PFN bits below 'level'. */
static inline unsigned long level_mask(int level)
{
        return -1UL << level_to_offset_bits(level);
}

/* Number of PFNs covered by a single entry at 'level'. */
static inline unsigned long level_size(int level)
{
        return 1UL << level_to_offset_bits(level);
}

/* Round 'pfn' up to the next boundary of an entry at 'level'. */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
        return (pfn + level_size(level) - 1) & level_mask(level);
}

/*
 * Number of PFNs covered by one entry at level 'lvl'.
 *
 * The shift is done on an unsigned long so that it is evaluated in the
 * return type rather than in int, where it would overflow for lvl >= 5.
 */
static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
        return 1UL << ((lvl - 1) * LEVEL_STRIDE);
}
125
126 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
127    are never going to work. */
128 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
129 {
130         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
131 }
132
133 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
134 {
135         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
136 }
/* VT-d page frame number of the memory described by 'pg'. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
        unsigned long mm_pfn = page_to_pfn(pg);

        return mm_to_dma_pfn(mm_pfn);
}
/* VT-d page frame number backing the kernel virtual address 'p'. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
        struct page *pg = virt_to_page(p);

        return page_to_dma_pfn(pg);
}
145
146 /* global iommu list, set NULL for ignored DMAR units */
147 static struct intel_iommu **g_iommus;
148
149 static void __init check_tylersburg_isoch(void);
150 static int rwbf_quirk;
151
152 /*
153  * set to 1 to panic kernel if can't successfully enable VT-d
154  * (used when kernel is launched w/ TXT)
155  */
156 static int force_on = 0;
157
158 /*
159  * 0: Present
160  * 1-11: Reserved
161  * 12-63: Context Ptr (12 - (haw-1))
162  * 64-127: Reserved
163  */
164 struct root_entry {
165         u64     val;
166         u64     rsvd1;
167 };
168 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
169 static inline bool root_present(struct root_entry *root)
170 {
171         return (root->val & 1);
172 }
173 static inline void set_root_present(struct root_entry *root)
174 {
175         root->val |= 1;
176 }
177 static inline void set_root_value(struct root_entry *root, unsigned long value)
178 {
179         root->val |= value & VTD_PAGE_MASK;
180 }
181
182 static inline struct context_entry *
183 get_context_addr_from_root(struct root_entry *root)
184 {
185         return (struct context_entry *)
186                 (root_present(root)?phys_to_virt(
187                 root->val & VTD_PAGE_MASK) :
188                 NULL);
189 }
190
191 /*
192  * low 64 bits:
193  * 0: present
194  * 1: fault processing disable
195  * 2-3: translation type
196  * 12-63: address space root
197  * high 64 bits:
198  * 0-2: address width
199  * 3-6: aval
200  * 8-23: domain id
201  */
202 struct context_entry {
203         u64 lo;
204         u64 hi;
205 };
206
207 static inline bool context_present(struct context_entry *context)
208 {
209         return (context->lo & 1);
210 }
211 static inline void context_set_present(struct context_entry *context)
212 {
213         context->lo |= 1;
214 }
215
216 static inline void context_set_fault_enable(struct context_entry *context)
217 {
218         context->lo &= (((u64)-1) << 2) | 1;
219 }
220
221 static inline void context_set_translation_type(struct context_entry *context,
222                                                 unsigned long value)
223 {
224         context->lo &= (((u64)-1) << 4) | 3;
225         context->lo |= (value & 3) << 2;
226 }
227
228 static inline void context_set_address_root(struct context_entry *context,
229                                             unsigned long value)
230 {
231         context->lo |= value & VTD_PAGE_MASK;
232 }
233
234 static inline void context_set_address_width(struct context_entry *context,
235                                              unsigned long value)
236 {
237         context->hi |= value & 7;
238 }
239
240 static inline void context_set_domain_id(struct context_entry *context,
241                                          unsigned long value)
242 {
243         context->hi |= (value & ((1 << 16) - 1)) << 8;
244 }
245
246 static inline void context_clear_entry(struct context_entry *context)
247 {
248         context->lo = 0;
249         context->hi = 0;
250 }
251
252 /*
253  * 0: readable
254  * 1: writable
255  * 2-6: reserved
256  * 7: super page
257  * 8-10: available
258  * 11: snoop behavior
259  * 12-63: Host physcial address
260  */
261 struct dma_pte {
262         u64 val;
263 };
264
265 static inline void dma_clear_pte(struct dma_pte *pte)
266 {
267         pte->val = 0;
268 }
269
270 static inline void dma_set_pte_readable(struct dma_pte *pte)
271 {
272         pte->val |= DMA_PTE_READ;
273 }
274
275 static inline void dma_set_pte_writable(struct dma_pte *pte)
276 {
277         pte->val |= DMA_PTE_WRITE;
278 }
279
280 static inline void dma_set_pte_snp(struct dma_pte *pte)
281 {
282         pte->val |= DMA_PTE_SNP;
283 }
284
285 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
286 {
287         pte->val = (pte->val & ~3) | (prot & 3);
288 }
289
290 static inline u64 dma_pte_addr(struct dma_pte *pte)
291 {
292 #ifdef CONFIG_64BIT
293         return pte->val & VTD_PAGE_MASK;
294 #else
295         /* Must have a full atomic 64-bit read */
296         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
297 #endif
298 }
299
300 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
301 {
302         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
303 }
304
305 static inline bool dma_pte_present(struct dma_pte *pte)
306 {
307         return (pte->val & 3) != 0;
308 }
309
310 static inline bool dma_pte_superpage(struct dma_pte *pte)
311 {
312         return (pte->val & (1 << 7));
313 }
314
315 static inline int first_pte_in_page(struct dma_pte *pte)
316 {
317         return !((unsigned long)pte & ~VTD_PAGE_MASK);
318 }
319
/*
 * This domain is a static identity-mapping domain.
 *      1. This domain creates a static 1:1 mapping to all usable memory.
 *      2. It maps to each iommu if successful.
 *      3. Each iommu maps to this domain if successful.
 */
326 static struct dmar_domain *si_domain;
327 static int hw_pass_through = 1;
328
329 /* devices under the same p2p bridge are owned in one domain */
330 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
331
/* domain represents a virtual machine; more than one device
 * across iommus may be owned by one domain, e.g. a kvm guest.
 */
335 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
336
/* si_domain contains multiple devices */
338 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
339
/*
 * Per-domain state: identifiers, the set of iommus the domain uses, its
 * device list and iova allocator, the root of its page table, and the
 * capability values cached for it (coherency, snooping, superpage level).
 */
struct dmar_domain {
        int     id;                     /* domain id */
        int     nid;                    /* node id */
        unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/

        struct list_head devices;       /* all devices' list */
        struct iova_domain iovad;       /* iova's that belong to this domain */

        struct dma_pte  *pgd;           /* virtual address */
        int             gaw;            /* max guest address width */

        /* adjusted guest address width, 0 is level 2 30-bit */
        int             agaw;

        int             flags;          /* flags to find out type of domain */

        int             iommu_coherency;/* indicate coherency of iommu access */
        int             iommu_snooping; /* indicate snooping control feature*/
        int             iommu_count;    /* reference count of iommu */
        int             iommu_superpage;/* Level of superpages supported:
                                           0 == 4KiB (no superpages), 1 == 2MiB,
                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
        spinlock_t      iommu_lock;     /* protect iommu set in domain */
        u64             max_addr;       /* maximum mapped address */
};
365
/*
 * PCI domain-device relationship.
 *
 * One record ties a device (segment/bus/devfn) to the iommu it sits behind
 * and to the dmar_domain it belongs to.  Each record has two list links:
 * one for the domain's siblings and one for a global list.
 */
struct device_domain_info {
        struct list_head link;  /* link to domain siblings */
        struct list_head global; /* link to global list */
        int segment;            /* PCI domain */
        u8 bus;                 /* PCI bus number */
        u8 devfn;               /* PCI devfn number */
        struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
        struct intel_iommu *iommu; /* IOMMU used by this device */
        struct dmar_domain *domain; /* pointer to domain */
};
377
378 static void flush_unmaps_timeout(unsigned long data);
379
380 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
381
/* Capacity of each deferred_flush_tables array. */
#define HIGH_WATER_MARK 250
/*
 * Fixed-size table of (iova, domain) pointer pairs plus a 'next' index.
 * NOTE(review): presumably 'next' is the first free slot and the pairs are
 * unmaps awaiting a batched flush -- confirm against the users of
 * deferred_flush, which are outside this view.
 */
struct deferred_flush_tables {
        int next;
        struct iova *iova[HIGH_WATER_MARK];
        struct dmar_domain *domain[HIGH_WATER_MARK];
};
388
389 static struct deferred_flush_tables *deferred_flush;
390
/* number of intel_iommus; used as the bit count when scanning iommu bitmaps */
392 static int g_num_of_iommus;
393
394 static DEFINE_SPINLOCK(async_umap_flush_lock);
395 static LIST_HEAD(unmaps_to_do);
396
397 static int timer_on;
398 static long list_size;
399
400 static void domain_remove_dev_info(struct dmar_domain *domain);
401
402 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
403 int dmar_disabled = 0;
404 #else
405 int dmar_disabled = 1;
406 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
407
408 int intel_iommu_enabled = 0;
409 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
410
411 static int dmar_map_gfx = 1;
412 static int dmar_forcedac;
413 static int intel_iommu_strict;
414 static int intel_iommu_superpage = 1;
415
416 int intel_iommu_gfx_mapped;
417 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
418
419 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
420 static DEFINE_SPINLOCK(device_domain_lock);
421 static LIST_HEAD(device_domain_list);
422
423 static struct iommu_ops intel_iommu_ops;
424
425 static int __init intel_iommu_setup(char *str)
426 {
427         if (!str)
428                 return -EINVAL;
429         while (*str) {
430                 if (!strncmp(str, "on", 2)) {
431                         dmar_disabled = 0;
432                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
433                 } else if (!strncmp(str, "off", 3)) {
434                         dmar_disabled = 1;
435                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
436                 } else if (!strncmp(str, "igfx_off", 8)) {
437                         dmar_map_gfx = 0;
438                         printk(KERN_INFO
439                                 "Intel-IOMMU: disable GFX device mapping\n");
440                 } else if (!strncmp(str, "forcedac", 8)) {
441                         printk(KERN_INFO
442                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
443                         dmar_forcedac = 1;
444                 } else if (!strncmp(str, "strict", 6)) {
445                         printk(KERN_INFO
446                                 "Intel-IOMMU: disable batched IOTLB flush\n");
447                         intel_iommu_strict = 1;
448                 } else if (!strncmp(str, "sp_off", 6)) {
449                         printk(KERN_INFO
450                                 "Intel-IOMMU: disable supported super page\n");
451                         intel_iommu_superpage = 0;
452                 }
453
454                 str += strcspn(str, ",");
455                 while (*str == ',')
456                         str++;
457         }
458         return 0;
459 }
460 __setup("intel_iommu=", intel_iommu_setup);
461
462 static struct kmem_cache *iommu_domain_cache;
463 static struct kmem_cache *iommu_devinfo_cache;
464 static struct kmem_cache *iommu_iova_cache;
465
466 static inline void *alloc_pgtable_page(int node)
467 {
468         struct page *page;
469         void *vaddr = NULL;
470
471         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
472         if (page)
473                 vaddr = page_address(page);
474         return vaddr;
475 }
476
/* Release a page by its virtual address via free_page(). */
static inline void free_pgtable_page(void *vaddr)
{
        unsigned long addr = (unsigned long)vaddr;

        free_page(addr);
}
481
482 static inline void *alloc_domain_mem(void)
483 {
484         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
485 }
486
487 static void free_domain_mem(void *vaddr)
488 {
489         kmem_cache_free(iommu_domain_cache, vaddr);
490 }
491
492 static inline void * alloc_devinfo_mem(void)
493 {
494         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
495 }
496
497 static inline void free_devinfo_mem(void *vaddr)
498 {
499         kmem_cache_free(iommu_devinfo_cache, vaddr);
500 }
501
502 struct iova *alloc_iova_mem(void)
503 {
504         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
505 }
506
507 void free_iova_mem(struct iova *iova)
508 {
509         kmem_cache_free(iommu_iova_cache, iova);
510 }
511
512
513 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
514 {
515         unsigned long sagaw;
516         int agaw = -1;
517
518         sagaw = cap_sagaw(iommu->cap);
519         for (agaw = width_to_agaw(max_gaw);
520              agaw >= 0; agaw--) {
521                 if (test_bit(agaw, &sagaw))
522                         break;
523         }
524
525         return agaw;
526 }
527
528 /*
529  * Calculate max SAGAW for each iommu.
530  */
531 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
532 {
533         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
534 }
535
536 /*
537  * calculate agaw for each iommu.
538  * "SAGAW" may be different across iommus, use a default agaw, and
539  * get a supported less agaw for iommus that don't support the default agaw.
540  */
541 int iommu_calculate_agaw(struct intel_iommu *iommu)
542 {
543         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
544 }
545
546 /* This functionin only returns single iommu in a domain */
547 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
548 {
549         int iommu_id;
550
551         /* si_domain and vm domain should not get here. */
552         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
553         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
554
555         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
556         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
557                 return NULL;
558
559         return g_iommus[iommu_id];
560 }
561
562 static void domain_update_iommu_coherency(struct dmar_domain *domain)
563 {
564         int i;
565
566         domain->iommu_coherency = 1;
567
568         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
569                 if (!ecap_coherent(g_iommus[i]->ecap)) {
570                         domain->iommu_coherency = 0;
571                         break;
572                 }
573         }
574 }
575
576 static void domain_update_iommu_snooping(struct dmar_domain *domain)
577 {
578         int i;
579
580         domain->iommu_snooping = 1;
581
582         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
583                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
584                         domain->iommu_snooping = 0;
585                         break;
586                 }
587         }
588 }
589
590 static void domain_update_iommu_superpage(struct dmar_domain *domain)
591 {
592         struct dmar_drhd_unit *drhd;
593         struct intel_iommu *iommu = NULL;
594         int mask = 0xf;
595
596         if (!intel_iommu_superpage) {
597                 domain->iommu_superpage = 0;
598                 return;
599         }
600
601         /* set iommu_superpage to the smallest common denominator */
602         for_each_active_iommu(iommu, drhd) {
603                 mask &= cap_super_page_val(iommu->cap);
604                 if (!mask) {
605                         break;
606                 }
607         }
608         domain->iommu_superpage = fls(mask);
609 }
610
/*
 * Some capabilities may be different across iommus, so refresh every cached
 * per-domain capability value: coherency, snooping and superpage level.
 */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
        struct dmar_domain *dom = domain;

        domain_update_iommu_coherency(dom);
        domain_update_iommu_snooping(dom);
        domain_update_iommu_superpage(dom);
}
618
619 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
620 {
621         struct dmar_drhd_unit *drhd = NULL;
622         int i;
623
624         for_each_drhd_unit(drhd) {
625                 if (drhd->ignored)
626                         continue;
627                 if (segment != drhd->segment)
628                         continue;
629
630                 for (i = 0; i < drhd->devices_cnt; i++) {
631                         if (drhd->devices[i] &&
632                             drhd->devices[i]->bus->number == bus &&
633                             drhd->devices[i]->devfn == devfn)
634                                 return drhd->iommu;
635                         if (drhd->devices[i] &&
636                             drhd->devices[i]->subordinate &&
637                             drhd->devices[i]->subordinate->number <= bus &&
638                             drhd->devices[i]->subordinate->subordinate >= bus)
639                                 return drhd->iommu;
640                 }
641
642                 if (drhd->include_all)
643                         return drhd->iommu;
644         }
645
646         return NULL;
647 }
648
649 static void domain_flush_cache(struct dmar_domain *domain,
650                                void *addr, int size)
651 {
652         if (!domain->iommu_coherency)
653                 clflush_cache_range(addr, size);
654 }
655
656 /* Gets context entry for a given bus and devfn */
657 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
658                 u8 bus, u8 devfn)
659 {
660         struct root_entry *root;
661         struct context_entry *context;
662         unsigned long phy_addr;
663         unsigned long flags;
664
665         spin_lock_irqsave(&iommu->lock, flags);
666         root = &iommu->root_entry[bus];
667         context = get_context_addr_from_root(root);
668         if (!context) {
669                 context = (struct context_entry *)
670                                 alloc_pgtable_page(iommu->node);
671                 if (!context) {
672                         spin_unlock_irqrestore(&iommu->lock, flags);
673                         return NULL;
674                 }
675                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
676                 phy_addr = virt_to_phys((void *)context);
677                 set_root_value(root, phy_addr);
678                 set_root_present(root);
679                 __iommu_flush_cache(iommu, root, sizeof(*root));
680         }
681         spin_unlock_irqrestore(&iommu->lock, flags);
682         return &context[devfn];
683 }
684
685 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
686 {
687         struct root_entry *root;
688         struct context_entry *context;
689         int ret;
690         unsigned long flags;
691
692         spin_lock_irqsave(&iommu->lock, flags);
693         root = &iommu->root_entry[bus];
694         context = get_context_addr_from_root(root);
695         if (!context) {
696                 ret = 0;
697                 goto out;
698         }
699         ret = context_present(&context[devfn]);
700 out:
701         spin_unlock_irqrestore(&iommu->lock, flags);
702         return ret;
703 }
704
705 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
706 {
707         struct root_entry *root;
708         struct context_entry *context;
709         unsigned long flags;
710
711         spin_lock_irqsave(&iommu->lock, flags);
712         root = &iommu->root_entry[bus];
713         context = get_context_addr_from_root(root);
714         if (context) {
715                 context_clear_entry(&context[devfn]);
716                 __iommu_flush_cache(iommu, &context[devfn], \
717                         sizeof(*context));
718         }
719         spin_unlock_irqrestore(&iommu->lock, flags);
720 }
721
722 static void free_context_table(struct intel_iommu *iommu)
723 {
724         struct root_entry *root;
725         int i;
726         unsigned long flags;
727         struct context_entry *context;
728
729         spin_lock_irqsave(&iommu->lock, flags);
730         if (!iommu->root_entry) {
731                 goto out;
732         }
733         for (i = 0; i < ROOT_ENTRY_NR; i++) {
734                 root = &iommu->root_entry[i];
735                 context = get_context_addr_from_root(root);
736                 if (context)
737                         free_pgtable_page(context);
738         }
739         free_pgtable_page(iommu->root_entry);
740         iommu->root_entry = NULL;
741 out:
742         spin_unlock_irqrestore(&iommu->lock, flags);
743 }
744
745 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
746                                       unsigned long pfn, int target_level)
747 {
748         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
749         struct dma_pte *parent, *pte = NULL;
750         int level = agaw_to_level(domain->agaw);
751         int offset;
752
753         BUG_ON(!domain->pgd);
754         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
755         parent = domain->pgd;
756
757         while (level > 0) {
758                 void *tmp_page;
759
760                 offset = pfn_level_offset(pfn, level);
761                 pte = &parent[offset];
762                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
763                         break;
764                 if (level == target_level)
765                         break;
766
767                 if (!dma_pte_present(pte)) {
768                         uint64_t pteval;
769
770                         tmp_page = alloc_pgtable_page(domain->nid);
771
772                         if (!tmp_page)
773                                 return NULL;
774
775                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
776                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
777                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
778                                 /* Someone else set it while we were thinking; use theirs. */
779                                 free_pgtable_page(tmp_page);
780                         } else {
781                                 dma_pte_addr(pte);
782                                 domain_flush_cache(domain, pte, sizeof(*pte));
783                         }
784                 }
785                 parent = phys_to_virt(dma_pte_addr(pte));
786                 level--;
787         }
788
789         return pte;
790 }
791
792
793 /* return address's pte at specific level */
794 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
795                                          unsigned long pfn,
796                                          int level, int *large_page)
797 {
798         struct dma_pte *parent, *pte = NULL;
799         int total = agaw_to_level(domain->agaw);
800         int offset;
801
802         parent = domain->pgd;
803         while (level <= total) {
804                 offset = pfn_level_offset(pfn, total);
805                 pte = &parent[offset];
806                 if (level == total)
807                         return pte;
808
809                 if (!dma_pte_present(pte)) {
810                         *large_page = total;
811                         break;
812                 }
813
814                 if (pte->val & DMA_PTE_LARGE_PAGE) {
815                         *large_page = total;
816                         return pte;
817                 }
818
819                 parent = phys_to_virt(dma_pte_addr(pte));
820                 total--;
821         }
822         return NULL;
823 }
824
825 /* clear last level pte, a tlb flush should be followed */
826 static int dma_pte_clear_range(struct dmar_domain *domain,
827                                 unsigned long start_pfn,
828                                 unsigned long last_pfn)
829 {
830         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
831         unsigned int large_page = 1;
832         struct dma_pte *first_pte, *pte;
833         int order;
834
835         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
836         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
837         BUG_ON(start_pfn > last_pfn);
838
839         /* we don't need lock here; nobody else touches the iova range */
840         do {
841                 large_page = 1;
842                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
843                 if (!pte) {
844                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
845                         continue;
846                 }
847                 do {
848                         dma_clear_pte(pte);
849                         start_pfn += lvl_to_nr_pages(large_page);
850                         pte++;
851                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
852
853                 domain_flush_cache(domain, first_pte,
854                                    (void *)pte - (void *)first_pte);
855
856         } while (start_pfn && start_pfn <= last_pfn);
857
858         order = (large_page - 1) * 9;
859         return order;
860 }
861
/*
 * free page table pages. last level pte should already be cleared
 *
 * Works upwards from level 2 to the domain's top level.  At each level,
 * every entry whose whole span lies inside [start_pfn, last_pfn] has the
 * page it points at freed and is then cleared.  The top-level table (pgd)
 * is freed only when the range covers the domain's entire PFN space.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
                                   unsigned long start_pfn,
                                   unsigned long last_pfn)
{
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
        struct dma_pte *first_pte, *pte;
        int total = agaw_to_level(domain->agaw);
        int level;
        unsigned long tmp;
        int large_page = 2;

        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
        BUG_ON(start_pfn > last_pfn);

        /* We don't need lock here; nobody else touches the iova range */
        level = 2;
        while (level <= total) {
                /* First PFN in the range that is aligned to an entry at this level */
                tmp = align_to_level(start_pfn, level);

                /* If we can't even clear one PTE at this level, we're done */
                if (tmp + level_size(level) - 1 > last_pfn)
                        return;

                do {
                        large_page = level;
                        first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
                        /* The lookup stopped above 'level': continue from one level above that */
                        if (large_page > level)
                                level = large_page + 1;
                        if (!pte) {
                                /* No entry here: skip to the next boundary one level up */
                                tmp = align_to_level(tmp + 1, level + 1);
                                continue;
                        }
                        do {
                                if (dma_pte_present(pte)) {
                                        free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
                                        dma_clear_pte(pte);
                                }
                                pte++;
                                tmp += level_size(level);
                        } while (!first_pte_in_page(pte) &&
                                 tmp + level_size(level) - 1 <= last_pfn);

                        /* Flush the run of entries just modified */
                        domain_flush_cache(domain, first_pte,
                                           (void *)pte - (void *)first_pte);
                        
                } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
                level++;
        }
        /* free pgd */
        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
                free_pgtable_page(domain->pgd);
                domain->pgd = NULL;
        }
}
918
919 /* iommu handling */
920 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
921 {
922         struct root_entry *root;
923         unsigned long flags;
924
925         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
926         if (!root)
927                 return -ENOMEM;
928
929         __iommu_flush_cache(iommu, root, ROOT_SIZE);
930
931         spin_lock_irqsave(&iommu->lock, flags);
932         iommu->root_entry = root;
933         spin_unlock_irqrestore(&iommu->lock, flags);
934
935         return 0;
936 }
937
938 static void iommu_set_root_entry(struct intel_iommu *iommu)
939 {
940         void *addr;
941         u32 sts;
942         unsigned long flag;
943
944         addr = iommu->root_entry;
945
946         raw_spin_lock_irqsave(&iommu->register_lock, flag);
947         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
948
949         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
950
951         /* Make sure hardware complete it */
952         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
953                       readl, (sts & DMA_GSTS_RTPS), sts);
954
955         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
956 }
957
958 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
959 {
960         u32 val;
961         unsigned long flag;
962
963         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
964                 return;
965
966         raw_spin_lock_irqsave(&iommu->register_lock, flag);
967         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
968
969         /* Make sure hardware complete it */
970         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
971                       readl, (!(val & DMA_GSTS_WBFS)), val);
972
973         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
974 }
975
976 /* return value determine if we need a write buffer flush */
977 static void __iommu_flush_context(struct intel_iommu *iommu,
978                                   u16 did, u16 source_id, u8 function_mask,
979                                   u64 type)
980 {
981         u64 val = 0;
982         unsigned long flag;
983
984         switch (type) {
985         case DMA_CCMD_GLOBAL_INVL:
986                 val = DMA_CCMD_GLOBAL_INVL;
987                 break;
988         case DMA_CCMD_DOMAIN_INVL:
989                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
990                 break;
991         case DMA_CCMD_DEVICE_INVL:
992                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
993                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
994                 break;
995         default:
996                 BUG();
997         }
998         val |= DMA_CCMD_ICC;
999
1000         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1001         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1002
1003         /* Make sure hardware complete it */
1004         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1005                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1006
1007         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1008 }
1009
1010 /* return value determine if we need a write buffer flush */
1011 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1012                                 u64 addr, unsigned int size_order, u64 type)
1013 {
1014         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1015         u64 val = 0, val_iva = 0;
1016         unsigned long flag;
1017
1018         switch (type) {
1019         case DMA_TLB_GLOBAL_FLUSH:
1020                 /* global flush doesn't need set IVA_REG */
1021                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1022                 break;
1023         case DMA_TLB_DSI_FLUSH:
1024                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1025                 break;
1026         case DMA_TLB_PSI_FLUSH:
1027                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1028                 /* Note: always flush non-leaf currently */
1029                 val_iva = size_order | addr;
1030                 break;
1031         default:
1032                 BUG();
1033         }
1034         /* Note: set drain read/write */
1035 #if 0
1036         /*
1037          * This is probably to be super secure.. Looks like we can
1038          * ignore it without any impact.
1039          */
1040         if (cap_read_drain(iommu->cap))
1041                 val |= DMA_TLB_READ_DRAIN;
1042 #endif
1043         if (cap_write_drain(iommu->cap))
1044                 val |= DMA_TLB_WRITE_DRAIN;
1045
1046         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1047         /* Note: Only uses first TLB reg currently */
1048         if (val_iva)
1049                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1050         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1051
1052         /* Make sure hardware complete it */
1053         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1054                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1055
1056         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1057
1058         /* check IOTLB invalidation granularity */
1059         if (DMA_TLB_IAIG(val) == 0)
1060                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1061         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1062                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1063                         (unsigned long long)DMA_TLB_IIRG(type),
1064                         (unsigned long long)DMA_TLB_IAIG(val));
1065 }
1066
1067 static struct device_domain_info *iommu_support_dev_iotlb(
1068         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1069 {
1070         int found = 0;
1071         unsigned long flags;
1072         struct device_domain_info *info;
1073         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1074
1075         if (!ecap_dev_iotlb_support(iommu->ecap))
1076                 return NULL;
1077
1078         if (!iommu->qi)
1079                 return NULL;
1080
1081         spin_lock_irqsave(&device_domain_lock, flags);
1082         list_for_each_entry(info, &domain->devices, link)
1083                 if (info->bus == bus && info->devfn == devfn) {
1084                         found = 1;
1085                         break;
1086                 }
1087         spin_unlock_irqrestore(&device_domain_lock, flags);
1088
1089         if (!found || !info->dev)
1090                 return NULL;
1091
1092         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1093                 return NULL;
1094
1095         if (!dmar_find_matched_atsr_unit(info->dev))
1096                 return NULL;
1097
1098         info->iommu = iommu;
1099
1100         return info;
1101 }
1102
1103 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1104 {
1105         if (!info)
1106                 return;
1107
1108         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1109 }
1110
1111 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1112 {
1113         if (!info->dev || !pci_ats_enabled(info->dev))
1114                 return;
1115
1116         pci_disable_ats(info->dev);
1117 }
1118
1119 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1120                                   u64 addr, unsigned mask)
1121 {
1122         u16 sid, qdep;
1123         unsigned long flags;
1124         struct device_domain_info *info;
1125
1126         spin_lock_irqsave(&device_domain_lock, flags);
1127         list_for_each_entry(info, &domain->devices, link) {
1128                 if (!info->dev || !pci_ats_enabled(info->dev))
1129                         continue;
1130
1131                 sid = info->bus << 8 | info->devfn;
1132                 qdep = pci_ats_queue_depth(info->dev);
1133                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1134         }
1135         spin_unlock_irqrestore(&device_domain_lock, flags);
1136 }
1137
1138 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1139                                   unsigned long pfn, unsigned int pages, int map)
1140 {
1141         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1142         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1143
1144         BUG_ON(pages == 0);
1145
1146         /*
1147          * Fallback to domain selective flush if no PSI support or the size is
1148          * too big.
1149          * PSI requires page size to be 2 ^ x, and the base address is naturally
1150          * aligned to the size
1151          */
1152         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1153                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1154                                                 DMA_TLB_DSI_FLUSH);
1155         else
1156                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1157                                                 DMA_TLB_PSI_FLUSH);
1158
1159         /*
1160          * In caching mode, changes of pages from non-present to present require
1161          * flush. However, device IOTLB doesn't need to be flushed in this case.
1162          */
1163         if (!cap_caching_mode(iommu->cap) || !map)
1164                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1165 }
1166
1167 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1168 {
1169         u32 pmen;
1170         unsigned long flags;
1171
1172         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1173         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1174         pmen &= ~DMA_PMEN_EPM;
1175         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1176
1177         /* wait for the protected region status bit to clear */
1178         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1179                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1180
1181         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1182 }
1183
1184 static int iommu_enable_translation(struct intel_iommu *iommu)
1185 {
1186         u32 sts;
1187         unsigned long flags;
1188
1189         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1190         iommu->gcmd |= DMA_GCMD_TE;
1191         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1192
1193         /* Make sure hardware complete it */
1194         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1195                       readl, (sts & DMA_GSTS_TES), sts);
1196
1197         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1198         return 0;
1199 }
1200
1201 static int iommu_disable_translation(struct intel_iommu *iommu)
1202 {
1203         u32 sts;
1204         unsigned long flag;
1205
1206         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1207         iommu->gcmd &= ~DMA_GCMD_TE;
1208         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1209
1210         /* Make sure hardware complete it */
1211         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1212                       readl, (!(sts & DMA_GSTS_TES)), sts);
1213
1214         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1215         return 0;
1216 }
1217
1218
1219 static int iommu_init_domains(struct intel_iommu *iommu)
1220 {
1221         unsigned long ndomains;
1222         unsigned long nlongs;
1223
1224         ndomains = cap_ndoms(iommu->cap);
1225         pr_debug("IOMMU %d: Number of Domains supportd <%ld>\n", iommu->seq_id,
1226                         ndomains);
1227         nlongs = BITS_TO_LONGS(ndomains);
1228
1229         spin_lock_init(&iommu->lock);
1230
1231         /* TBD: there might be 64K domains,
1232          * consider other allocation for future chip
1233          */
1234         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1235         if (!iommu->domain_ids) {
1236                 printk(KERN_ERR "Allocating domain id array failed\n");
1237                 return -ENOMEM;
1238         }
1239         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1240                         GFP_KERNEL);
1241         if (!iommu->domains) {
1242                 printk(KERN_ERR "Allocating domain array failed\n");
1243                 return -ENOMEM;
1244         }
1245
1246         /*
1247          * if Caching mode is set, then invalid translations are tagged
1248          * with domainid 0. Hence we need to pre-allocate it.
1249          */
1250         if (cap_caching_mode(iommu->cap))
1251                 set_bit(0, iommu->domain_ids);
1252         return 0;
1253 }
1254
1255
1256 static void domain_exit(struct dmar_domain *domain);
1257 static void vm_domain_exit(struct dmar_domain *domain);
1258
1259 void free_dmar_iommu(struct intel_iommu *iommu)
1260 {
1261         struct dmar_domain *domain;
1262         int i;
1263         unsigned long flags;
1264
1265         if ((iommu->domains) && (iommu->domain_ids)) {
1266                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1267                         domain = iommu->domains[i];
1268                         clear_bit(i, iommu->domain_ids);
1269
1270                         spin_lock_irqsave(&domain->iommu_lock, flags);
1271                         if (--domain->iommu_count == 0) {
1272                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1273                                         vm_domain_exit(domain);
1274                                 else
1275                                         domain_exit(domain);
1276                         }
1277                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1278                 }
1279         }
1280
1281         if (iommu->gcmd & DMA_GCMD_TE)
1282                 iommu_disable_translation(iommu);
1283
1284         if (iommu->irq) {
1285                 irq_set_handler_data(iommu->irq, NULL);
1286                 /* This will mask the irq */
1287                 free_irq(iommu->irq, iommu);
1288                 destroy_irq(iommu->irq);
1289         }
1290
1291         kfree(iommu->domains);
1292         kfree(iommu->domain_ids);
1293
1294         g_iommus[iommu->seq_id] = NULL;
1295
1296         /* if all iommus are freed, free g_iommus */
1297         for (i = 0; i < g_num_of_iommus; i++) {
1298                 if (g_iommus[i])
1299                         break;
1300         }
1301
1302         if (i == g_num_of_iommus)
1303                 kfree(g_iommus);
1304
1305         /* free context mapping */
1306         free_context_table(iommu);
1307 }
1308
1309 static struct dmar_domain *alloc_domain(void)
1310 {
1311         struct dmar_domain *domain;
1312
1313         domain = alloc_domain_mem();
1314         if (!domain)
1315                 return NULL;
1316
1317         domain->nid = -1;
1318         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1319         domain->flags = 0;
1320
1321         return domain;
1322 }
1323
1324 static int iommu_attach_domain(struct dmar_domain *domain,
1325                                struct intel_iommu *iommu)
1326 {
1327         int num;
1328         unsigned long ndomains;
1329         unsigned long flags;
1330
1331         ndomains = cap_ndoms(iommu->cap);
1332
1333         spin_lock_irqsave(&iommu->lock, flags);
1334
1335         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1336         if (num >= ndomains) {
1337                 spin_unlock_irqrestore(&iommu->lock, flags);
1338                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1339                 return -ENOMEM;
1340         }
1341
1342         domain->id = num;
1343         set_bit(num, iommu->domain_ids);
1344         set_bit(iommu->seq_id, &domain->iommu_bmp);
1345         iommu->domains[num] = domain;
1346         spin_unlock_irqrestore(&iommu->lock, flags);
1347
1348         return 0;
1349 }
1350
1351 static void iommu_detach_domain(struct dmar_domain *domain,
1352                                 struct intel_iommu *iommu)
1353 {
1354         unsigned long flags;
1355         int num, ndomains;
1356         int found = 0;
1357
1358         spin_lock_irqsave(&iommu->lock, flags);
1359         ndomains = cap_ndoms(iommu->cap);
1360         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1361                 if (iommu->domains[num] == domain) {
1362                         found = 1;
1363                         break;
1364                 }
1365         }
1366
1367         if (found) {
1368                 clear_bit(num, iommu->domain_ids);
1369                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1370                 iommu->domains[num] = NULL;
1371         }
1372         spin_unlock_irqrestore(&iommu->lock, flags);
1373 }
1374
1375 static struct iova_domain reserved_iova_list;
1376 static struct lock_class_key reserved_rbtree_key;
1377
1378 static int dmar_init_reserved_ranges(void)
1379 {
1380         struct pci_dev *pdev = NULL;
1381         struct iova *iova;
1382         int i;
1383
1384         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1385
1386         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1387                 &reserved_rbtree_key);
1388
1389         /* IOAPIC ranges shouldn't be accessed by DMA */
1390         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1391                 IOVA_PFN(IOAPIC_RANGE_END));
1392         if (!iova) {
1393                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1394                 return -ENODEV;
1395         }
1396
1397         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1398         for_each_pci_dev(pdev) {
1399                 struct resource *r;
1400
1401                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1402                         r = &pdev->resource[i];
1403                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1404                                 continue;
1405                         iova = reserve_iova(&reserved_iova_list,
1406                                             IOVA_PFN(r->start),
1407                                             IOVA_PFN(r->end));
1408                         if (!iova) {
1409                                 printk(KERN_ERR "Reserve iova failed\n");
1410                                 return -ENODEV;
1411                         }
1412                 }
1413         }
1414         return 0;
1415 }
1416
1417 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1418 {
1419         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1420 }
1421
/*
 * Round a guest address width up to the next value of the form
 * 12 + 9 * k, capped at 64. A width that already has that form is
 * returned unchanged.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
        int rem = (gaw - 12) % 9;
        int width = (rem == 0) ? gaw : gaw + 9 - rem;

        return (width > 64) ? 64 : width;
}
1435
1436 static int domain_init(struct dmar_domain *domain, int guest_width)
1437 {
1438         struct intel_iommu *iommu;
1439         int adjust_width, agaw;
1440         unsigned long sagaw;
1441
1442         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1443         spin_lock_init(&domain->iommu_lock);
1444
1445         domain_reserve_special_ranges(domain);
1446
1447         /* calculate AGAW */
1448         iommu = domain_get_iommu(domain);
1449         if (guest_width > cap_mgaw(iommu->cap))
1450                 guest_width = cap_mgaw(iommu->cap);
1451         domain->gaw = guest_width;
1452         adjust_width = guestwidth_to_adjustwidth(guest_width);
1453         agaw = width_to_agaw(adjust_width);
1454         sagaw = cap_sagaw(iommu->cap);
1455         if (!test_bit(agaw, &sagaw)) {
1456                 /* hardware doesn't support it, choose a bigger one */
1457                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1458                 agaw = find_next_bit(&sagaw, 5, agaw);
1459                 if (agaw >= 5)
1460                         return -ENODEV;
1461         }
1462         domain->agaw = agaw;
1463         INIT_LIST_HEAD(&domain->devices);
1464
1465         if (ecap_coherent(iommu->ecap))
1466                 domain->iommu_coherency = 1;
1467         else
1468                 domain->iommu_coherency = 0;
1469
1470         if (ecap_sc_support(iommu->ecap))
1471                 domain->iommu_snooping = 1;
1472         else
1473                 domain->iommu_snooping = 0;
1474
1475         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1476         domain->iommu_count = 1;
1477         domain->nid = iommu->node;
1478
1479         /* always allocate the top pgd */
1480         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1481         if (!domain->pgd)
1482                 return -ENOMEM;
1483         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1484         return 0;
1485 }
1486
1487 static void domain_exit(struct dmar_domain *domain)
1488 {
1489         struct dmar_drhd_unit *drhd;
1490         struct intel_iommu *iommu;
1491
1492         /* Domain 0 is reserved, so dont process it */
1493         if (!domain)
1494                 return;
1495
1496         /* Flush any lazy unmaps that may reference this domain */
1497         if (!intel_iommu_strict)
1498                 flush_unmaps_timeout(0);
1499
1500         domain_remove_dev_info(domain);
1501         /* destroy iovas */
1502         put_iova_domain(&domain->iovad);
1503
1504         /* clear ptes */
1505         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1506
1507         /* free page tables */
1508         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1509
1510         for_each_active_iommu(iommu, drhd)
1511                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1512                         iommu_detach_domain(domain, iommu);
1513
1514         free_domain_mem(domain);
1515 }
1516
1517 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1518                                  u8 bus, u8 devfn, int translation)
1519 {
1520         struct context_entry *context;
1521         unsigned long flags;
1522         struct intel_iommu *iommu;
1523         struct dma_pte *pgd;
1524         unsigned long num;
1525         unsigned long ndomains;
1526         int id;
1527         int agaw;
1528         struct device_domain_info *info = NULL;
1529
1530         pr_debug("Set context mapping for %02x:%02x.%d\n",
1531                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1532
1533         BUG_ON(!domain->pgd);
1534         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1535                translation != CONTEXT_TT_MULTI_LEVEL);
1536
1537         iommu = device_to_iommu(segment, bus, devfn);
1538         if (!iommu)
1539                 return -ENODEV;
1540
1541         context = device_to_context_entry(iommu, bus, devfn);
1542         if (!context)
1543                 return -ENOMEM;
1544         spin_lock_irqsave(&iommu->lock, flags);
1545         if (context_present(context)) {
1546                 spin_unlock_irqrestore(&iommu->lock, flags);
1547                 return 0;
1548         }
1549
1550         id = domain->id;
1551         pgd = domain->pgd;
1552
1553         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1554             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1555                 int found = 0;
1556
1557                 /* find an available domain id for this device in iommu */
1558                 ndomains = cap_ndoms(iommu->cap);
1559                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1560                         if (iommu->domains[num] == domain) {
1561                                 id = num;
1562                                 found = 1;
1563                                 break;
1564                         }
1565                 }
1566
1567                 if (found == 0) {
1568                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1569                         if (num >= ndomains) {
1570                                 spin_unlock_irqrestore(&iommu->lock, flags);
1571                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1572                                 return -EFAULT;
1573                         }
1574
1575                         set_bit(num, iommu->domain_ids);
1576                         iommu->domains[num] = domain;
1577                         id = num;
1578                 }
1579
1580                 /* Skip top levels of page tables for
1581                  * iommu which has less agaw than default.
1582                  * Unnecessary for PT mode.
1583                  */
1584                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1585                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1586                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1587                                 if (!dma_pte_present(pgd)) {
1588                                         spin_unlock_irqrestore(&iommu->lock, flags);
1589                                         return -ENOMEM;
1590                                 }
1591                         }
1592                 }
1593         }
1594
1595         context_set_domain_id(context, id);
1596
1597         if (translation != CONTEXT_TT_PASS_THROUGH) {
1598                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1599                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1600                                      CONTEXT_TT_MULTI_LEVEL;
1601         }
1602         /*
1603          * In pass through mode, AW must be programmed to indicate the largest
1604          * AGAW value supported by hardware. And ASR is ignored by hardware.
1605          */
1606         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1607                 context_set_address_width(context, iommu->msagaw);
1608         else {
1609                 context_set_address_root(context, virt_to_phys(pgd));
1610                 context_set_address_width(context, iommu->agaw);
1611         }
1612
1613         context_set_translation_type(context, translation);
1614         context_set_fault_enable(context);
1615         context_set_present(context);
1616         domain_flush_cache(domain, context, sizeof(*context));
1617
1618         /*
1619          * It's a non-present to present mapping. If hardware doesn't cache
1620          * non-present entry we only need to flush the write-buffer. If the
1621          * _does_ cache non-present entries, then it does so in the special
1622          * domain #0, which we have to flush:
1623          */
1624         if (cap_caching_mode(iommu->cap)) {
1625                 iommu->flush.flush_context(iommu, 0,
1626                                            (((u16)bus) << 8) | devfn,
1627                                            DMA_CCMD_MASK_NOBIT,
1628                                            DMA_CCMD_DEVICE_INVL);
1629                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1630         } else {
1631                 iommu_flush_write_buffer(iommu);
1632         }
1633         iommu_enable_dev_iotlb(info);
1634         spin_unlock_irqrestore(&iommu->lock, flags);
1635
1636         spin_lock_irqsave(&domain->iommu_lock, flags);
1637         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1638                 domain->iommu_count++;
1639                 if (domain->iommu_count == 1)
1640                         domain->nid = iommu->node;
1641                 domain_update_iommu_cap(domain);
1642         }
1643         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1644         return 0;
1645 }
1646
1647 static int
1648 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1649                         int translation)
1650 {
1651         int ret;
1652         struct pci_dev *tmp, *parent;
1653
1654         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1655                                          pdev->bus->number, pdev->devfn,
1656                                          translation);
1657         if (ret)
1658                 return ret;
1659
1660         /* dependent device mapping */
1661         tmp = pci_find_upstream_pcie_bridge(pdev);
1662         if (!tmp)
1663                 return 0;
1664         /* Secondary interface's bus number and devfn 0 */
1665         parent = pdev->bus->self;
1666         while (parent != tmp) {
1667                 ret = domain_context_mapping_one(domain,
1668                                                  pci_domain_nr(parent->bus),
1669                                                  parent->bus->number,
1670                                                  parent->devfn, translation);
1671                 if (ret)
1672                         return ret;
1673                 parent = parent->bus->self;
1674         }
1675         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1676                 return domain_context_mapping_one(domain,
1677                                         pci_domain_nr(tmp->subordinate),
1678                                         tmp->subordinate->number, 0,
1679                                         translation);
1680         else /* this is a legacy PCI bridge */
1681                 return domain_context_mapping_one(domain,
1682                                                   pci_domain_nr(tmp->bus),
1683                                                   tmp->bus->number,
1684                                                   tmp->devfn,
1685                                                   translation);
1686 }
1687
1688 static int domain_context_mapped(struct pci_dev *pdev)
1689 {
1690         int ret;
1691         struct pci_dev *tmp, *parent;
1692         struct intel_iommu *iommu;
1693
1694         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1695                                 pdev->devfn);
1696         if (!iommu)
1697                 return -ENODEV;
1698
1699         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1700         if (!ret)
1701                 return ret;
1702         /* dependent device mapping */
1703         tmp = pci_find_upstream_pcie_bridge(pdev);
1704         if (!tmp)
1705                 return ret;
1706         /* Secondary interface's bus number and devfn 0 */
1707         parent = pdev->bus->self;
1708         while (parent != tmp) {
1709                 ret = device_context_mapped(iommu, parent->bus->number,
1710                                             parent->devfn);
1711                 if (!ret)
1712                         return ret;
1713                 parent = parent->bus->self;
1714         }
1715         if (pci_is_pcie(tmp))
1716                 return device_context_mapped(iommu, tmp->subordinate->number,
1717                                              0);
1718         else
1719                 return device_context_mapped(iommu, tmp->bus->number,
1720                                              tmp->devfn);
1721 }
1722
1723 /* Returns a number of VTD pages, but aligned to MM page size */
1724 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1725                                             size_t size)
1726 {
1727         host_addr &= ~PAGE_MASK;
1728         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1729 }
1730
1731 /* Return largest possible superpage level for a given mapping */
1732 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1733                                           unsigned long iov_pfn,
1734                                           unsigned long phy_pfn,
1735                                           unsigned long pages)
1736 {
1737         int support, level = 1;
1738         unsigned long pfnmerge;
1739
1740         support = domain->iommu_superpage;
1741
1742         /* To use a large page, the virtual *and* physical addresses
1743            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1744            of them will mean we have to use smaller pages. So just
1745            merge them and check both at once. */
1746         pfnmerge = iov_pfn | phy_pfn;
1747
1748         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1749                 pages >>= VTD_STRIDE_SHIFT;
1750                 if (!pages)
1751                         break;
1752                 pfnmerge >>= VTD_STRIDE_SHIFT;
1753                 level++;
1754                 support--;
1755         }
1756         return level;
1757 }
1758
/*
 * Install page-table entries in @domain for @nr_pages VT-d pages starting
 * at IO virtual pfn @iov_pfn.
 *
 * The physical side comes from one of two sources:
 *  - @sg != NULL: pages are taken from the scatterlist, and each entry's
 *    dma_address/dma_length are filled in as it is consumed;
 *  - @sg == NULL: a physically contiguous range starting at @phys_pfn.
 *
 * @prot is masked down to DMA_PTE_READ, DMA_PTE_WRITE and DMA_PTE_SNP.
 *
 * Returns 0 on success, -EINVAL if @prot grants neither read nor write,
 * and -ENOMEM if pfn_to_dma_pte() cannot provide a PTE.
 */
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                            struct scatterlist *sg, unsigned long phys_pfn,
                            unsigned long nr_pages, int prot)
{
        struct dma_pte *first_pte = NULL, *pte = NULL;
        phys_addr_t uninitialized_var(pteval);
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
        /* Pages still to be mapped from the current sg entry (or range) */
        unsigned long sg_res;
        unsigned int largepage_lvl = 0;
        unsigned long lvl_pages = 0;

        /* The last pfn of the request must fit in the domain's address width */
        BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);

        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
                return -EINVAL;

        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

        if (sg)
                sg_res = 0;
        else {
                /*
                 * One more than nr_pages, so sg_res cannot reach zero while
                 * pages remain and the scatterlist reload below never runs.
                 */
                sg_res = nr_pages + 1;
                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
        }

        while (nr_pages > 0) {
                uint64_t tmp;

                /* Current sg entry exhausted (or first pass): load the next one */
                if (!sg_res) {
                        sg_res = aligned_nrpages(sg->offset, sg->length);
                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
                        sg->dma_length = sg->length;
                        pteval = page_to_phys(sg_page(sg)) | prot;
                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
                }

                /* No current PTE pointer: choose a page level and look one up */
                if (!pte) {
                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
                        if (!pte)
                                return -ENOMEM;
                        /* Set or clear the large-page bit to match the chosen level */
                        if (largepage_lvl > 1)
                                pteval |= DMA_PTE_LARGE_PAGE;
                        else
                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;

                }
                /* We don't need lock here, nobody else
                 * touches the iova range
                 */
                /*
                 * The PTE is written only if it was zero; a non-zero old
                 * value is reported and warned about, and the loop then
                 * carries on with the next entry.
                 */
                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
                if (tmp) {
                        /* Limit the number of full mapping dumps */
                        static int dumps = 5;
                        printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
                               iov_pfn, tmp, (unsigned long long)pteval);
                        if (dumps) {
                                dumps--;
                                debug_dma_dump_mappings(NULL);
                        }
                        WARN_ON(1);
                }

                /* Number of VT-d pages covered by one PTE at this level */
                lvl_pages = lvl_to_nr_pages(largepage_lvl);

                BUG_ON(nr_pages < lvl_pages);
                BUG_ON(sg_res < lvl_pages);

                /* Advance every cursor by the amount just mapped */
                nr_pages -= lvl_pages;
                iov_pfn += lvl_pages;
                phys_pfn += lvl_pages;
                pteval += lvl_pages * VTD_PAGE_SIZE;
                sg_res -= lvl_pages;

                /* If the next PTE would be the first in a new page, then we
                   need to flush the cache on the entries we've just written.
                   And then we'll need to recalculate 'pte', so clear it and
                   let it get set again in the if (!pte) block above.

                   If we're done (!nr_pages) we need to flush the cache too.

                   Also if we've been setting superpages, we may need to
                   recalculate 'pte' and switch back to smaller pages for the
                   end of the mapping, if the trailing size is not enough to
                   use another superpage (i.e. sg_res < lvl_pages). */
                pte++;
                if (!nr_pages || first_pte_in_page(pte) ||
                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
                        domain_flush_cache(domain, first_pte,
                                           (void *)pte - (void *)first_pte);
                        pte = NULL;
                }

                /* Move to the next sg entry only if there is more to map */
                if (!sg_res && nr_pages)
                        sg = sg_next(sg);
        }
        return 0;
}
1858
/*
 * Map @nr_pages VT-d pages at @iov_pfn from the scatterlist @sg.
 * The physical pfn handed to __domain_mapping() is a placeholder: on the
 * scatterlist path it is overwritten from @sg before it is read.
 */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                                    struct scatterlist *sg, unsigned long nr_pages,
                                    int prot)
{
        const unsigned long placeholder_pfn = 0;

        return __domain_mapping(domain, iov_pfn, sg, placeholder_pfn,
                                nr_pages, prot);
}
1865
/*
 * Map @nr_pages VT-d pages at @iov_pfn onto the physically contiguous
 * range starting at @phys_pfn (no scatterlist is involved).
 */
static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                                     unsigned long phys_pfn, unsigned long nr_pages,
                                     int prot)
{
        struct scatterlist *no_sg = NULL;

        return __domain_mapping(domain, iov_pfn, no_sg, phys_pfn,
                                nr_pages, prot);
}
1872
1873 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1874 {
1875         if (!iommu)
1876                 return;
1877
1878         clear_context_table(iommu, bus, devfn);
1879         iommu->flush.flush_context(iommu, 0, 0, 0,
1880                                            DMA_CCMD_GLOBAL_INVL);
1881         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1882 }
1883
1884 static void domain_remove_dev_info(struct dmar_domain *domain)
1885 {
1886         struct device_domain_info *info;
1887         unsigned long flags;
1888         struct intel_iommu *iommu;
1889
1890         spin_lock_irqsave(&device_domain_lock, flags);
1891         while (!list_empty(&domain->devices)) {
1892                 info = list_entry(domain->devices.next,
1893                         struct device_domain_info, link);
1894                 list_del(&info->link);
1895                 list_del(&info->global);
1896                 if (info->dev)
1897                         info->dev->dev.archdata.iommu = NULL;
1898                 spin_unlock_irqrestore(&device_domain_lock, flags);
1899
1900                 iommu_disable_dev_iotlb(info);
1901                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1902                 iommu_detach_dev(iommu, info->bus, info->devfn);
1903                 free_devinfo_mem(info);
1904
1905                 spin_lock_irqsave(&device_domain_lock, flags);
1906         }
1907         spin_unlock_irqrestore(&device_domain_lock, flags);
1908 }
1909
1910 /*
1911  * find_domain
1912  * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1913  */
1914 static struct dmar_domain *
1915 find_domain(struct pci_dev *pdev)
1916 {
1917         struct device_domain_info *info;
1918
1919         /* No lock here, assumes no domain exit in normal case */
1920         info = pdev->dev.archdata.iommu;
1921         if (info)
1922                 return info->domain;
1923         return NULL;
1924 }
1925
1926 /* domain is initialized */
1927 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1928 {
1929         struct dmar_domain *domain, *found = NULL;
1930         struct intel_iommu *iommu;
1931         struct dmar_drhd_unit *drhd;
1932         struct device_domain_info *info, *tmp;
1933         struct pci_dev *dev_tmp;
1934         unsigned long flags;
1935         int bus = 0, devfn = 0;
1936         int segment;
1937         int ret;
1938
1939         domain = find_domain(pdev);
1940         if (domain)
1941                 return domain;
1942
1943         segment = pci_domain_nr(pdev->bus);
1944
1945         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1946         if (dev_tmp) {
1947                 if (pci_is_pcie(dev_tmp)) {
1948                         bus = dev_tmp->subordinate->number;
1949                         devfn = 0;
1950                 } else {
1951                         bus = dev_tmp->bus->number;
1952                         devfn = dev_tmp->devfn;
1953                 }
1954                 spin_lock_irqsave(&device_domain_lock, flags);
1955                 list_for_each_entry(info, &device_domain_list, global) {
1956                         if (info->segment == segment &&
1957                             info->bus == bus && info->devfn == devfn) {
1958                                 found = info->domain;
1959                                 break;
1960                         }
1961                 }
1962                 spin_unlock_irqrestore(&device_domain_lock, flags);
1963                 /* pcie-pci bridge already has a domain, uses it */
1964                 if (found) {
1965                         domain = found;
1966                         goto found_domain;
1967                 }
1968         }
1969
1970         domain = alloc_domain();
1971         if (!domain)
1972                 goto error;
1973
1974         /* Allocate new domain for the device */
1975         drhd = dmar_find_matched_drhd_unit(pdev);
1976         if (!drhd) {
1977                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1978                         pci_name(pdev));
1979                 return NULL;
1980         }
1981         iommu = drhd->iommu;
1982
1983         ret = iommu_attach_domain(domain, iommu);
1984         if (ret) {
1985                 free_domain_mem(domain);
1986                 goto error;
1987         }
1988
1989         if (domain_init(domain, gaw)) {
1990                 domain_exit(domain);
1991                 goto error;
1992         }
1993
1994         /* register pcie-to-pci device */
1995         if (dev_tmp) {
1996                 info = alloc_devinfo_mem();
1997                 if (!info) {
1998                         domain_exit(domain);
1999                         goto error;
2000                 }
2001                 info->segment = segment;
2002                 info->bus = bus;
2003                 info->devfn = devfn;
2004                 info->dev = NULL;
2005                 info->domain = domain;
2006                 /* This domain is shared by devices under p2p bridge */
2007                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2008
2009                 /* pcie-to-pci bridge already has a domain, uses it */
2010                 found = NULL;
2011                 spin_lock_irqsave(&device_domain_lock, flags);
2012                 list_for_each_entry(tmp, &device_domain_list, global) {
2013                         if (tmp->segment == segment &&
2014                             tmp->bus == bus && tmp->devfn == devfn) {
2015                                 found = tmp->domain;
2016                                 break;
2017                         }
2018                 }
2019                 if (found) {
2020                         spin_unlock_irqrestore(&device_domain_lock, flags);
2021                         free_devinfo_mem(info);
2022                         domain_exit(domain);
2023                         domain = found;
2024                 } else {
2025                         list_add(&info->link, &domain->devices);
2026                         list_add(&info->global, &device_domain_list);
2027                         spin_unlock_irqrestore(&device_domain_lock, flags);
2028                 }
2029         }
2030
2031 found_domain:
2032         info = alloc_devinfo_mem();
2033         if (!info)
2034                 goto error;
2035         info->segment = segment;
2036         info->bus = pdev->bus->number;
2037         info->devfn = pdev->devfn;
2038         info->dev = pdev;
2039         info->domain = domain;
2040         spin_lock_irqsave(&device_domain_lock, flags);
2041         /* somebody is fast */
2042         found = find_domain(pdev);
2043         if (found != NULL) {
2044                 spin_unlock_irqrestore(&device_domain_lock, flags);
2045                 if (found != domain) {
2046                         domain_exit(domain);
2047                         domain = found;
2048                 }
2049                 free_devinfo_mem(info);
2050                 return domain;
2051         }
2052         list_add(&info->link, &domain->devices);
2053         list_add(&info->global, &device_domain_list);
2054         pdev->dev.archdata.iommu = info;
2055         spin_unlock_irqrestore(&device_domain_lock, flags);
2056         return domain;
2057 error:
2058         /* recheck it here, maybe others set it */
2059         return find_domain(pdev);
2060 }
2061
/*
 * Bitmask of IDENTMAP_* flags selecting which devices are placed in the
 * static identity (1:1) domain; zero means identity mapping is not in use.
 */
static int iommu_identity_mapping;
/* Every device that passes the checks in iommu_should_identity_map() */
#define IDENTMAP_ALL            1
/* Devices matched by IS_GFX_DEVICE() */
#define IDENTMAP_GFX            2
/* Devices matched by IS_AZALIA() */
#define IDENTMAP_AZALIA         4
2066
2067 static int iommu_domain_identity_map(struct dmar_domain *domain,
2068                                      unsigned long long start,
2069                                      unsigned long long end)
2070 {
2071         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2072         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2073
2074         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2075                           dma_to_mm_pfn(last_vpfn))) {
2076                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2077                 return -ENOMEM;
2078         }
2079
2080         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2081                  start, end, domain->id);
2082         /*
2083          * RMRR range might have overlap with physical memory range,
2084          * clear it first
2085          */
2086         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2087
2088         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2089                                   last_vpfn - first_vpfn + 1,
2090                                   DMA_PTE_READ|DMA_PTE_WRITE);
2091 }
2092
2093 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2094                                       unsigned long long start,
2095                                       unsigned long long end)
2096 {
2097         struct dmar_domain *domain;
2098         int ret;
2099
2100         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2101         if (!domain)
2102                 return -ENOMEM;
2103
2104         /* For _hardware_ passthrough, don't bother. But for software
2105            passthrough, we do it anyway -- it may indicate a memory
2106            range which is reserved in E820, so which didn't get set
2107            up to start with in si_domain */
2108         if (domain == si_domain && hw_pass_through) {
2109                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2110                        pci_name(pdev), start, end);
2111                 return 0;
2112         }
2113
2114         printk(KERN_INFO
2115                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2116                pci_name(pdev), start, end);
2117         
2118         if (end < start) {
2119                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2120                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2121                         dmi_get_system_info(DMI_BIOS_VENDOR),
2122                         dmi_get_system_info(DMI_BIOS_VERSION),
2123                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2124                 ret = -EIO;
2125                 goto error;
2126         }
2127
2128         if (end >> agaw_to_width(domain->agaw)) {
2129                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2130                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2131                      agaw_to_width(domain->agaw),
2132                      dmi_get_system_info(DMI_BIOS_VENDOR),
2133                      dmi_get_system_info(DMI_BIOS_VERSION),
2134                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2135                 ret = -EIO;
2136                 goto error;
2137         }
2138
2139         ret = iommu_domain_identity_map(domain, start, end);
2140         if (ret)
2141                 goto error;
2142
2143         /* context entry init */
2144         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2145         if (ret)
2146                 goto error;
2147
2148         return 0;
2149
2150  error:
2151         domain_exit(domain);
2152         return ret;
2153 }
2154
2155 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2156         struct pci_dev *pdev)
2157 {
2158         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2159                 return 0;
2160         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2161                 rmrr->end_address);
2162 }
2163
#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
/*
 * Floppy workaround: give the first ISA bridge found an identity mapping
 * of the low 16MiB.  A failure is only logged; nothing is returned to
 * the caller.
 */
static inline void iommu_prepare_isa(void)
{
        struct pci_dev *pdev;
        int ret;

        pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
        if (!pdev)
                return;

        printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
        ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);

        if (ret)
                printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
                       "floppy might not work\n");

        /* pci_get_class() handed us a reference; drop it now we are done */
        pci_dev_put(pdev);
}
#else
/* Workaround not configured: nothing to prepare */
static inline void iommu_prepare_isa(void)
{
}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
2188
2189 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2190
2191 static int __init si_domain_work_fn(unsigned long start_pfn,
2192                                     unsigned long end_pfn, void *datax)
2193 {
2194         int *ret = datax;
2195
2196         *ret = iommu_domain_identity_map(si_domain,
2197                                          (uint64_t)start_pfn << PAGE_SHIFT,
2198                                          (uint64_t)end_pfn << PAGE_SHIFT);
2199         return *ret;
2200
2201 }
2202
2203 static int __init si_domain_init(int hw)
2204 {
2205         struct dmar_drhd_unit *drhd;
2206         struct intel_iommu *iommu;
2207         int nid, ret = 0;
2208
2209         si_domain = alloc_domain();
2210         if (!si_domain)
2211                 return -EFAULT;
2212
2213         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2214
2215         for_each_active_iommu(iommu, drhd) {
2216                 ret = iommu_attach_domain(si_domain, iommu);
2217                 if (ret) {
2218                         domain_exit(si_domain);
2219                         return -EFAULT;
2220                 }
2221         }
2222
2223         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2224                 domain_exit(si_domain);
2225                 return -EFAULT;
2226         }
2227
2228         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2229
2230         if (hw)
2231                 return 0;
2232
2233         for_each_online_node(nid) {
2234                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2235                 if (ret)
2236                         return ret;
2237         }
2238
2239         return 0;
2240 }
2241
2242 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2243                                           struct pci_dev *pdev);
2244 static int identity_mapping(struct pci_dev *pdev)
2245 {
2246         struct device_domain_info *info;
2247
2248         if (likely(!iommu_identity_mapping))
2249                 return 0;
2250
2251         info = pdev->dev.archdata.iommu;
2252         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2253                 return (info->domain == si_domain);
2254
2255         return 0;
2256 }
2257
2258 static int domain_add_dev_info(struct dmar_domain *domain,
2259                                struct pci_dev *pdev,
2260                                int translation)
2261 {
2262         struct device_domain_info *info;
2263         unsigned long flags;
2264         int ret;
2265
2266         info = alloc_devinfo_mem();
2267         if (!info)
2268                 return -ENOMEM;
2269
2270         ret = domain_context_mapping(domain, pdev, translation);
2271         if (ret) {
2272                 free_devinfo_mem(info);
2273                 return ret;
2274         }
2275
2276         info->segment = pci_domain_nr(pdev->bus);
2277         info->bus = pdev->bus->number;
2278         info->devfn = pdev->devfn;
2279         info->dev = pdev;
2280         info->domain = domain;
2281
2282         spin_lock_irqsave(&device_domain_lock, flags);
2283         list_add(&info->link, &domain->devices);
2284         list_add(&info->global, &device_domain_list);
2285         pdev->dev.archdata.iommu = info;
2286         spin_unlock_irqrestore(&device_domain_lock, flags);
2287
2288         return 0;
2289 }
2290
2291 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2292 {
2293         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2294                 return 1;
2295
2296         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2297                 return 1;
2298
2299         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2300                 return 0;
2301
2302         /*
2303          * We want to start off with all devices in the 1:1 domain, and
2304          * take them out later if we find they can't access all of memory.
2305          *
2306          * However, we can't do this for PCI devices behind bridges,
2307          * because all PCI devices behind the same bridge will end up
2308          * with the same source-id on their transactions.
2309          *
2310          * Practically speaking, we can't change things around for these
2311          * devices at run-time, because we can't be sure there'll be no
2312          * DMA transactions in flight for any of their siblings.
2313          * 
2314          * So PCI devices (unless they're on the root bus) as well as
2315          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2316          * the 1:1 domain, just in _case_ one of their siblings turns out
2317          * not to be able to map all of memory.
2318          */
2319         if (!pci_is_pcie(pdev)) {
2320                 if (!pci_is_root_bus(pdev->bus))
2321                         return 0;
2322                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2323                         return 0;
2324         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2325                 return 0;
2326
2327         /* 
2328          * At boot time, we don't yet know if devices will be 64-bit capable.
2329          * Assume that they will -- if they turn out not to be, then we can 
2330          * take them out of the 1:1 domain later.
2331          */
2332         if (!startup) {
2333                 /*
2334                  * If the device's dma_mask is less than the system's memory
2335                  * size then this is not a candidate for identity mapping.
2336                  */
2337                 u64 dma_mask = pdev->dma_mask;
2338
2339                 if (pdev->dev.coherent_dma_mask &&
2340                     pdev->dev.coherent_dma_mask < dma_mask)
2341                         dma_mask = pdev->dev.coherent_dma_mask;
2342
2343                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2344         }
2345
2346         return 1;
2347 }
2348
2349 static int __init iommu_prepare_static_identity_mapping(int hw)
2350 {
2351         struct pci_dev *pdev = NULL;
2352         int ret;
2353
2354         ret = si_domain_init(hw);
2355         if (ret)
2356                 return -EFAULT;
2357
2358         for_each_pci_dev(pdev) {
2359                 /* Skip Host/PCI Bridge devices */
2360                 if (IS_BRIDGE_HOST_DEVICE(pdev))
2361                         continue;
2362                 if (iommu_should_identity_map(pdev, 1)) {
2363                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2364                                hw ? "hardware" : "software", pci_name(pdev));
2365
2366                         ret = domain_add_dev_info(si_domain, pdev,
2367                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2368                                                      CONTEXT_TT_MULTI_LEVEL);
2369                         if (ret)
2370                                 return ret;
2371                 }
2372         }
2373
2374         return 0;
2375 }
2376
2377 static int __init init_dmars(void)
2378 {
2379         struct dmar_drhd_unit *drhd;
2380         struct dmar_rmrr_unit *rmrr;
2381         struct pci_dev *pdev;
2382         struct intel_iommu *iommu;
2383         int i, ret;
2384
2385         /*
2386          * for each drhd
2387          *    allocate root
2388          *    initialize and program root entry to not present
2389          * endfor
2390          */
2391         for_each_drhd_unit(drhd) {
2392                 g_num_of_iommus++;
2393                 /*
2394                  * lock not needed as this is only incremented in the single
2395                  * threaded kernel __init code path all other access are read
2396                  * only
2397                  */
2398         }
2399
2400         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2401                         GFP_KERNEL);
2402         if (!g_iommus) {
2403                 printk(KERN_ERR "Allocating global iommu array failed\n");
2404                 ret = -ENOMEM;
2405                 goto error;
2406         }
2407
2408         deferred_flush = kzalloc(g_num_of_iommus *
2409                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2410         if (!deferred_flush) {
2411                 ret = -ENOMEM;
2412                 goto error;
2413         }
2414
2415         for_each_drhd_unit(drhd) {
2416                 if (drhd->ignored)
2417                         continue;
2418
2419                 iommu = drhd->iommu;
2420                 g_iommus[iommu->seq_id] = iommu;
2421
2422                 ret = iommu_init_domains(iommu);
2423                 if (ret)
2424                         goto error;
2425
2426                 /*
2427                  * TBD:
2428                  * we could share the same root & context tables
2429                  * among all IOMMU's. Need to Split it later.
2430                  */
2431                 ret = iommu_alloc_root_entry(iommu);
2432                 if (ret) {
2433                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2434                         goto error;
2435                 }
2436                 if (!ecap_pass_through(iommu->ecap))
2437                         hw_pass_through = 0;
2438         }
2439
2440         /*
2441          * Start from the sane iommu hardware state.
2442          */
2443         for_each_drhd_unit(drhd) {
2444                 if (drhd->ignored)
2445                         continue;
2446
2447                 iommu = drhd->iommu;
2448
2449                 /*
2450                  * If the queued invalidation is already initialized by us
2451                  * (for example, while enabling interrupt-remapping) then
2452                  * we got the things already rolling from a sane state.
2453                  */
2454                 if (iommu->qi)
2455                         continue;
2456
2457                 /*
2458                  * Clear any previous faults.
2459                  */
2460                 dmar_fault(-1, iommu);
2461                 /*
2462                  * Disable queued invalidation if supported and already enabled
2463                  * before OS handover.
2464                  */
2465                 dmar_disable_qi(iommu);
2466         }
2467
2468         for_each_drhd_unit(drhd) {
2469                 if (drhd->ignored)
2470                         continue;
2471
2472                 iommu = drhd->iommu;
2473
2474                 if (dmar_enable_qi(iommu)) {
2475                         /*
2476                          * Queued Invalidate not enabled, use Register Based
2477                          * Invalidate
2478                          */
2479                         iommu->flush.flush_context = __iommu_flush_context;
2480                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2481                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2482                                "invalidation\n",
2483                                 iommu->seq_id,
2484                                (unsigned long long)drhd->reg_base_addr);
2485                 } else {
2486                         iommu->flush.flush_context = qi_flush_context;
2487                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2488                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2489                                "invalidation\n",
2490                                 iommu->seq_id,
2491                                (unsigned long long)drhd->reg_base_addr);
2492                 }
2493         }
2494
2495         if (iommu_pass_through)
2496                 iommu_identity_mapping |= IDENTMAP_ALL;
2497
2498 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2499         iommu_identity_mapping |= IDENTMAP_GFX;
2500 #endif
2501
2502         check_tylersburg_isoch();
2503
2504         /*
2505          * If pass through is not set or not enabled, setup context entries for
2506          * identity mappings for rmrr, gfx, and isa and may fall back to static
2507          * identity mapping if iommu_identity_mapping is set.
2508          */
2509         if (iommu_identity_mapping) {
2510                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2511                 if (ret) {
2512                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2513                         goto error;
2514                 }
2515         }
2516         /*
2517          * For each rmrr
2518          *   for each dev attached to rmrr
2519          *   do
2520          *     locate drhd for dev, alloc domain for dev
2521          *     allocate free domain
2522          *     allocate page table entries for rmrr
2523          *     if context not allocated for bus
2524          *           allocate and init context
2525          *           set present in root table for this bus
2526          *     init context with domain, translation etc
2527          *    endfor
2528          * endfor
2529          */
2530         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2531         for_each_rmrr_units(rmrr) {
2532                 for (i = 0; i < rmrr->devices_cnt; i++) {
2533                         pdev = rmrr->devices[i];
2534                         /*
2535                          * some BIOS lists non-exist devices in DMAR
2536                          * table.
2537                          */
2538                         if (!pdev)
2539                                 continue;
2540                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2541                         if (ret)
2542                                 printk(KERN_ERR
2543                                        "IOMMU: mapping reserved region failed\n");
2544                 }
2545         }
2546
2547         iommu_prepare_isa();
2548
2549         /*
2550          * for each drhd
2551          *   enable fault log
2552          *   global invalidate context cache
2553          *   global invalidate iotlb
2554          *   enable translation
2555          */
2556         for_each_drhd_unit(drhd) {
2557                 if (drhd->ignored) {
2558                         /*
2559                          * we always have to disable PMRs or DMA may fail on
2560                          * this device
2561                          */
2562                         if (force_on)
2563                                 iommu_disable_protect_mem_regions(drhd->iommu);
2564                         continue;
2565                 }
2566                 iommu = drhd->iommu;
2567
2568                 iommu_flush_write_buffer(iommu);
2569
2570                 ret = dmar_set_interrupt(iommu);
2571                 if (ret)
2572                         goto error;
2573
2574                 iommu_set_root_entry(iommu);
2575
2576                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2577                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2578
2579                 ret = iommu_enable_translation(iommu);
2580                 if (ret)
2581                         goto error;
2582
2583                 iommu_disable_protect_mem_regions(iommu);
2584         }
2585
2586         return 0;
2587 error:
2588         for_each_drhd_unit(drhd) {
2589                 if (drhd->ignored)
2590                         continue;
2591                 iommu = drhd->iommu;
2592                 free_iommu(iommu);
2593         }
2594         kfree(g_iommus);
2595         return ret;
2596 }
2597
2598 /* This takes a number of _MM_ pages, not VTD pages */
2599 static struct iova *intel_alloc_iova(struct device *dev,
2600                                      struct dmar_domain *domain,
2601                                      unsigned long nrpages, uint64_t dma_mask)
2602 {
2603         struct pci_dev *pdev = to_pci_dev(dev);
2604         struct iova *iova = NULL;
2605
2606         /* Restrict dma_mask to the width that the iommu can handle */
2607         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2608
2609         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2610                 /*
2611                  * First try to allocate an io virtual address in
2612                  * DMA_BIT_MASK(32) and if that fails then try allocating
2613                  * from higher range
2614                  */
2615                 iova = alloc_iova(&domain->iovad, nrpages,
2616                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2617                 if (iova)
2618                         return iova;
2619         }
2620         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2621         if (unlikely(!iova)) {
2622                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2623                        nrpages, pci_name(pdev));
2624                 return NULL;
2625         }
2626
2627         return iova;
2628 }
2629
2630 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2631 {
2632         struct dmar_domain *domain;
2633         int ret;
2634
2635         domain = get_domain_for_dev(pdev,
2636                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2637         if (!domain) {
2638                 printk(KERN_ERR
2639                         "Allocating domain for %s failed", pci_name(pdev));
2640                 return NULL;
2641         }
2642
2643         /* make sure context mapping is ok */
2644         if (unlikely(!domain_context_mapped(pdev))) {
2645                 ret = domain_context_mapping(domain, pdev,
2646                                              CONTEXT_TT_MULTI_LEVEL);
2647                 if (ret) {
2648                         printk(KERN_ERR
2649                                 "Domain context map for %s failed",
2650                                 pci_name(pdev));
2651                         return NULL;
2652                 }
2653         }
2654
2655         return domain;
2656 }
2657
2658 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2659 {
2660         struct device_domain_info *info;
2661
2662         /* No lock here, assumes no domain exit in normal case */
2663         info = dev->dev.archdata.iommu;
2664         if (likely(info))
2665                 return info->domain;
2666
2667         return __get_valid_domain_for_dev(dev);
2668 }
2669
2670 static int iommu_dummy(struct pci_dev *pdev)
2671 {
2672         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2673 }
2674
2675 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2676 static int iommu_no_mapping(struct device *dev)
2677 {
2678         struct pci_dev *pdev;
2679         int found;
2680
2681         if (unlikely(dev->bus != &pci_bus_type))
2682                 return 1;
2683
2684         pdev = to_pci_dev(dev);
2685         if (iommu_dummy(pdev))
2686                 return 1;
2687
2688         if (!iommu_identity_mapping)
2689                 return 0;
2690
2691         found = identity_mapping(pdev);
2692         if (found) {
2693                 if (iommu_should_identity_map(pdev, 0))
2694                         return 1;
2695                 else {
2696                         /*
2697                          * 32 bit DMA is removed from si_domain and fall back
2698                          * to non-identity mapping.
2699                          */
2700                         domain_remove_one_dev_info(si_domain, pdev);
2701                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2702                                pci_name(pdev));
2703                         return 0;
2704                 }
2705         } else {
2706                 /*
2707                  * In case of a detached 64 bit DMA device from vm, the device
2708                  * is put into si_domain for identity mapping.
2709                  */
2710                 if (iommu_should_identity_map(pdev, 0)) {
2711                         int ret;
2712                         ret = domain_add_dev_info(si_domain, pdev,
2713                                                   hw_pass_through ?
2714                                                   CONTEXT_TT_PASS_THROUGH :
2715                                                   CONTEXT_TT_MULTI_LEVEL);
2716                         if (!ret) {
2717                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2718                                        pci_name(pdev));
2719                                 return 1;
2720                         }
2721                 }
2722         }
2723
2724         return 0;
2725 }
2726
2727 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2728                                      size_t size, int dir, u64 dma_mask)
2729 {
2730         struct pci_dev *pdev = to_pci_dev(hwdev);
2731         struct dmar_domain *domain;
2732         phys_addr_t start_paddr;
2733         struct iova *iova;
2734         int prot = 0;
2735         int ret;
2736         struct intel_iommu *iommu;
2737         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2738
2739         BUG_ON(dir == DMA_NONE);
2740
2741         if (iommu_no_mapping(hwdev))
2742                 return paddr;
2743
2744         domain = get_valid_domain_for_dev(pdev);
2745         if (!domain)
2746                 return 0;
2747
2748         iommu = domain_get_iommu(domain);
2749         size = aligned_nrpages(paddr, size);
2750
2751         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2752         if (!iova)
2753                 goto error;
2754
2755         /*
2756          * Check if DMAR supports zero-length reads on write only
2757          * mappings..
2758          */
2759         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2760                         !cap_zlr(iommu->cap))
2761                 prot |= DMA_PTE_READ;
2762         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2763                 prot |= DMA_PTE_WRITE;
2764         /*
2765          * paddr - (paddr + size) might be partial page, we should map the whole
2766          * page.  Note: if two part of one page are separately mapped, we
2767          * might have two guest_addr mapping to the same host paddr, but this
2768          * is not a big problem
2769          */
2770         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2771                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2772         if (ret)
2773                 goto error;
2774
2775         /* it's a non-present to present mapping. Only flush if caching mode */
2776         if (cap_caching_mode(iommu->cap))
2777                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2778         else
2779                 iommu_flush_write_buffer(iommu);
2780
2781         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2782         start_paddr += paddr & ~PAGE_MASK;
2783         return start_paddr;
2784
2785 error:
2786         if (iova)
2787                 __free_iova(&domain->iovad, iova);
2788         printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2789                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2790         return 0;
2791 }
2792
2793 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2794                                  unsigned long offset, size_t size,
2795                                  enum dma_data_direction dir,
2796                                  struct dma_attrs *attrs)
2797 {
2798         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2799                                   dir, to_pci_dev(dev)->dma_mask);
2800 }
2801
2802 static void flush_unmaps(void)
2803 {
2804         int i, j;
2805
2806         timer_on = 0;
2807
2808         /* just flush them all */
2809         for (i = 0; i < g_num_of_iommus; i++) {
2810                 struct intel_iommu *iommu = g_iommus[i];
2811                 if (!iommu)
2812                         continue;
2813
2814                 if (!deferred_flush[i].next)
2815                         continue;
2816
2817                 /* In caching mode, global flushes turn emulation expensive */
2818                 if (!cap_caching_mode(iommu->cap))
2819                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2820                                          DMA_TLB_GLOBAL_FLUSH);
2821                 for (j = 0; j < deferred_flush[i].next; j++) {
2822                         unsigned long mask;
2823                         struct iova *iova = deferred_flush[i].iova[j];
2824                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2825
2826                         /* On real hardware multiple invalidations are expensive */
2827                         if (cap_caching_mode(iommu->cap))
2828                                 iommu_flush_iotlb_psi(iommu, domain->id,
2829                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2830                         else {
2831                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2832                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2833                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2834                         }
2835                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2836                 }
2837                 deferred_flush[i].next = 0;
2838         }
2839
2840         list_size = 0;
2841 }
2842
2843 static void flush_unmaps_timeout(unsigned long data)
2844 {
2845         unsigned long flags;
2846
2847         spin_lock_irqsave(&async_umap_flush_lock, flags);
2848         flush_unmaps();
2849         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2850 }
2851
2852 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2853 {
2854         unsigned long flags;
2855         int next, iommu_id;
2856         struct intel_iommu *iommu;
2857
2858         spin_lock_irqsave(&async_umap_flush_lock, flags);
2859         if (list_size == HIGH_WATER_MARK)
2860                 flush_unmaps();
2861
2862         iommu = domain_get_iommu(dom);
2863         iommu_id = iommu->seq_id;
2864
2865         next = deferred_flush[iommu_id].next;
2866         deferred_flush[iommu_id].domain[next] = dom;
2867         deferred_flush[iommu_id].iova[next] = iova;
2868         deferred_flush[iommu_id].next++;
2869
2870         if (!timer_on) {
2871                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2872                 timer_on = 1;
2873         }
2874         list_size++;
2875         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2876 }
2877
2878 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2879                              size_t size, enum dma_data_direction dir,
2880                              struct dma_attrs *attrs)
2881 {
2882         struct pci_dev *pdev = to_pci_dev(dev);
2883         struct dmar_domain *domain;
2884         unsigned long start_pfn, last_pfn;
2885         struct iova *iova;
2886         struct intel_iommu *iommu;
2887
2888         if (iommu_no_mapping(dev))
2889                 return;
2890
2891         domain = find_domain(pdev);
2892         BUG_ON(!domain);
2893
2894         iommu = domain_get_iommu(domain);
2895
2896         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2897         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2898                       (unsigned long long)dev_addr))
2899                 return;
2900
2901         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2902         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2903
2904         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2905                  pci_name(pdev), start_pfn, last_pfn);
2906
2907         /*  clear the whole page */
2908         dma_pte_clear_range(domain, start_pfn, last_pfn);
2909
2910         /* free page tables */
2911         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2912
2913         if (intel_iommu_strict) {
2914                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2915                                       last_pfn - start_pfn + 1, 0);
2916                 /* free iova */
2917                 __free_iova(&domain->iovad, iova);
2918         } else {
2919                 add_unmap(domain, iova);
2920                 /*
2921                  * queue up the release of the unmap to save the 1/6th of the
2922                  * cpu used up by the iotlb flush operation...
2923                  */
2924         }
2925 }
2926
2927 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2928                                   dma_addr_t *dma_handle, gfp_t flags)
2929 {
2930         void *vaddr;
2931         int order;
2932
2933         size = PAGE_ALIGN(size);
2934         order = get_order(size);
2935
2936         if (!iommu_no_mapping(hwdev))
2937                 flags &= ~(GFP_DMA | GFP_DMA32);
2938         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2939                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2940                         flags |= GFP_DMA;
2941                 else
2942                         flags |= GFP_DMA32;
2943         }
2944
2945         vaddr = (void *)__get_free_pages(flags, order);
2946         if (!vaddr)
2947                 return NULL;
2948         memset(vaddr, 0, size);
2949
2950         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2951                                          DMA_BIDIRECTIONAL,
2952                                          hwdev->coherent_dma_mask);
2953         if (*dma_handle)
2954                 return vaddr;
2955         free_pages((unsigned long)vaddr, order);
2956         return NULL;
2957 }
2958
2959 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2960                                 dma_addr_t dma_handle)
2961 {
2962         int order;
2963
2964         size = PAGE_ALIGN(size);
2965         order = get_order(size);
2966
2967         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2968         free_pages((unsigned long)vaddr, order);
2969 }
2970
2971 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2972                            int nelems, enum dma_data_direction dir,
2973                            struct dma_attrs *attrs)
2974 {
2975         struct pci_dev *pdev = to_pci_dev(hwdev);
2976         struct dmar_domain *domain;
2977         unsigned long start_pfn, last_pfn;
2978         struct iova *iova;
2979         struct intel_iommu *iommu;
2980
2981         if (iommu_no_mapping(hwdev))
2982                 return;
2983
2984         domain = find_domain(pdev);
2985         BUG_ON(!domain);
2986
2987         iommu = domain_get_iommu(domain);
2988
2989         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2990         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2991                       (unsigned long long)sglist[0].dma_address))
2992                 return;
2993
2994         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2995         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2996
2997         /*  clear the whole page */
2998         dma_pte_clear_range(domain, start_pfn, last_pfn);
2999
3000         /* free page tables */
3001         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3002
3003         if (intel_iommu_strict) {
3004                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3005                                       last_pfn - start_pfn + 1, 0);
3006                 /* free iova */
3007                 __free_iova(&domain->iovad, iova);
3008         } else {
3009                 add_unmap(domain, iova);
3010                 /*
3011                  * queue up the release of the unmap to save the 1/6th of the
3012                  * cpu used up by the iotlb flush operation...
3013                  */
3014         }
3015 }
3016
3017 static int intel_nontranslate_map_sg(struct device *hddev,
3018         struct scatterlist *sglist, int nelems, int dir)
3019 {
3020         int i;
3021         struct scatterlist *sg;
3022
3023         for_each_sg(sglist, sg, nelems, i) {
3024                 BUG_ON(!sg_page(sg));
3025                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3026                 sg->dma_length = sg->length;
3027         }
3028         return nelems;
3029 }
3030
3031 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3032                         enum dma_data_direction dir, struct dma_attrs *attrs)
3033 {
3034         int i;
3035         struct pci_dev *pdev = to_pci_dev(hwdev);
3036         struct dmar_domain *domain;
3037         size_t size = 0;
3038         int prot = 0;
3039         struct iova *iova = NULL;
3040         int ret;
3041         struct scatterlist *sg;
3042         unsigned long start_vpfn;
3043         struct intel_iommu *iommu;
3044
3045         BUG_ON(dir == DMA_NONE);
3046         if (iommu_no_mapping(hwdev))
3047                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3048
3049         domain = get_valid_domain_for_dev(pdev);
3050         if (!domain)
3051                 return 0;
3052
3053         iommu = domain_get_iommu(domain);
3054
3055         for_each_sg(sglist, sg, nelems, i)
3056                 size += aligned_nrpages(sg->offset, sg->length);
3057
3058         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3059                                 pdev->dma_mask);
3060         if (!iova) {
3061                 sglist->dma_length = 0;
3062                 return 0;
3063         }
3064
3065         /*
3066          * Check if DMAR supports zero-length reads on write only
3067          * mappings..
3068          */
3069         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3070                         !cap_zlr(iommu->cap))
3071                 prot |= DMA_PTE_READ;
3072         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3073                 prot |= DMA_PTE_WRITE;
3074
3075         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3076
3077         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3078         if (unlikely(ret)) {
3079                 /*  clear the page */
3080                 dma_pte_clear_range(domain, start_vpfn,
3081                                     start_vpfn + size - 1);
3082                 /* free page tables */
3083                 dma_pte_free_pagetable(domain, start_vpfn,
3084                                        start_vpfn + size - 1);
3085                 /* free iova */
3086                 __free_iova(&domain->iovad, iova);
3087                 return 0;
3088         }
3089
3090         /* it's a non-present to present mapping. Only flush if caching mode */
3091         if (cap_caching_mode(iommu->cap))
3092                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3093         else
3094                 iommu_flush_write_buffer(iommu);
3095
3096         return nelems;
3097 }
3098
3099 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3100 {
3101         return !dma_addr;
3102 }
3103
3104 struct dma_map_ops intel_dma_ops = {
3105         .alloc_coherent = intel_alloc_coherent,
3106         .free_coherent = intel_free_coherent,
3107         .map_sg = intel_map_sg,
3108         .unmap_sg = intel_unmap_sg,
3109         .map_page = intel_map_page,
3110         .unmap_page = intel_unmap_page,
3111         .mapping_error = intel_mapping_error,
3112 };
3113
3114 static inline int iommu_domain_cache_init(void)
3115 {
3116         int ret = 0;
3117
3118         iommu_domain_cache = kmem_cache_create("iommu_domain",
3119                                          sizeof(struct dmar_domain),
3120                                          0,
3121                                          SLAB_HWCACHE_ALIGN,
3122
3123                                          NULL);
3124         if (!iommu_domain_cache) {
3125                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3126                 ret = -ENOMEM;
3127         }
3128
3129         return ret;
3130 }
3131
3132 static inline int iommu_devinfo_cache_init(void)
3133 {
3134         int ret = 0;
3135
3136         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3137                                          sizeof(struct device_domain_info),
3138                                          0,
3139                                          SLAB_HWCACHE_ALIGN,
3140                                          NULL);
3141         if (!iommu_devinfo_cache) {
3142                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3143                 ret = -ENOMEM;
3144         }
3145
3146         return ret;
3147 }
3148
3149 static inline int iommu_iova_cache_init(void)
3150 {
3151         int ret = 0;
3152
3153         iommu_iova_cache = kmem_cache_create("iommu_iova",
3154                                          sizeof(struct iova),
3155                                          0,
3156                                          SLAB_HWCACHE_ALIGN,
3157                                          NULL);
3158         if (!iommu_iova_cache) {
3159                 printk(KERN_ERR "Couldn't create iova cache\n");
3160                 ret = -ENOMEM;
3161         }
3162
3163         return ret;
3164 }
3165
3166 static int __init iommu_init_mempool(void)
3167 {
3168         int ret;
3169         ret = iommu_iova_cache_init();
3170         if (ret)
3171                 return ret;
3172
3173         ret = iommu_domain_cache_init();
3174         if (ret)
3175                 goto domain_error;
3176
3177         ret = iommu_devinfo_cache_init();
3178         if (!ret)
3179                 return ret;
3180
3181         kmem_cache_destroy(iommu_domain_cache);
3182 domain_error:
3183         kmem_cache_destroy(iommu_iova_cache);
3184
3185         return -ENOMEM;
3186 }
3187
3188 static void __init iommu_exit_mempool(void)
3189 {
3190         kmem_cache_destroy(iommu_devinfo_cache);
3191         kmem_cache_destroy(iommu_domain_cache);
3192         kmem_cache_destroy(iommu_iova_cache);
3193
3194 }
3195
3196 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3197 {
3198         struct dmar_drhd_unit *drhd;
3199         u32 vtbar;
3200         int rc;
3201
3202         /* We know that this device on this chipset has its own IOMMU.
3203          * If we find it under a different IOMMU, then the BIOS is lying
3204          * to us. Hope that the IOMMU for this device is actually
3205          * disabled, and it needs no translation...
3206          */
3207         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3208         if (rc) {
3209                 /* "can't" happen */
3210                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3211                 return;
3212         }
3213         vtbar &= 0xffff0000;
3214
3215         /* we know that the this iommu should be at offset 0xa000 from vtbar */
3216         drhd = dmar_find_matched_drhd_unit(pdev);
3217         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3218                             TAINT_FIRMWARE_WORKAROUND,
3219                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3220                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3221 }
3222 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3223
3224 static void __init init_no_remapping_devices(void)
3225 {
3226         struct dmar_drhd_unit *drhd;
3227
3228         for_each_drhd_unit(drhd) {
3229                 if (!drhd->include_all) {
3230                         int i;
3231                         for (i = 0; i < drhd->devices_cnt; i++)
3232                                 if (drhd->devices[i] != NULL)
3233                                         break;
3234                         /* ignore DMAR unit if no pci devices exist */
3235                         if (i == drhd->devices_cnt)
3236                                 drhd->ignored = 1;
3237                 }
3238         }
3239
3240         for_each_drhd_unit(drhd) {
3241                 int i;
3242                 if (drhd->ignored || drhd->include_all)
3243                         continue;
3244
3245                 for (i = 0; i < drhd->devices_cnt; i++)
3246                         if (drhd->devices[i] &&
3247                             !IS_GFX_DEVICE(drhd->devices[i]))
3248                                 break;
3249
3250                 if (i < drhd->devices_cnt)
3251                         continue;
3252
3253                 /* This IOMMU has *only* gfx devices. Either bypass it or
3254                    set the gfx_mapped flag, as appropriate */
3255                 if (dmar_map_gfx) {
3256                         intel_iommu_gfx_mapped = 1;
3257                 } else {
3258                         drhd->ignored = 1;
3259                         for (i = 0; i < drhd->devices_cnt; i++) {
3260                                 if (!drhd->devices[i])
3261                                         continue;
3262                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3263                         }
3264                 }
3265         }
3266 }
3267
3268 #ifdef CONFIG_SUSPEND
3269 static int init_iommu_hw(void)
3270 {
3271         struct dmar_drhd_unit *drhd;
3272         struct intel_iommu *iommu = NULL;
3273
3274         for_each_active_iommu(iommu, drhd)
3275                 if (iommu->qi)
3276                         dmar_reenable_qi(iommu);
3277
3278         for_each_iommu(iommu, drhd) {
3279                 if (drhd->ignored) {
3280                         /*
3281                          * we always have to disable PMRs or DMA may fail on
3282                          * this device
3283                          */
3284                         if (force_on)
3285                                 iommu_disable_protect_mem_regions(iommu);
3286                         continue;
3287                 }
3288         
3289                 iommu_flush_write_buffer(iommu);
3290
3291                 iommu_set_root_entry(iommu);
3292
3293                 iommu->flush.flush_context(iommu, 0, 0, 0,
3294                                            DMA_CCMD_GLOBAL_INVL);
3295                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3296                                          DMA_TLB_GLOBAL_FLUSH);
3297                 if (iommu_enable_translation(iommu))
3298                         return 1;
3299                 iommu_disable_protect_mem_regions(iommu);
3300         }
3301
3302         return 0;
3303 }
3304
3305 static void iommu_flush_all(void)
3306 {
3307         struct dmar_drhd_unit *drhd;
3308         struct intel_iommu *iommu;
3309
3310         for_each_active_iommu(iommu, drhd) {
3311                 iommu->flush.flush_context(iommu, 0, 0, 0,
3312                                            DMA_CCMD_GLOBAL_INVL);
3313                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3314                                          DMA_TLB_GLOBAL_FLUSH);
3315         }
3316 }
3317
3318 static int iommu_suspend(void)
3319 {
3320         struct dmar_drhd_unit *drhd;
3321         struct intel_iommu *iommu = NULL;
3322         unsigned long flag;
3323
3324         for_each_active_iommu(iommu, drhd) {
3325                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3326                                                  GFP_ATOMIC);
3327                 if (!iommu->iommu_state)
3328                         goto nomem;
3329         }
3330
3331         iommu_flush_all();
3332
3333         for_each_active_iommu(iommu, drhd) {
3334                 iommu_disable_translation(iommu);
3335
3336                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3337
3338                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3339                         readl(iommu->reg + DMAR_FECTL_REG);
3340                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3341                         readl(iommu->reg + DMAR_FEDATA_REG);
3342                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3343                         readl(iommu->reg + DMAR_FEADDR_REG);
3344                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3345                         readl(iommu->reg + DMAR_FEUADDR_REG);
3346
3347                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3348         }
3349         return 0;
3350
3351 nomem:
3352         for_each_active_iommu(iommu, drhd)
3353                 kfree(iommu->iommu_state);
3354
3355         return -ENOMEM;
3356 }
3357
3358 static void iommu_resume(void)
3359 {
3360         struct dmar_drhd_unit *drhd;
3361         struct intel_iommu *iommu = NULL;
3362         unsigned long flag;
3363
3364         if (init_iommu_hw()) {
3365                 if (force_on)
3366                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3367                 else
3368                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3369                 return;
3370         }
3371
3372         for_each_active_iommu(iommu, drhd) {
3373
3374                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3375
3376                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3377                         iommu->reg + DMAR_FECTL_REG);
3378                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3379                         iommu->reg + DMAR_FEDATA_REG);
3380                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3381                         iommu->reg + DMAR_FEADDR_REG);
3382                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3383                         iommu->reg + DMAR_FEUADDR_REG);
3384
3385                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3386         }
3387
3388         for_each_active_iommu(iommu, drhd)
3389                 kfree(iommu->iommu_state);
3390 }
3391
3392 static struct syscore_ops iommu_syscore_ops = {
3393         .resume         = iommu_resume,
3394         .suspend        = iommu_suspend,
3395 };
3396
3397 static void __init init_iommu_pm_ops(void)
3398 {
3399         register_syscore_ops(&iommu_syscore_ops);
3400 }
3401
3402 #else
/* Without CONFIG_PM there are no suspend/resume callbacks to register. */
static inline void init_iommu_pm_ops(void)
{
}
3404 #endif  /* CONFIG_PM */
3405
3406 LIST_HEAD(dmar_rmrr_units);
3407
3408 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3409 {
3410         list_add(&rmrr->list, &dmar_rmrr_units);
3411 }
3412
3413
3414 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3415 {
3416         struct acpi_dmar_reserved_memory *rmrr;
3417         struct dmar_rmrr_unit *rmrru;
3418
3419         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3420         if (!rmrru)
3421                 return -ENOMEM;
3422
3423         rmrru->hdr = header;
3424         rmrr = (struct acpi_dmar_reserved_memory *)header;
3425         rmrru->base_address = rmrr->base_address;
3426         rmrru->end_address = rmrr->end_address;
3427
3428         dmar_register_rmrr_unit(rmrru);
3429         return 0;
3430 }
3431
3432 static int __init
3433 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3434 {
3435         struct acpi_dmar_reserved_memory *rmrr;
3436         int ret;
3437
3438         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3439         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3440                 ((void *)rmrr) + rmrr->header.length,
3441                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3442
3443         if (ret || (rmrru->devices_cnt == 0)) {
3444                 list_del(&rmrru->list);
3445                 kfree(rmrru);
3446         }
3447         return ret;
3448 }
3449
3450 static LIST_HEAD(dmar_atsr_units);
3451
3452 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3453 {
3454         struct acpi_dmar_atsr *atsr;
3455         struct dmar_atsr_unit *atsru;
3456
3457         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3458         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3459         if (!atsru)
3460                 return -ENOMEM;
3461
3462         atsru->hdr = hdr;
3463         atsru->include_all = atsr->flags & 0x1;
3464
3465         list_add(&atsru->list, &dmar_atsr_units);
3466
3467         return 0;
3468 }
3469
3470 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3471 {
3472         int rc;
3473         struct acpi_dmar_atsr *atsr;
3474
3475         if (atsru->include_all)
3476                 return 0;
3477
3478         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3479         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3480                                 (void *)atsr + atsr->header.length,
3481                                 &atsru->devices_cnt, &atsru->devices,
3482                                 atsr->segment);
3483         if (rc || !atsru->devices_cnt) {
3484                 list_del(&atsru->list);
3485                 kfree(atsru);
3486         }
3487
3488         return rc;
3489 }
3490
3491 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3492 {
3493         int i;
3494         struct pci_bus *bus;
3495         struct acpi_dmar_atsr *atsr;
3496         struct dmar_atsr_unit *atsru;
3497
3498         dev = pci_physfn(dev);
3499
3500         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3501                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3502                 if (atsr->segment == pci_domain_nr(dev->bus))
3503                         goto found;
3504         }
3505
3506         return 0;
3507
3508 found:
3509         for (bus = dev->bus; bus; bus = bus->parent) {
3510                 struct pci_dev *bridge = bus->self;
3511
3512                 if (!bridge || !pci_is_pcie(bridge) ||
3513                     bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3514                         return 0;
3515
3516                 if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3517                         for (i = 0; i < atsru->devices_cnt; i++)
3518                                 if (atsru->devices[i] == bridge)
3519                                         return 1;
3520                         break;
3521                 }
3522         }
3523
3524         if (atsru->include_all)
3525                 return 1;
3526
3527         return 0;
3528 }
3529
3530 int dmar_parse_rmrr_atsr_dev(void)
3531 {
3532         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3533         struct dmar_atsr_unit *atsr, *atsr_n;
3534         int ret = 0;
3535
3536         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3537                 ret = rmrr_parse_dev(rmrr);
3538                 if (ret)
3539                         return ret;
3540         }
3541
3542         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3543                 ret = atsr_parse_dev(atsr);
3544                 if (ret)
3545                         return ret;
3546         }
3547
3548         return ret;
3549 }
3550
3551 /*
3552  * Here we only respond to action of unbound device from driver.
3553  *
3554  * Added device is not attached to its DMAR domain here yet. That will happen
3555  * when mapping the device to iova.
3556  */
3557 static int device_notifier(struct notifier_block *nb,
3558                                   unsigned long action, void *data)
3559 {
3560         struct device *dev = data;
3561         struct pci_dev *pdev = to_pci_dev(dev);
3562         struct dmar_domain *domain;
3563
3564         if (iommu_no_mapping(dev))
3565                 return 0;
3566
3567         domain = find_domain(pdev);
3568         if (!domain)
3569                 return 0;
3570
3571         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3572                 domain_remove_one_dev_info(domain, pdev);
3573
3574                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3575                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3576                     list_empty(&domain->devices))
3577                         domain_exit(domain);
3578         }
3579
3580         return 0;
3581 }
3582
3583 static struct notifier_block device_nb = {
3584         .notifier_call = device_notifier,
3585 };
3586
3587 int __init intel_iommu_init(void)
3588 {
3589         int ret = 0;
3590
3591         /* VT-d is required for a TXT/tboot launch, so enforce that */
3592         force_on = tboot_force_iommu();
3593
3594         if (dmar_table_init()) {
3595                 if (force_on)
3596                         panic("tboot: Failed to initialize DMAR table\n");
3597                 return  -ENODEV;
3598         }
3599
3600         if (dmar_dev_scope_init() < 0) {
3601                 if (force_on)
3602                         panic("tboot: Failed to initialize DMAR device scope\n");
3603                 return  -ENODEV;
3604         }
3605
3606         if (no_iommu || dmar_disabled)
3607                 return -ENODEV;
3608
3609         if (iommu_init_mempool()) {
3610                 if (force_on)
3611                         panic("tboot: Failed to initialize iommu memory\n");
3612                 return  -ENODEV;
3613         }
3614
3615         if (list_empty(&dmar_rmrr_units))
3616                 printk(KERN_INFO "DMAR: No RMRR found\n");
3617
3618         if (list_empty(&dmar_atsr_units))
3619                 printk(KERN_INFO "DMAR: No ATSR found\n");
3620
3621         if (dmar_init_reserved_ranges()) {
3622                 if (force_on)
3623                         panic("tboot: Failed to reserve iommu ranges\n");
3624                 return  -ENODEV;
3625         }
3626
3627         init_no_remapping_devices();
3628
3629         ret = init_dmars();
3630         if (ret) {
3631                 if (force_on)
3632                         panic("tboot: Failed to initialize DMARs\n");
3633                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3634                 put_iova_domain(&reserved_iova_list);
3635                 iommu_exit_mempool();
3636                 return ret;
3637         }
3638         printk(KERN_INFO
3639         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3640
3641         init_timer(&unmap_timer);
3642 #ifdef CONFIG_SWIOTLB
3643         swiotlb = 0;
3644 #endif
3645         dma_ops = &intel_dma_ops;
3646
3647         init_iommu_pm_ops();
3648
3649         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3650
3651         bus_register_notifier(&pci_bus_type, &device_nb);
3652
3653         intel_iommu_enabled = 1;
3654
3655         return 0;
3656 }
3657
3658 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3659                                            struct pci_dev *pdev)
3660 {
3661         struct pci_dev *tmp, *parent;
3662
3663         if (!iommu || !pdev)
3664                 return;
3665
3666         /* dependent device detach */
3667         tmp = pci_find_upstream_pcie_bridge(pdev);
3668         /* Secondary interface's bus number and devfn 0 */
3669         if (tmp) {
3670                 parent = pdev->bus->self;
3671                 while (parent != tmp) {
3672                         iommu_detach_dev(iommu, parent->bus->number,
3673                                          parent->devfn);
3674                         parent = parent->bus->self;
3675                 }
3676                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3677                         iommu_detach_dev(iommu,
3678                                 tmp->subordinate->number, 0);
3679                 else /* this is a legacy PCI bridge */
3680                         iommu_detach_dev(iommu, tmp->bus->number,
3681                                          tmp->devfn);
3682         }
3683 }
3684
3685 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3686                                           struct pci_dev *pdev)
3687 {
3688         struct device_domain_info *info;
3689         struct intel_iommu *iommu;
3690         unsigned long flags;
3691         int found = 0;
3692         struct list_head *entry, *tmp;
3693
3694         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3695                                 pdev->devfn);
3696         if (!iommu)
3697                 return;
3698
3699         spin_lock_irqsave(&device_domain_lock, flags);
3700         list_for_each_safe(entry, tmp, &domain->devices) {
3701                 info = list_entry(entry, struct device_domain_info, link);
3702                 if (info->segment == pci_domain_nr(pdev->bus) &&
3703                     info->bus == pdev->bus->number &&
3704                     info->devfn == pdev->devfn) {
3705                         list_del(&info->link);
3706                         list_del(&info->global);
3707                         if (info->dev)
3708                                 info->dev->dev.archdata.iommu = NULL;
3709                         spin_unlock_irqrestore(&device_domain_lock, flags);
3710
3711                         iommu_disable_dev_iotlb(info);
3712                         iommu_detach_dev(iommu, info->bus, info->devfn);
3713                         iommu_detach_dependent_devices(iommu, pdev);
3714                         free_devinfo_mem(info);
3715
3716                         spin_lock_irqsave(&device_domain_lock, flags);
3717
3718                         if (found)
3719                                 break;
3720                         else
3721                                 continue;
3722                 }
3723
3724                 /* if there is no other devices under the same iommu
3725                  * owned by this domain, clear this iommu in iommu_bmp
3726                  * update iommu count and coherency
3727                  */
3728                 if (iommu == device_to_iommu(info->segment, info->bus,
3729                                             info->devfn))
3730                         found = 1;
3731         }
3732
3733         spin_unlock_irqrestore(&device_domain_lock, flags);
3734
3735         if (found == 0) {
3736                 unsigned long tmp_flags;
3737                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3738                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3739                 domain->iommu_count--;
3740                 domain_update_iommu_cap(domain);
3741                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3742
3743                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3744                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3745                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3746                         clear_bit(domain->id, iommu->domain_ids);
3747                         iommu->domains[domain->id] = NULL;
3748                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3749                 }
3750         }
3751 }
3752
3753 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3754 {
3755         struct device_domain_info *info;
3756         struct intel_iommu *iommu;
3757         unsigned long flags1, flags2;
3758
3759         spin_lock_irqsave(&device_domain_lock, flags1);
3760         while (!list_empty(&domain->devices)) {
3761                 info = list_entry(domain->devices.next,
3762                         struct device_domain_info, link);
3763                 list_del(&info->link);
3764                 list_del(&info->global);
3765                 if (info->dev)
3766                         info->dev->dev.archdata.iommu = NULL;
3767
3768                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3769
3770                 iommu_disable_dev_iotlb(info);
3771                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3772                 iommu_detach_dev(iommu, info->bus, info->devfn);
3773                 iommu_detach_dependent_devices(iommu, info->dev);
3774
3775                 /* clear this iommu in iommu_bmp, update iommu count
3776                  * and capabilities
3777                  */
3778                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3779                 if (test_and_clear_bit(iommu->seq_id,
3780                                        &domain->iommu_bmp)) {
3781                         domain->iommu_count--;
3782                         domain_update_iommu_cap(domain);
3783                 }
3784                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3785
3786                 free_devinfo_mem(info);
3787                 spin_lock_irqsave(&device_domain_lock, flags1);
3788         }
3789         spin_unlock_irqrestore(&device_domain_lock, flags1);
3790 }
3791
3792 /* domain id for virtual machine, it won't be set in context */
3793 static unsigned long vm_domid;
3794
3795 static struct dmar_domain *iommu_alloc_vm_domain(void)
3796 {
3797         struct dmar_domain *domain;
3798
3799         domain = alloc_domain_mem();
3800         if (!domain)
3801                 return NULL;
3802
3803         domain->id = vm_domid++;
3804         domain->nid = -1;
3805         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3806         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3807
3808         return domain;
3809 }
3810
3811 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3812 {
3813         int adjust_width;
3814
3815         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3816         spin_lock_init(&domain->iommu_lock);
3817
3818         domain_reserve_special_ranges(domain);
3819
3820         /* calculate AGAW */
3821         domain->gaw = guest_width;
3822         adjust_width = guestwidth_to_adjustwidth(guest_width);
3823         domain->agaw = width_to_agaw(adjust_width);
3824
3825         INIT_LIST_HEAD(&domain->devices);
3826
3827         domain->iommu_count = 0;
3828         domain->iommu_coherency = 0;
3829         domain->iommu_snooping = 0;
3830         domain->iommu_superpage = 0;
3831         domain->max_addr = 0;
3832         domain->nid = -1;
3833
3834         /* always allocate the top pgd */
3835         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3836         if (!domain->pgd)
3837                 return -ENOMEM;
3838         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3839         return 0;
3840 }
3841
3842 static void iommu_free_vm_domain(struct dmar_domain *domain)
3843 {
3844         unsigned long flags;
3845         struct dmar_drhd_unit *drhd;
3846         struct intel_iommu *iommu;
3847         unsigned long i;
3848         unsigned long ndomains;
3849
3850         for_each_drhd_unit(drhd) {
3851                 if (drhd->ignored)
3852                         continue;
3853                 iommu = drhd->iommu;
3854
3855                 ndomains = cap_ndoms(iommu->cap);
3856                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3857                         if (iommu->domains[i] == domain) {
3858                                 spin_lock_irqsave(&iommu->lock, flags);
3859                                 clear_bit(i, iommu->domain_ids);
3860                                 iommu->domains[i] = NULL;
3861                                 spin_unlock_irqrestore(&iommu->lock, flags);
3862                                 break;
3863                         }
3864                 }
3865         }
3866 }
3867
3868 static void vm_domain_exit(struct dmar_domain *domain)
3869 {
3870         /* Domain 0 is reserved, so dont process it */
3871         if (!domain)
3872                 return;
3873
3874         vm_domain_remove_all_dev_info(domain);
3875         /* destroy iovas */
3876         put_iova_domain(&domain->iovad);
3877
3878         /* clear ptes */
3879         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3880
3881         /* free page tables */
3882         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3883
3884         iommu_free_vm_domain(domain);
3885         free_domain_mem(domain);
3886 }
3887
3888 static int intel_iommu_domain_init(struct iommu_domain *domain)
3889 {
3890         struct dmar_domain *dmar_domain;
3891
3892         dmar_domain = iommu_alloc_vm_domain();
3893         if (!dmar_domain) {
3894                 printk(KERN_ERR
3895                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3896                 return -ENOMEM;
3897         }
3898         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3899                 printk(KERN_ERR
3900                         "intel_iommu_domain_init() failed\n");
3901                 vm_domain_exit(dmar_domain);
3902                 return -ENOMEM;
3903         }
3904         domain_update_iommu_cap(dmar_domain);
3905         domain->priv = dmar_domain;
3906
3907         return 0;
3908 }
3909
3910 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3911 {
3912         struct dmar_domain *dmar_domain = domain->priv;
3913
3914         domain->priv = NULL;
3915         vm_domain_exit(dmar_domain);
3916 }
3917
3918 static int intel_iommu_attach_device(struct iommu_domain *domain,
3919                                      struct device *dev)
3920 {
3921         struct dmar_domain *dmar_domain = domain->priv;
3922         struct pci_dev *pdev = to_pci_dev(dev);
3923         struct intel_iommu *iommu;
3924         int addr_width;
3925
3926         /* normally pdev is not mapped */
3927         if (unlikely(domain_context_mapped(pdev))) {
3928                 struct dmar_domain *old_domain;
3929
3930                 old_domain = find_domain(pdev);
3931                 if (old_domain) {
3932                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3933                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3934                                 domain_remove_one_dev_info(old_domain, pdev);
3935                         else
3936                                 domain_remove_dev_info(old_domain);
3937                 }
3938         }
3939
3940         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3941                                 pdev->devfn);
3942         if (!iommu)
3943                 return -ENODEV;
3944
3945         /* check if this iommu agaw is sufficient for max mapped address */
3946         addr_width = agaw_to_width(iommu->agaw);
3947         if (addr_width > cap_mgaw(iommu->cap))
3948                 addr_width = cap_mgaw(iommu->cap);
3949
3950         if (dmar_domain->max_addr > (1LL << addr_width)) {
3951                 printk(KERN_ERR "%s: iommu width (%d) is not "
3952                        "sufficient for the mapped address (%llx)\n",
3953                        __func__, addr_width, dmar_domain->max_addr);
3954                 return -EFAULT;
3955         }
3956         dmar_domain->gaw = addr_width;
3957
3958         /*
3959          * Knock out extra levels of page tables if necessary
3960          */
3961         while (iommu->agaw < dmar_domain->agaw) {
3962                 struct dma_pte *pte;
3963
3964                 pte = dmar_domain->pgd;
3965                 if (dma_pte_present(pte)) {
3966                         dmar_domain->pgd = (struct dma_pte *)
3967                                 phys_to_virt(dma_pte_addr(pte));
3968                         free_pgtable_page(pte);
3969                 }
3970                 dmar_domain->agaw--;
3971         }
3972
3973         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3974 }
3975
3976 static void intel_iommu_detach_device(struct iommu_domain *domain,
3977                                       struct device *dev)
3978 {
3979         struct dmar_domain *dmar_domain = domain->priv;
3980         struct pci_dev *pdev = to_pci_dev(dev);
3981
3982         domain_remove_one_dev_info(dmar_domain, pdev);
3983 }
3984
3985 static int intel_iommu_map(struct iommu_domain *domain,
3986                            unsigned long iova, phys_addr_t hpa,
3987                            int gfp_order, int iommu_prot)
3988 {
3989         struct dmar_domain *dmar_domain = domain->priv;
3990         u64 max_addr;
3991         int prot = 0;
3992         size_t size;
3993         int ret;
3994
3995         if (iommu_prot & IOMMU_READ)
3996                 prot |= DMA_PTE_READ;
3997         if (iommu_prot & IOMMU_WRITE)
3998                 prot |= DMA_PTE_WRITE;
3999         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4000                 prot |= DMA_PTE_SNP;
4001
4002         size     = PAGE_SIZE << gfp_order;
4003         max_addr = iova + size;
4004         if (dmar_domain->max_addr < max_addr) {
4005                 u64 end;
4006
4007                 /* check if minimum agaw is sufficient for mapped address */
4008                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4009                 if (end < max_addr) {
4010                         printk(KERN_ERR "%s: iommu width (%d) is not "
4011                                "sufficient for the mapped address (%llx)\n",
4012                                __func__, dmar_domain->gaw, max_addr);
4013                         return -EFAULT;
4014                 }
4015                 dmar_domain->max_addr = max_addr;
4016         }
4017         /* Round up size to next multiple of PAGE_SIZE, if it and
4018            the low bits of hpa would take us onto the next page */
4019         size = aligned_nrpages(hpa, size);
4020         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4021                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4022         return ret;
4023 }
4024
4025 static int intel_iommu_unmap(struct iommu_domain *domain,
4026                              unsigned long iova, int gfp_order)
4027 {
4028         struct dmar_domain *dmar_domain = domain->priv;
4029         size_t size = PAGE_SIZE << gfp_order;
4030         int order;
4031
4032         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4033                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4034
4035         if (dmar_domain->max_addr == iova + size)
4036                 dmar_domain->max_addr = iova;
4037
4038         return order;
4039 }
4040
4041 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4042                                             unsigned long iova)
4043 {
4044         struct dmar_domain *dmar_domain = domain->priv;
4045         struct dma_pte *pte;
4046         u64 phys = 0;
4047
4048         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4049         if (pte)
4050                 phys = dma_pte_addr(pte);
4051
4052         return phys;
4053 }
4054
4055 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4056                                       unsigned long cap)
4057 {
4058         struct dmar_domain *dmar_domain = domain->priv;
4059
4060         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4061                 return dmar_domain->iommu_snooping;
4062         if (cap == IOMMU_CAP_INTR_REMAP)
4063                 return intr_remapping_enabled;
4064
4065         return 0;
4066 }
4067
4068 static struct iommu_ops intel_iommu_ops = {
4069         .domain_init    = intel_iommu_domain_init,
4070         .domain_destroy = intel_iommu_domain_destroy,
4071         .attach_dev     = intel_iommu_attach_device,
4072         .detach_dev     = intel_iommu_detach_device,
4073         .map            = intel_iommu_map,
4074         .unmap          = intel_iommu_unmap,
4075         .iova_to_phys   = intel_iommu_iova_to_phys,
4076         .domain_has_cap = intel_iommu_domain_has_cap,
4077 };
4078
4079 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4080 {
4081         /*
4082          * Mobile 4 Series Chipset neglects to set RWBF capability,
4083          * but needs it:
4084          */
4085         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4086         rwbf_quirk = 1;
4087
4088         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4089         if (dev->revision == 0x07) {
4090                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4091                 dmar_map_gfx = 0;
4092         }
4093 }
4094
4095 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4096
4097 #define GGC 0x52
4098 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4099 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4100 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4101 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4102 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4103 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4104 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4105 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4106
4107 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4108 {
4109         unsigned short ggc;
4110
4111         if (pci_read_config_word(dev, GGC, &ggc))
4112                 return;
4113
4114         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4115                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4116                 dmar_map_gfx = 0;
4117         } else if (dmar_map_gfx) {
4118                 /* we have to ensure the gfx device is idle before we flush */
4119                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4120                 intel_iommu_strict = 1;
4121        }
4122 }
4123 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4124 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4125 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4126 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4127
4128 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4129    ISOCH DMAR unit for the Azalia sound device, but not give it any
4130    TLB entries, which causes it to deadlock. Check for that.  We do
4131    this in a function called from init_dmars(), instead of in a PCI
4132    quirk, because we don't want to print the obnoxious "BIOS broken"
4133    message if VT-d is actually disabled.
4134 */
4135 static void __init check_tylersburg_isoch(void)
4136 {
4137         struct pci_dev *pdev;
4138         uint32_t vtisochctrl;
4139
4140         /* If there's no Azalia in the system anyway, forget it. */
4141         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4142         if (!pdev)
4143                 return;
4144         pci_dev_put(pdev);
4145
4146         /* System Management Registers. Might be hidden, in which case
4147            we can't do the sanity check. But that's OK, because the
4148            known-broken BIOSes _don't_ actually hide it, so far. */
4149         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4150         if (!pdev)
4151                 return;
4152
4153         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4154                 pci_dev_put(pdev);
4155                 return;
4156         }
4157
4158         pci_dev_put(pdev);
4159
4160         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4161         if (vtisochctrl & 1)
4162                 return;
4163
4164         /* Drop all bits other than the number of TLB entries */
4165         vtisochctrl &= 0x1c;
4166
4167         /* If we have the recommended number of TLB entries (16), fine. */
4168         if (vtisochctrl == 0x10)
4169                 return;
4170
4171         /* Zero TLB entries? You get to ride the short bus to school. */
4172         if (!vtisochctrl) {
4173                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4174                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4175                      dmi_get_system_info(DMI_BIOS_VENDOR),
4176                      dmi_get_system_info(DMI_BIOS_VERSION),
4177                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4178                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4179                 return;
4180         }
4181         
4182         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4183                vtisochctrl);
4184 }