drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
41 #include "pci.h"
42
43 #define ROOT_SIZE               VTD_PAGE_SIZE
44 #define CONTEXT_SIZE            VTD_PAGE_SIZE
45
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48
49 #define IOAPIC_RANGE_START      (0xfee00000)
50 #define IOAPIC_RANGE_END        (0xfeefffff)
51 #define IOVA_START_ADDR         (0x1000)
52
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56
57 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
60
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
63
64 /*
65  * 0: Present
66  * 1-11: Reserved
67  * 12-63: Context Ptr (12 - (haw-1))
68  * 64-127: Reserved
69  */
70 struct root_entry {
71         u64     val;
72         u64     rsvd1;
73 };
74 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
75 static inline bool root_present(struct root_entry *root)
76 {
77         return (root->val & 1);
78 }
79 static inline void set_root_present(struct root_entry *root)
80 {
81         root->val |= 1;
82 }
83 static inline void set_root_value(struct root_entry *root, unsigned long value)
84 {
85         root->val |= value & VTD_PAGE_MASK;
86 }
87
88 static inline struct context_entry *
89 get_context_addr_from_root(struct root_entry *root)
90 {
91         return (struct context_entry *)
92                 (root_present(root)?phys_to_virt(
93                 root->val & VTD_PAGE_MASK) :
94                 NULL);
95 }
96
97 /*
98  * low 64 bits:
99  * 0: present
100  * 1: fault processing disable
101  * 2-3: translation type
102  * 12-63: address space root
103  * high 64 bits:
104  * 0-2: address width
105  * 3-6: avail
106  * 8-23: domain id
107  */
108 struct context_entry {
109         u64 lo;
110         u64 hi;
111 };
112
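/*
 * Illustrative example (not part of the driver): starting from a zeroed
 * entry, programming a context for domain id 5 with a 4-level page table
 * (address width value 2) rooted at physical address 0x12345000, using
 * the helpers below:
 *
 *      context_set_domain_id(c, 5);             // hi |= 5 << 8
 *      context_set_address_width(c, 2);         // hi |= 2
 *      context_set_address_root(c, 0x12345000); // lo |= root & VTD_PAGE_MASK
 *      context_set_translation_type(c, CONTEXT_TT_MULTI_LEVEL);
 *      context_set_present(c);                  // lo |= 1
 *
 * yields lo == 0x12345001 and hi == 0x502.
 */
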
113 static inline bool context_present(struct context_entry *context)
114 {
115         return (context->lo & 1);
116 }
117 static inline void context_set_present(struct context_entry *context)
118 {
119         context->lo |= 1;
120 }
121
122 static inline void context_set_fault_enable(struct context_entry *context)
123 {
124         context->lo &= (((u64)-1) << 2) | 1;
125 }
126
127 #define CONTEXT_TT_MULTI_LEVEL 0
128
129 static inline void context_set_translation_type(struct context_entry *context,
130                                                 unsigned long value)
131 {
132         context->lo &= (((u64)-1) << 4) | 3;
133         context->lo |= (value & 3) << 2;
134 }
135
136 static inline void context_set_address_root(struct context_entry *context,
137                                             unsigned long value)
138 {
139         context->lo |= value & VTD_PAGE_MASK;
140 }
141
142 static inline void context_set_address_width(struct context_entry *context,
143                                              unsigned long value)
144 {
145         context->hi |= value & 7;
146 }
147
148 static inline void context_set_domain_id(struct context_entry *context,
149                                          unsigned long value)
150 {
151         context->hi |= (value & ((1 << 16) - 1)) << 8;
152 }
153
154 static inline void context_clear_entry(struct context_entry *context)
155 {
156         context->lo = 0;
157         context->hi = 0;
158 }
159
160 /*
161  * 0: readable
162  * 1: writable
163  * 2-6: reserved
164  * 7: super page
165  * 8-11: available
166  * 12-63: Host physical address
167  */
168 struct dma_pte {
169         u64 val;
170 };
171
172 static inline void dma_clear_pte(struct dma_pte *pte)
173 {
174         pte->val = 0;
175 }
176
177 static inline void dma_set_pte_readable(struct dma_pte *pte)
178 {
179         pte->val |= DMA_PTE_READ;
180 }
181
182 static inline void dma_set_pte_writable(struct dma_pte *pte)
183 {
184         pte->val |= DMA_PTE_WRITE;
185 }
186
187 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
188 {
189         pte->val = (pte->val & ~3) | (prot & 3);
190 }
191
192 static inline u64 dma_pte_addr(struct dma_pte *pte)
193 {
194         return (pte->val & VTD_PAGE_MASK);
195 }
196
197 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
198 {
199         pte->val |= (addr & VTD_PAGE_MASK);
200 }
201
202 static inline bool dma_pte_present(struct dma_pte *pte)
203 {
204         return (pte->val & 3) != 0;
205 }
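
/*
 * Illustrative example (values are made up): per the layout above, a leaf
 * PTE mapping a read/write page at host physical address 0xabcd000 holds
 * val == 0xabcd000 | DMA_PTE_READ | DMA_PTE_WRITE == 0xabcd003, so
 * dma_pte_present() returns true and dma_pte_addr() returns 0xabcd000.
 */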
206
207 /* devices under the same p2p bridge are owned in one domain */
208 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
209
210 /* domain represents a virtual machine; more than one device
211  * across iommus may be owned in one domain, e.g. a kvm guest.
212  */
213 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
214
215 struct dmar_domain {
216         int     id;                     /* domain id */
217         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
218
219         struct list_head devices;       /* all devices' list */
220         struct iova_domain iovad;       /* iova's that belong to this domain */
221
222         struct dma_pte  *pgd;           /* virtual address */
223         spinlock_t      mapping_lock;   /* page table lock */
224         int             gaw;            /* max guest address width */
225
226         /* adjusted guest address width, 0 is level 2 30-bit */
227         int             agaw;
228
229         int             flags;          /* flags to find out type of domain */
230
231         int             iommu_coherency;/* indicate coherency of iommu access */
232         int             iommu_count;    /* reference count of iommu */
233         spinlock_t      iommu_lock;     /* protect iommu set in domain */
234         u64             max_addr;       /* maximum mapped address */
235 };
236
237 /* PCI domain-device relationship */
238 struct device_domain_info {
239         struct list_head link;  /* link to domain siblings */
240         struct list_head global; /* link to global list */
241         u8 bus;                 /* PCI bus number */
242         u8 devfn;               /* PCI devfn number */
243         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
244         struct dmar_domain *domain; /* pointer to domain */
245 };
246
247 static void flush_unmaps_timeout(unsigned long data);
248
249 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
250
251 #define HIGH_WATER_MARK 250
252 struct deferred_flush_tables {
253         int next;
254         struct iova *iova[HIGH_WATER_MARK];
255         struct dmar_domain *domain[HIGH_WATER_MARK];
256 };
257
258 static struct deferred_flush_tables *deferred_flush;
259
260 /* bitmap for indexing intel_iommus */
261 static int g_num_of_iommus;
262
263 static DEFINE_SPINLOCK(async_umap_flush_lock);
264 static LIST_HEAD(unmaps_to_do);
265
266 static int timer_on;
267 static long list_size;
268
269 static void domain_remove_dev_info(struct dmar_domain *domain);
270
271 int dmar_disabled;
272 static int __initdata dmar_map_gfx = 1;
273 static int dmar_forcedac;
274 static int intel_iommu_strict;
275
276 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
277 static DEFINE_SPINLOCK(device_domain_lock);
278 static LIST_HEAD(device_domain_list);
279
280 static struct iommu_ops intel_iommu_ops;
281
282 static int __init intel_iommu_setup(char *str)
283 {
284         if (!str)
285                 return -EINVAL;
286         while (*str) {
287                 if (!strncmp(str, "off", 3)) {
288                         dmar_disabled = 1;
289                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
290                 } else if (!strncmp(str, "igfx_off", 8)) {
291                         dmar_map_gfx = 0;
292                         printk(KERN_INFO
293                                 "Intel-IOMMU: disable GFX device mapping\n");
294                 } else if (!strncmp(str, "forcedac", 8)) {
295                         printk(KERN_INFO
296                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
297                         dmar_forcedac = 1;
298                 } else if (!strncmp(str, "strict", 6)) {
299                         printk(KERN_INFO
300                                 "Intel-IOMMU: disable batched IOTLB flush\n");
301                         intel_iommu_strict = 1;
302                 }
303
304                 str += strcspn(str, ",");
305                 while (*str == ',')
306                         str++;
307         }
308         return 0;
309 }
310 __setup("intel_iommu=", intel_iommu_setup);
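
/*
 * Example usage on the kernel command line, as parsed above (options are
 * comma separated and handled in order):
 *
 *      intel_iommu=off                 disable Intel IOMMU support
 *      intel_iommu=igfx_off,strict     disable GFX device mapping and
 *                                      disable batched IOTLB flushing
 *      intel_iommu=forcedac            force DAC for PCI devices
 */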
311
312 static struct kmem_cache *iommu_domain_cache;
313 static struct kmem_cache *iommu_devinfo_cache;
314 static struct kmem_cache *iommu_iova_cache;
315
316 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
317 {
318         unsigned int flags;
319         void *vaddr;
320
321         /* trying to avoid low memory issues */
322         flags = current->flags & PF_MEMALLOC;
323         current->flags |= PF_MEMALLOC;
324         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
325         current->flags &= (~PF_MEMALLOC | flags);
326         return vaddr;
327 }
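
/*
 * Note on the PF_MEMALLOC handling above and in alloc_pgtable_page()
 * below: 'flags' is either 0 or PF_MEMALLOC, so the final
 * 'current->flags &= (~PF_MEMALLOC | flags)' clears PF_MEMALLOC only if
 * the task did not already have it set, restoring the original state.
 */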
328
329
330 static inline void *alloc_pgtable_page(void)
331 {
332         unsigned int flags;
333         void *vaddr;
334
335         /* trying to avoid low memory issues */
336         flags = current->flags & PF_MEMALLOC;
337         current->flags |= PF_MEMALLOC;
338         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
339         current->flags &= (~PF_MEMALLOC | flags);
340         return vaddr;
341 }
342
343 static inline void free_pgtable_page(void *vaddr)
344 {
345         free_page((unsigned long)vaddr);
346 }
347
348 static inline void *alloc_domain_mem(void)
349 {
350         return iommu_kmem_cache_alloc(iommu_domain_cache);
351 }
352
353 static void free_domain_mem(void *vaddr)
354 {
355         kmem_cache_free(iommu_domain_cache, vaddr);
356 }
357
358 static inline void * alloc_devinfo_mem(void)
359 {
360         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
361 }
362
363 static inline void free_devinfo_mem(void *vaddr)
364 {
365         kmem_cache_free(iommu_devinfo_cache, vaddr);
366 }
367
368 struct iova *alloc_iova_mem(void)
369 {
370         return iommu_kmem_cache_alloc(iommu_iova_cache);
371 }
372
373 void free_iova_mem(struct iova *iova)
374 {
375         kmem_cache_free(iommu_iova_cache, iova);
376 }
377
378
379 static inline int width_to_agaw(int width);
380
381 /* calculate agaw for each iommu.
382  * "SAGAW" may be different across iommus; use a default agaw, and
383  * fall back to a smaller supported agaw for iommus that don't support it.
384  */
385 int iommu_calculate_agaw(struct intel_iommu *iommu)
386 {
387         unsigned long sagaw;
388         int agaw = -1;
389
390         sagaw = cap_sagaw(iommu->cap);
391         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
392              agaw >= 0; agaw--) {
393                 if (test_bit(agaw, &sagaw))
394                         break;
395         }
396
397         return agaw;
398 }
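
/*
 * Illustrative example: with DEFAULT_DOMAIN_ADDRESS_WIDTH == 48 the loop
 * above starts at agaw 2 (48-bit, 4-level tables).  If SAGAW only
 * advertises bit 1 (39-bit, 3-level tables), it falls back to agaw 1; if
 * no supported agaw is found, -1 is returned.
 */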
399
400 /* in native case, each domain is related to only one iommu */
401 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
402 {
403         int iommu_id;
404
405         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
406
407         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
408         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
409                 return NULL;
410
411         return g_iommus[iommu_id];
412 }
413
414 /* "Coherency" capability may be different across iommus */
415 static void domain_update_iommu_coherency(struct dmar_domain *domain)
416 {
417         int i;
418
419         domain->iommu_coherency = 1;
420
421         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
422         for (; i < g_num_of_iommus; ) {
423                 if (!ecap_coherent(g_iommus[i]->ecap)) {
424                         domain->iommu_coherency = 0;
425                         break;
426                 }
427                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
428         }
429 }
430
431 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
432 {
433         struct dmar_drhd_unit *drhd = NULL;
434         int i;
435
436         for_each_drhd_unit(drhd) {
437                 if (drhd->ignored)
438                         continue;
439
440                 for (i = 0; i < drhd->devices_cnt; i++)
441                         if (drhd->devices[i]->bus->number == bus &&
442                             drhd->devices[i]->devfn == devfn)
443                                 return drhd->iommu;
444
445                 if (drhd->include_all)
446                         return drhd->iommu;
447         }
448
449         return NULL;
450 }
451
452 static void domain_flush_cache(struct dmar_domain *domain,
453                                void *addr, int size)
454 {
455         if (!domain->iommu_coherency)
456                 clflush_cache_range(addr, size);
457 }
458
459 /* Gets context entry for a given bus and devfn */
460 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
461                 u8 bus, u8 devfn)
462 {
463         struct root_entry *root;
464         struct context_entry *context;
465         unsigned long phy_addr;
466         unsigned long flags;
467
468         spin_lock_irqsave(&iommu->lock, flags);
469         root = &iommu->root_entry[bus];
470         context = get_context_addr_from_root(root);
471         if (!context) {
472                 context = (struct context_entry *)alloc_pgtable_page();
473                 if (!context) {
474                         spin_unlock_irqrestore(&iommu->lock, flags);
475                         return NULL;
476                 }
477                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
478                 phy_addr = virt_to_phys((void *)context);
479                 set_root_value(root, phy_addr);
480                 set_root_present(root);
481                 __iommu_flush_cache(iommu, root, sizeof(*root));
482         }
483         spin_unlock_irqrestore(&iommu->lock, flags);
484         return &context[devfn];
485 }
486
487 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
488 {
489         struct root_entry *root;
490         struct context_entry *context;
491         int ret;
492         unsigned long flags;
493
494         spin_lock_irqsave(&iommu->lock, flags);
495         root = &iommu->root_entry[bus];
496         context = get_context_addr_from_root(root);
497         if (!context) {
498                 ret = 0;
499                 goto out;
500         }
501         ret = context_present(&context[devfn]);
502 out:
503         spin_unlock_irqrestore(&iommu->lock, flags);
504         return ret;
505 }
506
507 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
508 {
509         struct root_entry *root;
510         struct context_entry *context;
511         unsigned long flags;
512
513         spin_lock_irqsave(&iommu->lock, flags);
514         root = &iommu->root_entry[bus];
515         context = get_context_addr_from_root(root);
516         if (context) {
517                 context_clear_entry(&context[devfn]);
518                 __iommu_flush_cache(iommu, &context[devfn], \
519                         sizeof(*context));
520         }
521         spin_unlock_irqrestore(&iommu->lock, flags);
522 }
523
524 static void free_context_table(struct intel_iommu *iommu)
525 {
526         struct root_entry *root;
527         int i;
528         unsigned long flags;
529         struct context_entry *context;
530
531         spin_lock_irqsave(&iommu->lock, flags);
532         if (!iommu->root_entry) {
533                 goto out;
534         }
535         for (i = 0; i < ROOT_ENTRY_NR; i++) {
536                 root = &iommu->root_entry[i];
537                 context = get_context_addr_from_root(root);
538                 if (context)
539                         free_pgtable_page(context);
540         }
541         free_pgtable_page(iommu->root_entry);
542         iommu->root_entry = NULL;
543 out:
544         spin_unlock_irqrestore(&iommu->lock, flags);
545 }
546
547 /* page table handling */
548 #define LEVEL_STRIDE            (9)
549 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
550
551 static inline int agaw_to_level(int agaw)
552 {
553         return agaw + 2;
554 }
555
556 static inline int agaw_to_width(int agaw)
557 {
558         return 30 + agaw * LEVEL_STRIDE;
559
560 }
561
562 static inline int width_to_agaw(int width)
563 {
564         return (width - 30) / LEVEL_STRIDE;
565 }
566
567 static inline unsigned int level_to_offset_bits(int level)
568 {
569         return (12 + (level - 1) * LEVEL_STRIDE);
570 }
571
572 static inline int address_level_offset(u64 addr, int level)
573 {
574         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
575 }
576
577 static inline u64 level_mask(int level)
578 {
579         return ((u64)-1 << level_to_offset_bits(level));
580 }
581
582 static inline u64 level_size(int level)
583 {
584         return ((u64)1 << level_to_offset_bits(level));
585 }
586
587 static inline u64 align_to_level(u64 addr, int level)
588 {
589         return ((addr + level_size(level) - 1) & level_mask(level));
590 }
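
/*
 * Worked example for the helpers above (illustrative only): a domain with
 * agaw 2 has agaw_to_width() == 48 and agaw_to_level() == 4.  Each level
 * consumes LEVEL_STRIDE (9) bits of the address: level 4 indexes bits
 * 47:39, level 3 bits 38:30, level 2 bits 29:21 and level 1 bits 20:12,
 * matching level_to_offset_bits() of 39, 30, 21 and 12.
 */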
591
592 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
593 {
594         int addr_width = agaw_to_width(domain->agaw);
595         struct dma_pte *parent, *pte = NULL;
596         int level = agaw_to_level(domain->agaw);
597         int offset;
598         unsigned long flags;
599
600         BUG_ON(!domain->pgd);
601
602         addr &= (((u64)1) << addr_width) - 1;
603         parent = domain->pgd;
604
605         spin_lock_irqsave(&domain->mapping_lock, flags);
606         while (level > 0) {
607                 void *tmp_page;
608
609                 offset = address_level_offset(addr, level);
610                 pte = &parent[offset];
611                 if (level == 1)
612                         break;
613
614                 if (!dma_pte_present(pte)) {
615                         tmp_page = alloc_pgtable_page();
616
617                         if (!tmp_page) {
618                                 spin_unlock_irqrestore(&domain->mapping_lock,
619                                         flags);
620                                 return NULL;
621                         }
622                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
623                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
624                         /*
625                          * high level table always sets r/w, last level page
626                          * table control read/write
627                          */
628                         dma_set_pte_readable(pte);
629                         dma_set_pte_writable(pte);
630                         domain_flush_cache(domain, pte, sizeof(*pte));
631                 }
632                 parent = phys_to_virt(dma_pte_addr(pte));
633                 level--;
634         }
635
636         spin_unlock_irqrestore(&domain->mapping_lock, flags);
637         return pte;
638 }
639
640 /* return address's pte at specific level */
641 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
642                 int level)
643 {
644         struct dma_pte *parent, *pte = NULL;
645         int total = agaw_to_level(domain->agaw);
646         int offset;
647
648         parent = domain->pgd;
649         while (level <= total) {
650                 offset = address_level_offset(addr, total);
651                 pte = &parent[offset];
652                 if (level == total)
653                         return pte;
654
655                 if (!dma_pte_present(pte))
656                         break;
657                 parent = phys_to_virt(dma_pte_addr(pte));
658                 total--;
659         }
660         return NULL;
661 }
662
663 /* clear one page's page table */
664 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
665 {
666         struct dma_pte *pte = NULL;
667
668         /* get last level pte */
669         pte = dma_addr_level_pte(domain, addr, 1);
670
671         if (pte) {
672                 dma_clear_pte(pte);
673                 domain_flush_cache(domain, pte, sizeof(*pte));
674         }
675 }
676
677 /* clear last level pte; a tlb flush should follow */
678 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
679 {
680         int addr_width = agaw_to_width(domain->agaw);
681
682         start &= (((u64)1) << addr_width) - 1;
683         end &= (((u64)1) << addr_width) - 1;
684         /* in case it's partial page */
685         start = PAGE_ALIGN(start);
686         end &= PAGE_MASK;
687
688         /* we don't need a lock here; nobody else touches the iova range */
689         while (start < end) {
690                 dma_pte_clear_one(domain, start);
691                 start += VTD_PAGE_SIZE;
692         }
693 }
694
695 /* free page table pages. last level pte should already be cleared */
696 static void dma_pte_free_pagetable(struct dmar_domain *domain,
697         u64 start, u64 end)
698 {
699         int addr_width = agaw_to_width(domain->agaw);
700         struct dma_pte *pte;
701         int total = agaw_to_level(domain->agaw);
702         int level;
703         u64 tmp;
704
705         start &= (((u64)1) << addr_width) - 1;
706         end &= (((u64)1) << addr_width) - 1;
707
708         /* we don't need a lock here; nobody else touches the iova range */
709         level = 2;
710         while (level <= total) {
711                 tmp = align_to_level(start, level);
712                 if (tmp >= end || (tmp + level_size(level) > end))
713                         return;
714
715                 while (tmp < end) {
716                         pte = dma_addr_level_pte(domain, tmp, level);
717                         if (pte) {
718                                 free_pgtable_page(
719                                         phys_to_virt(dma_pte_addr(pte)));
720                                 dma_clear_pte(pte);
721                                 domain_flush_cache(domain, pte, sizeof(*pte));
722                         }
723                         tmp += level_size(level);
724                 }
725                 level++;
726         }
727         /* free pgd */
728         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
729                 free_pgtable_page(domain->pgd);
730                 domain->pgd = NULL;
731         }
732 }
733
734 /* iommu handling */
735 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
736 {
737         struct root_entry *root;
738         unsigned long flags;
739
740         root = (struct root_entry *)alloc_pgtable_page();
741         if (!root)
742                 return -ENOMEM;
743
744         __iommu_flush_cache(iommu, root, ROOT_SIZE);
745
746         spin_lock_irqsave(&iommu->lock, flags);
747         iommu->root_entry = root;
748         spin_unlock_irqrestore(&iommu->lock, flags);
749
750         return 0;
751 }
752
753 static void iommu_set_root_entry(struct intel_iommu *iommu)
754 {
755         void *addr;
756         u32 cmd, sts;
757         unsigned long flag;
758
759         addr = iommu->root_entry;
760
761         spin_lock_irqsave(&iommu->register_lock, flag);
762         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
763
764         cmd = iommu->gcmd | DMA_GCMD_SRTP;
765         writel(cmd, iommu->reg + DMAR_GCMD_REG);
766
767         /* Make sure hardware completes it */
768         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
769                 readl, (sts & DMA_GSTS_RTPS), sts);
770
771         spin_unlock_irqrestore(&iommu->register_lock, flag);
772 }
773
774 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
775 {
776         u32 val;
777         unsigned long flag;
778
779         if (!cap_rwbf(iommu->cap))
780                 return;
781         val = iommu->gcmd | DMA_GCMD_WBF;
782
783         spin_lock_irqsave(&iommu->register_lock, flag);
784         writel(val, iommu->reg + DMAR_GCMD_REG);
785
786         /* Make sure hardware completes it */
787         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
788                         readl, (!(val & DMA_GSTS_WBFS)), val);
789
790         spin_unlock_irqrestore(&iommu->register_lock, flag);
791 }
792
793 /* return value determines whether we need a write buffer flush */
794 static int __iommu_flush_context(struct intel_iommu *iommu,
795         u16 did, u16 source_id, u8 function_mask, u64 type,
796         int non_present_entry_flush)
797 {
798         u64 val = 0;
799         unsigned long flag;
800
801         /*
802          * In the non-present entry flush case: if hardware doesn't cache
803          * non-present entries we do nothing; if it does, we flush the
804          * entries of domain 0 (the domain id used to cache any
805          * non-present entries).
806          */
807         if (non_present_entry_flush) {
808                 if (!cap_caching_mode(iommu->cap))
809                         return 1;
810                 else
811                         did = 0;
812         }
813
814         switch (type) {
815         case DMA_CCMD_GLOBAL_INVL:
816                 val = DMA_CCMD_GLOBAL_INVL;
817                 break;
818         case DMA_CCMD_DOMAIN_INVL:
819                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
820                 break;
821         case DMA_CCMD_DEVICE_INVL:
822                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
823                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
824                 break;
825         default:
826                 BUG();
827         }
828         val |= DMA_CCMD_ICC;
829
830         spin_lock_irqsave(&iommu->register_lock, flag);
831         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
832
833         /* Make sure hardware completes it */
834         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
835                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
836
837         spin_unlock_irqrestore(&iommu->register_lock, flag);
838
839         /* flush context entry will implicitly flush write buffer */
840         return 0;
841 }
842
843 /* return value determines whether we need a write buffer flush */
844 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
845         u64 addr, unsigned int size_order, u64 type,
846         int non_present_entry_flush)
847 {
848         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
849         u64 val = 0, val_iva = 0;
850         unsigned long flag;
851
852         /*
853          * In the non-present entry flush case: if hardware doesn't cache
854          * non-present entries we do nothing; if it does, we flush the
855          * entries of domain 0 (the domain id used to cache any
856          * non-present entries).
857          */
858         if (non_present_entry_flush) {
859                 if (!cap_caching_mode(iommu->cap))
860                         return 1;
861                 else
862                         did = 0;
863         }
864
865         switch (type) {
866         case DMA_TLB_GLOBAL_FLUSH:
867                 /* global flush doesn't need to set IVA_REG */
868                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
869                 break;
870         case DMA_TLB_DSI_FLUSH:
871                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
872                 break;
873         case DMA_TLB_PSI_FLUSH:
874                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
875                 /* Note: always flush non-leaf currently */
876                 val_iva = size_order | addr;
877                 break;
878         default:
879                 BUG();
880         }
881         /* Note: set drain read/write */
882 #if 0
883         /*
884          * This is probably only needed to be extra safe. It looks like
885          * we can ignore it without any impact.
886          */
887         if (cap_read_drain(iommu->cap))
888                 val |= DMA_TLB_READ_DRAIN;
889 #endif
890         if (cap_write_drain(iommu->cap))
891                 val |= DMA_TLB_WRITE_DRAIN;
892
893         spin_lock_irqsave(&iommu->register_lock, flag);
894         /* Note: Only uses first TLB reg currently */
895         if (val_iva)
896                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
897         dmar_writeq(iommu->reg + tlb_offset + 8, val);
898
899         /* Make sure hardware completes it */
900         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
901                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
902
903         spin_unlock_irqrestore(&iommu->register_lock, flag);
904
905         /* check IOTLB invalidation granularity */
906         if (DMA_TLB_IAIG(val) == 0)
907                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
908         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
909                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
910                         (unsigned long long)DMA_TLB_IIRG(type),
911                         (unsigned long long)DMA_TLB_IAIG(val));
912         /* flush iotlb entry will implicitly flush write buffer */
913         return 0;
914 }
915
916 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
917         u64 addr, unsigned int pages, int non_present_entry_flush)
918 {
919         unsigned int mask;
920
921         BUG_ON(addr & (~VTD_PAGE_MASK));
922         BUG_ON(pages == 0);
923
924         /* Fallback to domain selective flush if no PSI support */
925         if (!cap_pgsel_inv(iommu->cap))
926                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
927                                                 DMA_TLB_DSI_FLUSH,
928                                                 non_present_entry_flush);
929
930         /*
931          * PSI requires page size to be 2 ^ x, and the base address is naturally
932          * aligned to the size
933          */
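        /*
         * Illustrative example: a request for 5 pages rounds up to 8, so
         * mask == ilog2(8) == 3 and the hardware invalidates a naturally
         * aligned 8-page region containing addr.
         */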
934         mask = ilog2(__roundup_pow_of_two(pages));
935         /* Fallback to domain selective flush if size is too big */
936         if (mask > cap_max_amask_val(iommu->cap))
937                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
938                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
939
940         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
941                                         DMA_TLB_PSI_FLUSH,
942                                         non_present_entry_flush);
943 }
944
945 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
946 {
947         u32 pmen;
948         unsigned long flags;
949
950         spin_lock_irqsave(&iommu->register_lock, flags);
951         pmen = readl(iommu->reg + DMAR_PMEN_REG);
952         pmen &= ~DMA_PMEN_EPM;
953         writel(pmen, iommu->reg + DMAR_PMEN_REG);
954
955         /* wait for the protected region status bit to clear */
956         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
957                 readl, !(pmen & DMA_PMEN_PRS), pmen);
958
959         spin_unlock_irqrestore(&iommu->register_lock, flags);
960 }
961
962 static int iommu_enable_translation(struct intel_iommu *iommu)
963 {
964         u32 sts;
965         unsigned long flags;
966
967         spin_lock_irqsave(&iommu->register_lock, flags);
968         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
969
970         /* Make sure hardware completes it */
971         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
972                 readl, (sts & DMA_GSTS_TES), sts);
973
974         iommu->gcmd |= DMA_GCMD_TE;
975         spin_unlock_irqrestore(&iommu->register_lock, flags);
976         return 0;
977 }
978
979 static int iommu_disable_translation(struct intel_iommu *iommu)
980 {
981         u32 sts;
982         unsigned long flag;
983
984         spin_lock_irqsave(&iommu->register_lock, flag);
985         iommu->gcmd &= ~DMA_GCMD_TE;
986         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
987
988         /* Make sure hardware completes it */
989         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
990                 readl, (!(sts & DMA_GSTS_TES)), sts);
991
992         spin_unlock_irqrestore(&iommu->register_lock, flag);
993         return 0;
994 }
995
996 /* iommu interrupt handling. Most of it is MSI-like. */
997
998 static const char *fault_reason_strings[] =
999 {
1000         "Software",
1001         "Present bit in root entry is clear",
1002         "Present bit in context entry is clear",
1003         "Invalid context entry",
1004         "Access beyond MGAW",
1005         "PTE Write access is not set",
1006         "PTE Read access is not set",
1007         "Next page table ptr is invalid",
1008         "Root table address invalid",
1009         "Context table ptr is invalid",
1010         "non-zero reserved fields in RTP",
1011         "non-zero reserved fields in CTP",
1012         "non-zero reserved fields in PTE",
1013 };
1014 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
1015
1016 const char *dmar_get_fault_reason(u8 fault_reason)
1017 {
1018         if (fault_reason > MAX_FAULT_REASON_IDX)
1019                 return "Unknown";
1020         else
1021                 return fault_reason_strings[fault_reason];
1022 }
1023
1024 void dmar_msi_unmask(unsigned int irq)
1025 {
1026         struct intel_iommu *iommu = get_irq_data(irq);
1027         unsigned long flag;
1028
1029         /* unmask it */
1030         spin_lock_irqsave(&iommu->register_lock, flag);
1031         writel(0, iommu->reg + DMAR_FECTL_REG);
1032         /* Read a reg to force flush the posted write */
1033         readl(iommu->reg + DMAR_FECTL_REG);
1034         spin_unlock_irqrestore(&iommu->register_lock, flag);
1035 }
1036
1037 void dmar_msi_mask(unsigned int irq)
1038 {
1039         unsigned long flag;
1040         struct intel_iommu *iommu = get_irq_data(irq);
1041
1042         /* mask it */
1043         spin_lock_irqsave(&iommu->register_lock, flag);
1044         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1045         /* Read a reg to force flush the posted write */
1046         readl(iommu->reg + DMAR_FECTL_REG);
1047         spin_unlock_irqrestore(&iommu->register_lock, flag);
1048 }
1049
1050 void dmar_msi_write(int irq, struct msi_msg *msg)
1051 {
1052         struct intel_iommu *iommu = get_irq_data(irq);
1053         unsigned long flag;
1054
1055         spin_lock_irqsave(&iommu->register_lock, flag);
1056         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1057         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1058         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1059         spin_unlock_irqrestore(&iommu->register_lock, flag);
1060 }
1061
1062 void dmar_msi_read(int irq, struct msi_msg *msg)
1063 {
1064         struct intel_iommu *iommu = get_irq_data(irq);
1065         unsigned long flag;
1066
1067         spin_lock_irqsave(&iommu->register_lock, flag);
1068         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1069         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1070         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1071         spin_unlock_irqrestore(&iommu->register_lock, flag);
1072 }
1073
1074 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1075                 u8 fault_reason, u16 source_id, unsigned long long addr)
1076 {
1077         const char *reason;
1078
1079         reason = dmar_get_fault_reason(fault_reason);
1080
1081         printk(KERN_ERR
1082                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1083                 "fault addr %llx \n"
1084                 "DMAR:[fault reason %02d] %s\n",
1085                 (type ? "DMA Read" : "DMA Write"),
1086                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1087                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1088         return 0;
1089 }
1090
1091 #define PRIMARY_FAULT_REG_LEN (16)
1092 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1093 {
1094         struct intel_iommu *iommu = dev_id;
1095         int reg, fault_index;
1096         u32 fault_status;
1097         unsigned long flag;
1098
1099         spin_lock_irqsave(&iommu->register_lock, flag);
1100         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1101
1102         /* TBD: ignore advanced fault log currently */
1103         if (!(fault_status & DMA_FSTS_PPF))
1104                 goto clear_overflow;
1105
1106         fault_index = dma_fsts_fault_record_index(fault_status);
1107         reg = cap_fault_reg_offset(iommu->cap);
1108         while (1) {
1109                 u8 fault_reason;
1110                 u16 source_id;
1111                 u64 guest_addr;
1112                 int type;
1113                 u32 data;
1114
1115                 /* highest 32 bits */
1116                 data = readl(iommu->reg + reg +
1117                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1118                 if (!(data & DMA_FRCD_F))
1119                         break;
1120
1121                 fault_reason = dma_frcd_fault_reason(data);
1122                 type = dma_frcd_type(data);
1123
1124                 data = readl(iommu->reg + reg +
1125                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1126                 source_id = dma_frcd_source_id(data);
1127
1128                 guest_addr = dmar_readq(iommu->reg + reg +
1129                                 fault_index * PRIMARY_FAULT_REG_LEN);
1130                 guest_addr = dma_frcd_page_addr(guest_addr);
1131                 /* clear the fault */
1132                 writel(DMA_FRCD_F, iommu->reg + reg +
1133                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1134
1135                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1136
1137                 iommu_page_fault_do_one(iommu, type, fault_reason,
1138                                 source_id, guest_addr);
1139
1140                 fault_index++;
1141                 if (fault_index > cap_num_fault_regs(iommu->cap))
1142                         fault_index = 0;
1143                 spin_lock_irqsave(&iommu->register_lock, flag);
1144         }
1145 clear_overflow:
1146         /* clear primary fault overflow */
1147         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1148         if (fault_status & DMA_FSTS_PFO)
1149                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1150
1151         spin_unlock_irqrestore(&iommu->register_lock, flag);
1152         return IRQ_HANDLED;
1153 }
1154
1155 int dmar_set_interrupt(struct intel_iommu *iommu)
1156 {
1157         int irq, ret;
1158
1159         irq = create_irq();
1160         if (!irq) {
1161                 printk(KERN_ERR "IOMMU: no free vectors\n");
1162                 return -EINVAL;
1163         }
1164
1165         set_irq_data(irq, iommu);
1166         iommu->irq = irq;
1167
1168         ret = arch_setup_dmar_msi(irq);
1169         if (ret) {
1170                 set_irq_data(irq, NULL);
1171                 iommu->irq = 0;
1172                 destroy_irq(irq);
1173                 return 0;
1174         }
1175
1176         /* Force the fault registers to be cleared */
1177         iommu_page_fault(irq, iommu);
1178
1179         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1180         if (ret)
1181                 printk(KERN_ERR "IOMMU: can't request irq\n");
1182         return ret;
1183 }
1184
1185 static int iommu_init_domains(struct intel_iommu *iommu)
1186 {
1187         unsigned long ndomains;
1188         unsigned long nlongs;
1189
1190         ndomains = cap_ndoms(iommu->cap);
1191         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1192         nlongs = BITS_TO_LONGS(ndomains);
1193
1194         /* TBD: there might be 64K domains,
1195          * consider a different allocation scheme for future chips
1196          */
1197         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1198         if (!iommu->domain_ids) {
1199                 printk(KERN_ERR "Allocating domain id array failed\n");
1200                 return -ENOMEM;
1201         }
1202         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1203                         GFP_KERNEL);
1204         if (!iommu->domains) {
1205                 printk(KERN_ERR "Allocating domain array failed\n");
1206                 kfree(iommu->domain_ids);
1207                 return -ENOMEM;
1208         }
1209
1210         spin_lock_init(&iommu->lock);
1211
1212         /*
1213          * if Caching mode is set, then invalid translations are tagged
1214          * with domain id 0. Hence we need to pre-allocate it.
1215          */
1216         if (cap_caching_mode(iommu->cap))
1217                 set_bit(0, iommu->domain_ids);
1218         return 0;
1219 }
1220
1221
1222 static void domain_exit(struct dmar_domain *domain);
1223 static void vm_domain_exit(struct dmar_domain *domain);
1224
1225 void free_dmar_iommu(struct intel_iommu *iommu)
1226 {
1227         struct dmar_domain *domain;
1228         int i;
1229         unsigned long flags;
1230
1231         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1232         for (; i < cap_ndoms(iommu->cap); ) {
1233                 domain = iommu->domains[i];
1234                 clear_bit(i, iommu->domain_ids);
1235
1236                 spin_lock_irqsave(&domain->iommu_lock, flags);
1237                 if (--domain->iommu_count == 0) {
1238                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1239                                 vm_domain_exit(domain);
1240                         else
1241                                 domain_exit(domain);
1242                 }
1243                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1244
1245                 i = find_next_bit(iommu->domain_ids,
1246                         cap_ndoms(iommu->cap), i+1);
1247         }
1248
1249         if (iommu->gcmd & DMA_GCMD_TE)
1250                 iommu_disable_translation(iommu);
1251
1252         if (iommu->irq) {
1253                 set_irq_data(iommu->irq, NULL);
1254                 /* This will mask the irq */
1255                 free_irq(iommu->irq, iommu);
1256                 destroy_irq(iommu->irq);
1257         }
1258
1259         kfree(iommu->domains);
1260         kfree(iommu->domain_ids);
1261
1262         g_iommus[iommu->seq_id] = NULL;
1263
1264         /* if all iommus are freed, free g_iommus */
1265         for (i = 0; i < g_num_of_iommus; i++) {
1266                 if (g_iommus[i])
1267                         break;
1268         }
1269
1270         if (i == g_num_of_iommus)
1271                 kfree(g_iommus);
1272
1273         /* free context mapping */
1274         free_context_table(iommu);
1275 }
1276
1277 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1278 {
1279         unsigned long num;
1280         unsigned long ndomains;
1281         struct dmar_domain *domain;
1282         unsigned long flags;
1283
1284         domain = alloc_domain_mem();
1285         if (!domain)
1286                 return NULL;
1287
1288         ndomains = cap_ndoms(iommu->cap);
1289
1290         spin_lock_irqsave(&iommu->lock, flags);
1291         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1292         if (num >= ndomains) {
1293                 spin_unlock_irqrestore(&iommu->lock, flags);
1294                 free_domain_mem(domain);
1295                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1296                 return NULL;
1297         }
1298
1299         set_bit(num, iommu->domain_ids);
1300         domain->id = num;
1301         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1302         set_bit(iommu->seq_id, &domain->iommu_bmp);
1303         domain->flags = 0;
1304         iommu->domains[num] = domain;
1305         spin_unlock_irqrestore(&iommu->lock, flags);
1306
1307         return domain;
1308 }
1309
1310 static void iommu_free_domain(struct dmar_domain *domain)
1311 {
1312         unsigned long flags;
1313         struct intel_iommu *iommu;
1314
1315         iommu = domain_get_iommu(domain);
1316
1317         spin_lock_irqsave(&iommu->lock, flags);
1318         clear_bit(domain->id, iommu->domain_ids);
1319         spin_unlock_irqrestore(&iommu->lock, flags);
1320 }
1321
1322 static struct iova_domain reserved_iova_list;
1323 static struct lock_class_key reserved_alloc_key;
1324 static struct lock_class_key reserved_rbtree_key;
1325
1326 static void dmar_init_reserved_ranges(void)
1327 {
1328         struct pci_dev *pdev = NULL;
1329         struct iova *iova;
1330         int i;
1331         u64 addr, size;
1332
1333         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1334
1335         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1336                 &reserved_alloc_key);
1337         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1338                 &reserved_rbtree_key);
1339
1340         /* IOAPIC ranges shouldn't be accessed by DMA */
1341         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1342                 IOVA_PFN(IOAPIC_RANGE_END));
1343         if (!iova)
1344                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1345
1346         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1347         for_each_pci_dev(pdev) {
1348                 struct resource *r;
1349
1350                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1351                         r = &pdev->resource[i];
1352                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1353                                 continue;
1354                         addr = r->start;
1355                         addr &= PAGE_MASK;
1356                         size = r->end - addr;
1357                         size = PAGE_ALIGN(size);
1358                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1359                                 IOVA_PFN(size + addr) - 1);
1360                         if (!iova)
1361                                 printk(KERN_ERR "Reserve iova failed\n");
1362                 }
1363         }
1364
1365 }
1366
1367 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1368 {
1369         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1370 }
1371
1372 static inline int guestwidth_to_adjustwidth(int gaw)
1373 {
1374         int agaw;
1375         int r = (gaw - 12) % 9;
1376
1377         if (r == 0)
1378                 agaw = gaw;
1379         else
1380                 agaw = gaw + 9 - r;
1381         if (agaw > 64)
1382                 agaw = 64;
1383         return agaw;
1384 }
1385
1386 static int domain_init(struct dmar_domain *domain, int guest_width)
1387 {
1388         struct intel_iommu *iommu;
1389         int adjust_width, agaw;
1390         unsigned long sagaw;
1391
1392         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1393         spin_lock_init(&domain->mapping_lock);
1394         spin_lock_init(&domain->iommu_lock);
1395
1396         domain_reserve_special_ranges(domain);
1397
1398         /* calculate AGAW */
1399         iommu = domain_get_iommu(domain);
1400         if (guest_width > cap_mgaw(iommu->cap))
1401                 guest_width = cap_mgaw(iommu->cap);
1402         domain->gaw = guest_width;
1403         adjust_width = guestwidth_to_adjustwidth(guest_width);
1404         agaw = width_to_agaw(adjust_width);
1405         sagaw = cap_sagaw(iommu->cap);
1406         if (!test_bit(agaw, &sagaw)) {
1407                 /* hardware doesn't support it, choose a bigger one */
1408                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1409                 agaw = find_next_bit(&sagaw, 5, agaw);
1410                 if (agaw >= 5)
1411                         return -ENODEV;
1412         }
1413         domain->agaw = agaw;
1414         INIT_LIST_HEAD(&domain->devices);
1415
1416         if (ecap_coherent(iommu->ecap))
1417                 domain->iommu_coherency = 1;
1418         else
1419                 domain->iommu_coherency = 0;
1420
1421         domain->iommu_count = 1;
1422
1423         /* always allocate the top pgd */
1424         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1425         if (!domain->pgd)
1426                 return -ENOMEM;
1427         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1428         return 0;
1429 }
1430
1431 static void domain_exit(struct dmar_domain *domain)
1432 {
1433         u64 end;
1434
1435         /* Domain 0 is reserved, so don't process it */
1436         if (!domain)
1437                 return;
1438
1439         domain_remove_dev_info(domain);
1440         /* destroy iovas */
1441         put_iova_domain(&domain->iovad);
1442         end = DOMAIN_MAX_ADDR(domain->gaw);
1443         end = end & PAGE_MASK;
1444
1445         /* clear ptes */
1446         dma_pte_clear_range(domain, 0, end);
1447
1448         /* free page tables */
1449         dma_pte_free_pagetable(domain, 0, end);
1450
1451         iommu_free_domain(domain);
1452         free_domain_mem(domain);
1453 }
1454
1455 static int domain_context_mapping_one(struct dmar_domain *domain,
1456                 u8 bus, u8 devfn)
1457 {
1458         struct context_entry *context;
1459         unsigned long flags;
1460         struct intel_iommu *iommu;
1461         struct dma_pte *pgd;
1462         unsigned long num;
1463         unsigned long ndomains;
1464         int id;
1465         int agaw;
1466
1467         pr_debug("Set context mapping for %02x:%02x.%d\n",
1468                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1469         BUG_ON(!domain->pgd);
1470
1471         iommu = device_to_iommu(bus, devfn);
1472         if (!iommu)
1473                 return -ENODEV;
1474
1475         context = device_to_context_entry(iommu, bus, devfn);
1476         if (!context)
1477                 return -ENOMEM;
1478         spin_lock_irqsave(&iommu->lock, flags);
1479         if (context_present(context)) {
1480                 spin_unlock_irqrestore(&iommu->lock, flags);
1481                 return 0;
1482         }
1483
1484         id = domain->id;
1485         pgd = domain->pgd;
1486
1487         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1488                 int found = 0;
1489
1490                 /* find an available domain id for this device in iommu */
1491                 ndomains = cap_ndoms(iommu->cap);
1492                 num = find_first_bit(iommu->domain_ids, ndomains);
1493                 for (; num < ndomains; ) {
1494                         if (iommu->domains[num] == domain) {
1495                                 id = num;
1496                                 found = 1;
1497                                 break;
1498                         }
1499                         num = find_next_bit(iommu->domain_ids,
1500                                             cap_ndoms(iommu->cap), num+1);
1501                 }
1502
1503                 if (found == 0) {
1504                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1505                         if (num >= ndomains) {
1506                                 spin_unlock_irqrestore(&iommu->lock, flags);
1507                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1508                                 return -EFAULT;
1509                         }
1510
1511                         set_bit(num, iommu->domain_ids);
1512                         iommu->domains[num] = domain;
1513                         id = num;
1514                 }
1515
1516                 /* Skip top levels of page tables for
1517                  * iommus which have a smaller agaw than the default.
1518                  */
1519                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1520                         pgd = phys_to_virt(dma_pte_addr(pgd));
1521                         if (!dma_pte_present(pgd)) {
1522                                 spin_unlock_irqrestore(&iommu->lock, flags);
1523                                 return -ENOMEM;
1524                         }
1525                 }
1526         }
1527
1528         context_set_domain_id(context, id);
1529         context_set_address_width(context, iommu->agaw);
1530         context_set_address_root(context, virt_to_phys(pgd));
1531         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1532         context_set_fault_enable(context);
1533         context_set_present(context);
1534         domain_flush_cache(domain, context, sizeof(*context));
1535
1536         /* it's a non-present to present mapping */
1537         if (iommu->flush.flush_context(iommu, domain->id,
1538                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1539                 DMA_CCMD_DEVICE_INVL, 1))
1540                 iommu_flush_write_buffer(iommu);
1541         else
1542                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1543
1544         spin_unlock_irqrestore(&iommu->lock, flags);
1545
1546         spin_lock_irqsave(&domain->iommu_lock, flags);
1547         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1548                 domain->iommu_count++;
1549                 domain_update_iommu_coherency(domain);
1550         }
1551         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1552         return 0;
1553 }
1554
1555 static int
1556 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1557 {
1558         int ret;
1559         struct pci_dev *tmp, *parent;
1560
1561         ret = domain_context_mapping_one(domain, pdev->bus->number,
1562                 pdev->devfn);
1563         if (ret)
1564                 return ret;
1565
1566         /* dependent device mapping */
1567         tmp = pci_find_upstream_pcie_bridge(pdev);
1568         if (!tmp)
1569                 return 0;
1570         /* Secondary interface's bus number and devfn 0 */
1571         parent = pdev->bus->self;
1572         while (parent != tmp) {
1573                 ret = domain_context_mapping_one(domain, parent->bus->number,
1574                         parent->devfn);
1575                 if (ret)
1576                         return ret;
1577                 parent = parent->bus->self;
1578         }
1579         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1580                 return domain_context_mapping_one(domain,
1581                         tmp->subordinate->number, 0);
1582         else /* this is a legacy PCI bridge */
1583                 return domain_context_mapping_one(domain,
1584                         tmp->bus->number, tmp->devfn);
1585 }
1586
1587 static int domain_context_mapped(struct pci_dev *pdev)
1588 {
1589         int ret;
1590         struct pci_dev *tmp, *parent;
1591         struct intel_iommu *iommu;
1592
1593         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1594         if (!iommu)
1595                 return -ENODEV;
1596
1597         ret = device_context_mapped(iommu,
1598                 pdev->bus->number, pdev->devfn);
1599         if (!ret)
1600                 return ret;
1601         /* dependent device mapping */
1602         tmp = pci_find_upstream_pcie_bridge(pdev);
1603         if (!tmp)
1604                 return ret;
1605         /* Check each P2P bridge on the path; the PCIe-to-PCI bridge itself is looked up under (secondary bus, devfn 0) */
1606         parent = pdev->bus->self;
1607         while (parent != tmp) {
1608                 ret = device_context_mapped(iommu, parent->bus->number,
1609                         parent->devfn);
1610                 if (!ret)
1611                         return ret;
1612                 parent = parent->bus->self;
1613         }
1614         if (tmp->is_pcie)
1615                 return device_context_mapped(iommu,
1616                         tmp->subordinate->number, 0);
1617         else
1618                 return device_context_mapped(iommu,
1619                         tmp->bus->number, tmp->devfn);
1620 }
1621
1622 static int
1623 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1624                         u64 hpa, size_t size, int prot)
1625 {
1626         u64 start_pfn, end_pfn;
1627         struct dma_pte *pte;
1628         int index;
1629         int addr_width = agaw_to_width(domain->agaw);
1630
1631         hpa &= (((u64)1) << addr_width) - 1;
1632
1633         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1634                 return -EINVAL;
1635         iova &= PAGE_MASK;
1636         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1637         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1638         index = 0;
1639         while (start_pfn < end_pfn) {
1640                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1641                 if (!pte)
1642                         return -ENOMEM;
1643                 /* We don't need lock here, nobody else
1644                  * touches the iova range
1645                  */
1646                 BUG_ON(dma_pte_addr(pte));
1647                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1648                 dma_set_pte_prot(pte, prot);
1649                 domain_flush_cache(domain, pte, sizeof(*pte));
1650                 start_pfn++;
1651                 index++;
1652         }
1653         return 0;
1654 }
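
/*
 * Worked example for the pfn arithmetic above (illustrative values only,
 * assuming 4KiB VT-d pages, i.e. VTD_PAGE_SHIFT == 12):
 *
 *   hpa  = 0x12345678, size = 0x2000, iova = 0x40000000
 *   start_pfn = 0x12345678 >> 12                      = 0x12345
 *   end_pfn   = VTD_PAGE_ALIGN(0x12345678 + 0x2000) >> 12
 *             = 0x12348000 >> 12                      = 0x12348
 *
 * so three PTEs are written, for iova 0x40000000, 0x40001000 and
 * 0x40002000, pointing at host pfns 0x12345..0x12347.  Callers in this
 * file pass a page-aligned hpa and an aligned size, in which case exactly
 * size >> VTD_PAGE_SHIFT entries are written.
 */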
1655
1656 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1657 {
1658         if (!iommu)
1659                 return;
1660
1661         clear_context_table(iommu, bus, devfn);
1662         iommu->flush.flush_context(iommu, 0, 0, 0,
1663                                            DMA_CCMD_GLOBAL_INVL, 0);
1664         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1665                                          DMA_TLB_GLOBAL_FLUSH, 0);
1666 }
1667
1668 static void domain_remove_dev_info(struct dmar_domain *domain)
1669 {
1670         struct device_domain_info *info;
1671         unsigned long flags;
1672         struct intel_iommu *iommu;
1673
1674         spin_lock_irqsave(&device_domain_lock, flags);
1675         while (!list_empty(&domain->devices)) {
1676                 info = list_entry(domain->devices.next,
1677                         struct device_domain_info, link);
1678                 list_del(&info->link);
1679                 list_del(&info->global);
1680                 if (info->dev)
1681                         info->dev->dev.archdata.iommu = NULL;
1682                 spin_unlock_irqrestore(&device_domain_lock, flags);
1683
1684                 iommu = device_to_iommu(info->bus, info->devfn);
1685                 iommu_detach_dev(iommu, info->bus, info->devfn);
1686                 free_devinfo_mem(info);
1687
1688                 spin_lock_irqsave(&device_domain_lock, flags);
1689         }
1690         spin_unlock_irqrestore(&device_domain_lock, flags);
1691 }
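
/*
 * Note on the locking pattern above: device_domain_lock is dropped around
 * iommu_detach_dev(), which clears the context-table entry and issues
 * global context-cache and IOTLB flushes, and is re-taken before looking
 * at domain->devices again, so the list is only ever walked under the lock.
 */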
1692
1693 /*
1694  * find_domain
1695  * Note: struct pci_dev->dev.archdata.iommu stores the device_domain_info
1696  */
1697 static struct dmar_domain *
1698 find_domain(struct pci_dev *pdev)
1699 {
1700         struct device_domain_info *info;
1701
1702         /* No lock here; we assume the domain does not go away in the normal case */
1703         info = pdev->dev.archdata.iommu;
1704         if (info)
1705                 return info->domain;
1706         return NULL;
1707 }
1708
1709 /* domain is initialized */
1710 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1711 {
1712         struct dmar_domain *domain, *found = NULL;
1713         struct intel_iommu *iommu;
1714         struct dmar_drhd_unit *drhd;
1715         struct device_domain_info *info, *tmp;
1716         struct pci_dev *dev_tmp;
1717         unsigned long flags;
1718         int bus = 0, devfn = 0;
1719
1720         domain = find_domain(pdev);
1721         if (domain)
1722                 return domain;
1723
1724         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1725         if (dev_tmp) {
1726                 if (dev_tmp->is_pcie) {
1727                         bus = dev_tmp->subordinate->number;
1728                         devfn = 0;
1729                 } else {
1730                         bus = dev_tmp->bus->number;
1731                         devfn = dev_tmp->devfn;
1732                 }
1733                 spin_lock_irqsave(&device_domain_lock, flags);
1734                 list_for_each_entry(info, &device_domain_list, global) {
1735                         if (info->bus == bus && info->devfn == devfn) {
1736                                 found = info->domain;
1737                                 break;
1738                         }
1739                 }
1740                 spin_unlock_irqrestore(&device_domain_lock, flags);
1741                 /* the pcie-to-pci bridge already has a domain, use it */
1742                 if (found) {
1743                         domain = found;
1744                         goto found_domain;
1745                 }
1746         }
1747
1748         /* Allocate new domain for the device */
1749         drhd = dmar_find_matched_drhd_unit(pdev);
1750         if (!drhd) {
1751                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1752                         pci_name(pdev));
1753                 return NULL;
1754         }
1755         iommu = drhd->iommu;
1756
1757         domain = iommu_alloc_domain(iommu);
1758         if (!domain)
1759                 goto error;
1760
1761         if (domain_init(domain, gaw)) {
1762                 domain_exit(domain);
1763                 goto error;
1764         }
1765
1766         /* register pcie-to-pci device */
1767         if (dev_tmp) {
1768                 info = alloc_devinfo_mem();
1769                 if (!info) {
1770                         domain_exit(domain);
1771                         goto error;
1772                 }
1773                 info->bus = bus;
1774                 info->devfn = devfn;
1775                 info->dev = NULL;
1776                 info->domain = domain;
1777                 /* This domain is shared by devices under p2p bridge */
1778                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1779
1780                 /* the pcie-to-pci bridge already has a domain, use it */
1781                 found = NULL;
1782                 spin_lock_irqsave(&device_domain_lock, flags);
1783                 list_for_each_entry(tmp, &device_domain_list, global) {
1784                         if (tmp->bus == bus && tmp->devfn == devfn) {
1785                                 found = tmp->domain;
1786                                 break;
1787                         }
1788                 }
1789                 if (found) {
1790                         free_devinfo_mem(info);
1791                         domain_exit(domain);
1792                         domain = found;
1793                 } else {
1794                         list_add(&info->link, &domain->devices);
1795                         list_add(&info->global, &device_domain_list);
1796                 }
1797                 spin_unlock_irqrestore(&device_domain_lock, flags);
1798         }
1799
1800 found_domain:
1801         info = alloc_devinfo_mem();
1802         if (!info)
1803                 goto error;
1804         info->bus = pdev->bus->number;
1805         info->devfn = pdev->devfn;
1806         info->dev = pdev;
1807         info->domain = domain;
1808         spin_lock_irqsave(&device_domain_lock, flags);
1809         /* somebody else may have set up the domain first */
1810         found = find_domain(pdev);
1811         if (found != NULL) {
1812                 spin_unlock_irqrestore(&device_domain_lock, flags);
1813                 if (found != domain) {
1814                         domain_exit(domain);
1815                         domain = found;
1816                 }
1817                 free_devinfo_mem(info);
1818                 return domain;
1819         }
1820         list_add(&info->link, &domain->devices);
1821         list_add(&info->global, &device_domain_list);
1822         pdev->dev.archdata.iommu = info;
1823         spin_unlock_irqrestore(&device_domain_lock, flags);
1824         return domain;
1825 error:
1826         /* recheck it here; another caller may have set it up meanwhile */
1827         return find_domain(pdev);
1828 }
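
/*
 * Note: get_domain_for_dev() deliberately re-checks find_domain() under
 * device_domain_lock after building a new domain; if another CPU won the
 * race, the freshly built domain and devinfo are torn down and the
 * existing domain is returned instead, so a device never ends up attached
 * to two domains.
 */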
1829
1830 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1831                                       unsigned long long start,
1832                                       unsigned long long end)
1833 {
1834         struct dmar_domain *domain;
1835         unsigned long size;
1836         unsigned long long base;
1837         int ret;
1838
1839         printk(KERN_INFO
1840                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1841                 pci_name(pdev), start, end);
1842         /* page table init */
1843         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1844         if (!domain)
1845                 return -ENOMEM;
1846
1847         /* The address might not be aligned */
1848         base = start & PAGE_MASK;
1849         size = end - base;
1850         size = PAGE_ALIGN(size);
1851         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1852                         IOVA_PFN(base + size) - 1)) {
1853                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1854                 ret = -ENOMEM;
1855                 goto error;
1856         }
1857
1858         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1859                 size, base, pci_name(pdev));
1860         /*
1861          * The RMRR range might overlap a physical memory range that is
1862          * already mapped, so clear any existing PTEs first
1863          */
1864         dma_pte_clear_range(domain, base, base + size);
1865
1866         ret = domain_page_mapping(domain, base, base, size,
1867                 DMA_PTE_READ|DMA_PTE_WRITE);
1868         if (ret)
1869                 goto error;
1870
1871         /* context entry init */
1872         ret = domain_context_mapping(domain, pdev);
1873         if (!ret)
1874                 return 0;
1875 error:
1876         domain_exit(domain);
1877         return ret;
1878
1879 }
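
/*
 * Worked example for the alignment above (illustrative addresses only,
 * assuming 4KiB pages): an RMRR reported as [0xcf400, 0xd2fff] reaches
 * this function as start = 0xcf400, end = 0xd3000, giving
 *
 *   base = 0xcf400 & PAGE_MASK            = 0xcf000
 *   size = PAGE_ALIGN(0xd3000 - 0xcf000)  = 0x4000
 *
 * so iova pfns 0xcf..0xd2 are reserved and four pages are identity
 * mapped (iova == hpa) with read/write permission.
 */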
1880
1881 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1882         struct pci_dev *pdev)
1883 {
1884         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1885                 return 0;
1886         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1887                 rmrr->end_address + 1);
1888 }
1889
1890 #ifdef CONFIG_DMAR_GFX_WA
1891 struct iommu_prepare_data {
1892         struct pci_dev *pdev;
1893         int ret;
1894 };
1895
1896 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1897                                          unsigned long end_pfn, void *datax)
1898 {
1899         struct iommu_prepare_data *data;
1900
1901         data = (struct iommu_prepare_data *)datax;
1902
1903         data->ret = iommu_prepare_identity_map(data->pdev,
1904                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1905         return data->ret;
1906
1907 }
1908
1909 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1910 {
1911         int nid;
1912         struct iommu_prepare_data data;
1913
1914         data.pdev = pdev;
1915         data.ret = 0;
1916
1917         for_each_online_node(nid) {
1918                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1919                 if (data.ret)
1920                         return data.ret;
1921         }
1922         return data.ret;
1923 }
1924
1925 static void __init iommu_prepare_gfx_mapping(void)
1926 {
1927         struct pci_dev *pdev = NULL;
1928         int ret;
1929
1930         for_each_pci_dev(pdev) {
1931                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1932                                 !IS_GFX_DEVICE(pdev))
1933                         continue;
1934                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1935                         pci_name(pdev));
1936                 ret = iommu_prepare_with_active_regions(pdev);
1937                 if (ret)
1938                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1939         }
1940 }
1941 #else /* !CONFIG_DMAR_GFX_WA */
1942 static inline void iommu_prepare_gfx_mapping(void)
1943 {
1944         return;
1945 }
1946 #endif
1947
1948 #ifdef CONFIG_DMAR_FLOPPY_WA
1949 static inline void iommu_prepare_isa(void)
1950 {
1951         struct pci_dev *pdev;
1952         int ret;
1953
1954         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1955         if (!pdev)
1956                 return;
1957
1958         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1959         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1960
1961         if (ret)
1962                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1963                         "floppy might not work\n");
1964
1965 }
1966 #else
1967 static inline void iommu_prepare_isa(void)
1968 {
1969         return;
1970 }
1971 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1972
1973 static int __init init_dmars(void)
1974 {
1975         struct dmar_drhd_unit *drhd;
1976         struct dmar_rmrr_unit *rmrr;
1977         struct pci_dev *pdev;
1978         struct intel_iommu *iommu;
1979         int i, ret, unit = 0;
1980
1981         /*
1982          * for each drhd
1983          *    allocate root
1984          *    initialize and program root entry to not present
1985          * endfor
1986          */
1987         for_each_drhd_unit(drhd) {
1988                 g_num_of_iommus++;
1989                 /*
1990                  * No lock needed: this is only incremented in the
1991                  * single-threaded kernel __init code path; all other
1992                  * accesses are read-only.
1993                  */
1994         }
1995
1996         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1997                         GFP_KERNEL);
1998         if (!g_iommus) {
1999                 printk(KERN_ERR "Allocating global iommu array failed\n");
2000                 ret = -ENOMEM;
2001                 goto error;
2002         }
2003
2004         deferred_flush = kzalloc(g_num_of_iommus *
2005                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2006         if (!deferred_flush) {
2007                 kfree(g_iommus);
2008                 ret = -ENOMEM;
2009                 goto error;
2010         }
2011
2012         for_each_drhd_unit(drhd) {
2013                 if (drhd->ignored)
2014                         continue;
2015
2016                 iommu = drhd->iommu;
2017                 g_iommus[iommu->seq_id] = iommu;
2018
2019                 ret = iommu_init_domains(iommu);
2020                 if (ret)
2021                         goto error;
2022
2023                 /*
2024                  * TBD:
2025                  * we could share the same root & context tables
2026                  * among all IOMMUs. This needs to be split out later.
2027                  */
2028                 ret = iommu_alloc_root_entry(iommu);
2029                 if (ret) {
2030                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2031                         goto error;
2032                 }
2033         }
2034
2035         for_each_drhd_unit(drhd) {
2036                 if (drhd->ignored)
2037                         continue;
2038
2039                 iommu = drhd->iommu;
2040                 if (dmar_enable_qi(iommu)) {
2041                         /*
2042                          * Queued invalidation could not be enabled; fall
2043                          * back to register-based invalidation
2044                          */
2045                         iommu->flush.flush_context = __iommu_flush_context;
2046                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2047                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2048                                "invalidation\n",
2049                                (unsigned long long)drhd->reg_base_addr);
2050                 } else {
2051                         iommu->flush.flush_context = qi_flush_context;
2052                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2053                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2054                                "invalidation\n",
2055                                (unsigned long long)drhd->reg_base_addr);
2056                 }
2057         }
2058
2059         /*
2060          * For each rmrr
2061          *   for each dev attached to rmrr
2062          *   do
2063          *     locate drhd for dev, alloc domain for dev
2064          *     allocate free domain
2065          *     allocate page table entries for rmrr
2066          *     if context not allocated for bus
2067          *           allocate and init context
2068          *           set present in root table for this bus
2069          *     init context with domain, translation etc
2070          *    endfor
2071          * endfor
2072          */
2073         for_each_rmrr_units(rmrr) {
2074                 for (i = 0; i < rmrr->devices_cnt; i++) {
2075                         pdev = rmrr->devices[i];
2076                         /* some BIOSes list non-existent devices in the DMAR table */
2077                         if (!pdev)
2078                                 continue;
2079                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2080                         if (ret)
2081                                 printk(KERN_ERR
2082                                  "IOMMU: mapping reserved region failed\n");
2083                 }
2084         }
2085
2086         iommu_prepare_gfx_mapping();
2087
2088         iommu_prepare_isa();
2089
2090         /*
2091          * for each drhd
2092          *   enable fault log
2093          *   global invalidate context cache
2094          *   global invalidate iotlb
2095          *   enable translation
2096          */
2097         for_each_drhd_unit(drhd) {
2098                 if (drhd->ignored)
2099                         continue;
2100                 iommu = drhd->iommu;
2101                 sprintf(iommu->name, "dmar%d", unit++);
2102
2103                 iommu_flush_write_buffer(iommu);
2104
2105                 ret = dmar_set_interrupt(iommu);
2106                 if (ret)
2107                         goto error;
2108
2109                 iommu_set_root_entry(iommu);
2110
2111                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2112                                            0);
2113                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2114                                          0);
2115                 iommu_disable_protect_mem_regions(iommu);
2116
2117                 ret = iommu_enable_translation(iommu);
2118                 if (ret)
2119                         goto error;
2120         }
2121
2122         return 0;
2123 error:
2124         for_each_drhd_unit(drhd) {
2125                 if (drhd->ignored)
2126                         continue;
2127                 iommu = drhd->iommu;
2128                 free_iommu(iommu);
2129         }
2130         kfree(g_iommus);
2131         return ret;
2132 }
2133
2134 static inline u64 aligned_size(u64 host_addr, size_t size)
2135 {
2136         u64 addr;
2137         addr = (host_addr & (~PAGE_MASK)) + size;
2138         return PAGE_ALIGN(addr);
2139 }
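
/*
 * Example (illustrative values, 4KiB pages): aligned_size() returns the
 * number of bytes of page-granular mapping a buffer needs, including the
 * offset into its first page:
 *
 *   aligned_size(0x12345f00, 0x200) = PAGE_ALIGN(0xf00 + 0x200) = 0x2000
 *
 * i.e. a 512-byte buffer that straddles a page boundary still costs two
 * page mappings.
 */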
2140
2141 struct iova *
2142 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2143 {
2144         struct iova *piova;
2145
2146         /* Make sure it's in range */
2147         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2148         if (!size || (IOVA_START_ADDR + size > end))
2149                 return NULL;
2150
2151         piova = alloc_iova(&domain->iovad,
2152                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2153         return piova;
2154 }
2155
2156 static struct iova *
2157 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2158                    size_t size, u64 dma_mask)
2159 {
2160         struct pci_dev *pdev = to_pci_dev(dev);
2161         struct iova *iova = NULL;
2162
2163         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2164                 iova = iommu_alloc_iova(domain, size, dma_mask);
2165         else {
2166                 /*
2167                  * First try to allocate an io virtual address below
2168                  * DMA_32BIT_MASK; if that fails, try allocating from
2169                  * the higher range
2170                  */
2171                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2172                 if (!iova)
2173                         iova = iommu_alloc_iova(domain, size, dma_mask);
2174         }
2175
2176         if (!iova) {
2177                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2178                 return NULL;
2179         }
2180
2181         return iova;
2182 }
2183
2184 static struct dmar_domain *
2185 get_valid_domain_for_dev(struct pci_dev *pdev)
2186 {
2187         struct dmar_domain *domain;
2188         int ret;
2189
2190         domain = get_domain_for_dev(pdev,
2191                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2192         if (!domain) {
2193                 printk(KERN_ERR
2194                         "Allocating domain for %s failed\n", pci_name(pdev));
2195                 return NULL;
2196         }
2197
2198         /* make sure context mapping is ok */
2199         if (unlikely(!domain_context_mapped(pdev))) {
2200                 ret = domain_context_mapping(domain, pdev);
2201                 if (ret) {
2202                         printk(KERN_ERR
2203                                 "Domain context map for %s failed\n",
2204                                 pci_name(pdev));
2205                         return NULL;
2206                 }
2207         }
2208
2209         return domain;
2210 }
2211
2212 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2213                                      size_t size, int dir, u64 dma_mask)
2214 {
2215         struct pci_dev *pdev = to_pci_dev(hwdev);
2216         struct dmar_domain *domain;
2217         phys_addr_t start_paddr;
2218         struct iova *iova;
2219         int prot = 0;
2220         int ret;
2221         struct intel_iommu *iommu;
2222
2223         BUG_ON(dir == DMA_NONE);
2224         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2225                 return paddr;
2226
2227         domain = get_valid_domain_for_dev(pdev);
2228         if (!domain)
2229                 return 0;
2230
2231         iommu = domain_get_iommu(domain);
2232         size = aligned_size((u64)paddr, size);
2233
2234         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2235         if (!iova)
2236                 goto error;
2237
2238         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2239
2240         /*
2241          * Check if DMAR supports zero-length reads on write-only
2242          * mappings.
2243          */
2244         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2245                         !cap_zlr(iommu->cap))
2246                 prot |= DMA_PTE_READ;
2247         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2248                 prot |= DMA_PTE_WRITE;
2249         /*
2250          * The range [paddr, paddr + size) might cover only part of a page,
2251          * but we must map whole pages.  Note: if two parts of one page are
2252          * mapped separately, we might have two guest addresses mapping to
2253          * the same host paddr, but this is not a big problem
2254          */
2255         ret = domain_page_mapping(domain, start_paddr,
2256                 ((u64)paddr) & PAGE_MASK, size, prot);
2257         if (ret)
2258                 goto error;
2259
2260         /* it's a non-present to present mapping */
2261         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2262                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2263         if (ret)
2264                 iommu_flush_write_buffer(iommu);
2265
2266         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2267
2268 error:
2269         if (iova)
2270                 __free_iova(&domain->iovad, iova);
2271         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2272                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2273         return 0;
2274 }
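
/*
 * Sketch of a single map request through the function above (addresses
 * and the allocated iova are made up for illustration):
 *
 *   paddr = 0x81234567, size = 0x800, dir = DMA_FROM_DEVICE
 *   aligned_size()        -> 0x1000 (one page)
 *   __intel_alloc_iova()  -> iova->pfn_lo = 0x3fff0 (say)
 *   domain_page_mapping() maps iova 0x3fff0000 -> hpa 0x81234000 with
 *                         write permission (plus read if the IOMMU lacks
 *                         zero-length-read support, i.e. !cap_zlr())
 *   return value          = 0x3fff0000 + 0x567 = 0x3fff0567
 *
 * The device is then given 0x3fff0567 as its bus address; the CPU keeps
 * using the original physical/virtual address.
 */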
2275
2276 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2277                                  unsigned long offset, size_t size,
2278                                  enum dma_data_direction dir,
2279                                  struct dma_attrs *attrs)
2280 {
2281         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2282                                   dir, to_pci_dev(dev)->dma_mask);
2283 }
2284
2285 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2286                             size_t size, int dir)
2287 {
2288         return __intel_map_single(hwdev, paddr, size, dir,
2289                                   to_pci_dev(hwdev)->dma_mask);
2290 }
2291
2292 static void flush_unmaps(void)
2293 {
2294         int i, j;
2295
2296         timer_on = 0;
2297
2298         /* just flush them all */
2299         for (i = 0; i < g_num_of_iommus; i++) {
2300                 struct intel_iommu *iommu = g_iommus[i];
2301                 if (!iommu)
2302                         continue;
2303
2304                 if (deferred_flush[i].next) {
2305                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2306                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2307                         for (j = 0; j < deferred_flush[i].next; j++) {
2308                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2309                                                 deferred_flush[i].iova[j]);
2310                         }
2311                         deferred_flush[i].next = 0;
2312                 }
2313         }
2314
2315         list_size = 0;
2316 }
2317
2318 static void flush_unmaps_timeout(unsigned long data)
2319 {
2320         unsigned long flags;
2321
2322         spin_lock_irqsave(&async_umap_flush_lock, flags);
2323         flush_unmaps();
2324         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2325 }
2326
2327 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2328 {
2329         unsigned long flags;
2330         int next, iommu_id;
2331         struct intel_iommu *iommu;
2332
2333         spin_lock_irqsave(&async_umap_flush_lock, flags);
2334         if (list_size == HIGH_WATER_MARK)
2335                 flush_unmaps();
2336
2337         iommu = domain_get_iommu(dom);
2338         iommu_id = iommu->seq_id;
2339
2340         next = deferred_flush[iommu_id].next;
2341         deferred_flush[iommu_id].domain[next] = dom;
2342         deferred_flush[iommu_id].iova[next] = iova;
2343         deferred_flush[iommu_id].next++;
2344
2345         if (!timer_on) {
2346                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2347                 timer_on = 1;
2348         }
2349         list_size++;
2350         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2351 }
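
/*
 * The deferred-unmap machinery above batches IOTLB invalidations: each
 * non-strict unmap only queues its (domain, iova) pair in the per-iommu
 * deferred_flush table and arms a 10ms timer; flush_unmaps() then issues
 * a single global IOTLB flush per iommu and frees all queued IOVAs in one
 * go.  Reaching HIGH_WATER_MARK queued entries forces an immediate flush
 * instead of waiting for the timer.
 */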
2352
2353 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2354                              size_t size, enum dma_data_direction dir,
2355                              struct dma_attrs *attrs)
2356 {
2357         struct pci_dev *pdev = to_pci_dev(dev);
2358         struct dmar_domain *domain;
2359         unsigned long start_addr;
2360         struct iova *iova;
2361         struct intel_iommu *iommu;
2362
2363         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2364                 return;
2365         domain = find_domain(pdev);
2366         BUG_ON(!domain);
2367
2368         iommu = domain_get_iommu(domain);
2369
2370         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2371         if (!iova)
2372                 return;
2373
2374         start_addr = iova->pfn_lo << PAGE_SHIFT;
2375         size = aligned_size((u64)dev_addr, size);
2376
2377         pr_debug("Device %s unmapping: %lx@%llx\n",
2378                 pci_name(pdev), size, (unsigned long long)start_addr);
2379
2380         /*  clear the whole page */
2381         dma_pte_clear_range(domain, start_addr, start_addr + size);
2382         /* free page tables */
2383         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2384         if (intel_iommu_strict) {
2385                 if (iommu_flush_iotlb_psi(iommu,
2386                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2387                         iommu_flush_write_buffer(iommu);
2388                 /* free iova */
2389                 __free_iova(&domain->iovad, iova);
2390         } else {
2391                 add_unmap(domain, iova);
2392                 /*
2393                  * queue up the release of the unmap; batching saves roughly
2394                  * 1/6th of the cpu otherwise used up by per-unmap iotlb flushes...
2395                  */
2396         }
2397 }
2398
2399 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2400                         int dir)
2401 {
2402         intel_unmap_page(dev, dev_addr, size, dir, NULL);
2403 }
2404
2405 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2406                            dma_addr_t *dma_handle, gfp_t flags)
2407 {
2408         void *vaddr;
2409         int order;
2410
2411         size = PAGE_ALIGN(size);
2412         order = get_order(size);
2413         flags &= ~(GFP_DMA | GFP_DMA32);
2414
2415         vaddr = (void *)__get_free_pages(flags, order);
2416         if (!vaddr)
2417                 return NULL;
2418         memset(vaddr, 0, size);
2419
2420         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2421                                          DMA_BIDIRECTIONAL,
2422                                          hwdev->coherent_dma_mask);
2423         if (*dma_handle)
2424                 return vaddr;
2425         free_pages((unsigned long)vaddr, order);
2426         return NULL;
2427 }
2428
2429 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2430                          dma_addr_t dma_handle)
2431 {
2432         int order;
2433
2434         size = PAGE_ALIGN(size);
2435         order = get_order(size);
2436
2437         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2438         free_pages((unsigned long)vaddr, order);
2439 }
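
/*
 * Usage sketch (a hedged example, not code from any particular driver):
 * a driver allocating a descriptor ring through the generic DMA API is
 * expected to end up in the two functions above via intel_dma_ops:
 *
 *   struct ring *ring;                 // hypothetical driver structure
 *   dma_addr_t ring_dma;
 *
 *   ring = dma_alloc_coherent(&pdev->dev, sizeof(*ring), &ring_dma,
 *                             GFP_KERNEL);      // -> intel_alloc_coherent()
 *   writel(ring_dma, ioaddr + RING_BASE);       // device sees the IOVA
 *   ...
 *   dma_free_coherent(&pdev->dev, sizeof(*ring), ring, ring_dma);
 *                                               // -> intel_free_coherent()
 *
 * RING_BASE and struct ring are of course made up for the illustration.
 */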
2440
2441 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2442
2443 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2444                     int nelems, int dir)
2445 {
2446         int i;
2447         struct pci_dev *pdev = to_pci_dev(hwdev);
2448         struct dmar_domain *domain;
2449         unsigned long start_addr;
2450         struct iova *iova;
2451         size_t size = 0;
2452         void *addr;
2453         struct scatterlist *sg;
2454         struct intel_iommu *iommu;
2455
2456         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2457                 return;
2458
2459         domain = find_domain(pdev);
2460         BUG_ON(!domain);
2461
2462         iommu = domain_get_iommu(domain);
2463
2464         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2465         if (!iova)
2466                 return;
2467         for_each_sg(sglist, sg, nelems, i) {
2468                 addr = SG_ENT_VIRT_ADDRESS(sg);
2469                 size += aligned_size((u64)addr, sg->length);
2470         }
2471
2472         start_addr = iova->pfn_lo << PAGE_SHIFT;
2473
2474         /*  clear the whole page */
2475         dma_pte_clear_range(domain, start_addr, start_addr + size);
2476         /* free page tables */
2477         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2478
2479         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2480                         size >> VTD_PAGE_SHIFT, 0))
2481                 iommu_flush_write_buffer(iommu);
2482
2483         /* free iova */
2484         __free_iova(&domain->iovad, iova);
2485 }
2486
2487 static int intel_nontranslate_map_sg(struct device *hwdev,
2488         struct scatterlist *sglist, int nelems, int dir)
2489 {
2490         int i;
2491         struct scatterlist *sg;
2492
2493         for_each_sg(sglist, sg, nelems, i) {
2494                 BUG_ON(!sg_page(sg));
2495                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2496                 sg->dma_length = sg->length;
2497         }
2498         return nelems;
2499 }
2500
2501 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2502                  int dir)
2503 {
2504         void *addr;
2505         int i;
2506         struct pci_dev *pdev = to_pci_dev(hwdev);
2507         struct dmar_domain *domain;
2508         size_t size = 0;
2509         int prot = 0;
2510         size_t offset = 0;
2511         struct iova *iova = NULL;
2512         int ret;
2513         struct scatterlist *sg;
2514         unsigned long start_addr;
2515         struct intel_iommu *iommu;
2516
2517         BUG_ON(dir == DMA_NONE);
2518         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2519                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2520
2521         domain = get_valid_domain_for_dev(pdev);
2522         if (!domain)
2523                 return 0;
2524
2525         iommu = domain_get_iommu(domain);
2526
2527         for_each_sg(sglist, sg, nelems, i) {
2528                 addr = SG_ENT_VIRT_ADDRESS(sg);
2529                 addr = (void *)virt_to_phys(addr);
2530                 size += aligned_size((u64)addr, sg->length);
2531         }
2532
2533         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2534         if (!iova) {
2535                 sglist->dma_length = 0;
2536                 return 0;
2537         }
2538
2539         /*
2540          * Check if DMAR supports zero-length reads on write-only
2541          * mappings.
2542          */
2543         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2544                         !cap_zlr(iommu->cap))
2545                 prot |= DMA_PTE_READ;
2546         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2547                 prot |= DMA_PTE_WRITE;
2548
2549         start_addr = iova->pfn_lo << PAGE_SHIFT;
2550         offset = 0;
2551         for_each_sg(sglist, sg, nelems, i) {
2552                 addr = SG_ENT_VIRT_ADDRESS(sg);
2553                 addr = (void *)virt_to_phys(addr);
2554                 size = aligned_size((u64)addr, sg->length);
2555                 ret = domain_page_mapping(domain, start_addr + offset,
2556                         ((u64)addr) & PAGE_MASK,
2557                         size, prot);
2558                 if (ret) {
2559                         /*  clear the page */
2560                         dma_pte_clear_range(domain, start_addr,
2561                                   start_addr + offset);
2562                         /* free page tables */
2563                         dma_pte_free_pagetable(domain, start_addr,
2564                                   start_addr + offset);
2565                         /* free iova */
2566                         __free_iova(&domain->iovad, iova);
2567                         return 0;
2568                 }
2569                 sg->dma_address = start_addr + offset +
2570                                 ((u64)addr & (~PAGE_MASK));
2571                 sg->dma_length = sg->length;
2572                 offset += size;
2573         }
2574
2575         /* it's a non-present to present mapping */
2576         if (iommu_flush_iotlb_psi(iommu, domain->id,
2577                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2578                 iommu_flush_write_buffer(iommu);
2579         return nelems;
2580 }
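
/*
 * Worked example for the scatterlist layout above (illustrative values,
 * 4KiB pages; the iova pfn is made up): two entries,
 *
 *   sg[0]: page offset 0x200, length 0x1800 -> aligned_size = 0x2000
 *   sg[1]: page offset 0x000, length 0x0400 -> aligned_size = 0x1000
 *
 * A single 0x3000-byte IOVA block is allocated, say at 0x3ffe0000, and the
 * mapping loop lays the entries out back to back at page granularity:
 *
 *   sg[0].dma_address = 0x3ffe0000 + 0x0000 + 0x200 = 0x3ffe0200
 *   sg[1].dma_address = 0x3ffe0000 + 0x2000 + 0x000 = 0x3ffe2000
 *
 * so one contiguous IOVA range backs the whole list, and a single PSI
 * IOTLB flush covers it at the end.
 */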
2581
2582 static struct dma_mapping_ops intel_dma_ops = {
2583         .alloc_coherent = intel_alloc_coherent,
2584         .free_coherent = intel_free_coherent,
2585         .map_sg = intel_map_sg,
2586         .unmap_sg = intel_unmap_sg,
2587 #ifdef CONFIG_X86_64
2588         .map_page = intel_map_page,
2589         .unmap_page = intel_unmap_page,
2590 #endif
2591 };
2592
2593 static inline int iommu_domain_cache_init(void)
2594 {
2595         int ret = 0;
2596
2597         iommu_domain_cache = kmem_cache_create("iommu_domain",
2598                                          sizeof(struct dmar_domain),
2599                                          0,
2600                                          SLAB_HWCACHE_ALIGN,
2601                                          NULL);
2603         if (!iommu_domain_cache) {
2604                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2605                 ret = -ENOMEM;
2606         }
2607
2608         return ret;
2609 }
2610
2611 static inline int iommu_devinfo_cache_init(void)
2612 {
2613         int ret = 0;
2614
2615         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2616                                          sizeof(struct device_domain_info),
2617                                          0,
2618                                          SLAB_HWCACHE_ALIGN,
2619                                          NULL);
2620         if (!iommu_devinfo_cache) {
2621                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2622                 ret = -ENOMEM;
2623         }
2624
2625         return ret;
2626 }
2627
2628 static inline int iommu_iova_cache_init(void)
2629 {
2630         int ret = 0;
2631
2632         iommu_iova_cache = kmem_cache_create("iommu_iova",
2633                                          sizeof(struct iova),
2634                                          0,
2635                                          SLAB_HWCACHE_ALIGN,
2636                                          NULL);
2637         if (!iommu_iova_cache) {
2638                 printk(KERN_ERR "Couldn't create iova cache\n");
2639                 ret = -ENOMEM;
2640         }
2641
2642         return ret;
2643 }
2644
2645 static int __init iommu_init_mempool(void)
2646 {
2647         int ret;
2648         ret = iommu_iova_cache_init();
2649         if (ret)
2650                 return ret;
2651
2652         ret = iommu_domain_cache_init();
2653         if (ret)
2654                 goto domain_error;
2655
2656         ret = iommu_devinfo_cache_init();
2657         if (!ret)
2658                 return ret;
2659
2660         kmem_cache_destroy(iommu_domain_cache);
2661 domain_error:
2662         kmem_cache_destroy(iommu_iova_cache);
2663
2664         return -ENOMEM;
2665 }
2666
2667 static void __init iommu_exit_mempool(void)
2668 {
2669         kmem_cache_destroy(iommu_devinfo_cache);
2670         kmem_cache_destroy(iommu_domain_cache);
2671         kmem_cache_destroy(iommu_iova_cache);
2672
2673 }
2674
2675 static void __init init_no_remapping_devices(void)
2676 {
2677         struct dmar_drhd_unit *drhd;
2678
2679         for_each_drhd_unit(drhd) {
2680                 if (!drhd->include_all) {
2681                         int i;
2682                         for (i = 0; i < drhd->devices_cnt; i++)
2683                                 if (drhd->devices[i] != NULL)
2684                                         break;
2685                         /* ignore DMAR unit if no pci devices exist */
2686                         if (i == drhd->devices_cnt)
2687                                 drhd->ignored = 1;
2688                 }
2689         }
2690
2691         if (dmar_map_gfx)
2692                 return;
2693
2694         for_each_drhd_unit(drhd) {
2695                 int i;
2696                 if (drhd->ignored || drhd->include_all)
2697                         continue;
2698
2699                 for (i = 0; i < drhd->devices_cnt; i++)
2700                         if (drhd->devices[i] &&
2701                                 !IS_GFX_DEVICE(drhd->devices[i]))
2702                                 break;
2703
2704                 if (i < drhd->devices_cnt)
2705                         continue;
2706
2707                 /* bypass IOMMU if it is just for gfx devices */
2708                 drhd->ignored = 1;
2709                 for (i = 0; i < drhd->devices_cnt; i++) {
2710                         if (!drhd->devices[i])
2711                                 continue;
2712                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2713                 }
2714         }
2715 }
2716
2717 int __init intel_iommu_init(void)
2718 {
2719         int ret = 0;
2720
2721         if (dmar_table_init())
2722                 return  -ENODEV;
2723
2724         if (dmar_dev_scope_init())
2725                 return  -ENODEV;
2726
2727         /*
2728          * Check the need for DMA-remapping initialization now.
2729          * Above initialization will also be used by Interrupt-remapping.
2730          */
2731         if (no_iommu || swiotlb || dmar_disabled)
2732                 return -ENODEV;
2733
2734         iommu_init_mempool();
2735         dmar_init_reserved_ranges();
2736
2737         init_no_remapping_devices();
2738
2739         ret = init_dmars();
2740         if (ret) {
2741                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2742                 put_iova_domain(&reserved_iova_list);
2743                 iommu_exit_mempool();
2744                 return ret;
2745         }
2746         printk(KERN_INFO
2747         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2748
2749         init_timer(&unmap_timer);
2750         force_iommu = 1;
2751         dma_ops = &intel_dma_ops;
2752
2753         register_iommu(&intel_iommu_ops);
2754
2755         return 0;
2756 }
2757
2758 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2759                                   struct pci_dev *pdev)
2760 {
2761         struct device_domain_info *info;
2762         unsigned long flags;
2763
2764         info = alloc_devinfo_mem();
2765         if (!info)
2766                 return -ENOMEM;
2767
2768         info->bus = pdev->bus->number;
2769         info->devfn = pdev->devfn;
2770         info->dev = pdev;
2771         info->domain = domain;
2772
2773         spin_lock_irqsave(&device_domain_lock, flags);
2774         list_add(&info->link, &domain->devices);
2775         list_add(&info->global, &device_domain_list);
2776         pdev->dev.archdata.iommu = info;
2777         spin_unlock_irqrestore(&device_domain_lock, flags);
2778
2779         return 0;
2780 }
2781
2782 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2783                                           struct pci_dev *pdev)
2784 {
2785         struct device_domain_info *info;
2786         struct intel_iommu *iommu;
2787         unsigned long flags;
2788         int found = 0;
2789         struct list_head *entry, *tmp;
2790
2791         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2792         if (!iommu)
2793                 return;
2794
2795         spin_lock_irqsave(&device_domain_lock, flags);
2796         list_for_each_safe(entry, tmp, &domain->devices) {
2797                 info = list_entry(entry, struct device_domain_info, link);
2798                 if (info->bus == pdev->bus->number &&
2799                     info->devfn == pdev->devfn) {
2800                         list_del(&info->link);
2801                         list_del(&info->global);
2802                         if (info->dev)
2803                                 info->dev->dev.archdata.iommu = NULL;
2804                         spin_unlock_irqrestore(&device_domain_lock, flags);
2805
2806                         iommu_detach_dev(iommu, info->bus, info->devfn);
2807                         free_devinfo_mem(info);
2808
2809                         spin_lock_irqsave(&device_domain_lock, flags);
2810
2811                         if (found)
2812                                 break;
2813                         else
2814                                 continue;
2815                 }
2816
2817                 /* if there are no other devices owned by this domain
2818                  * under the same iommu, clear this iommu in iommu_bmp and
2819                  * update the iommu count and coherency
2820                  */
2821                 if (device_to_iommu(info->bus, info->devfn) == iommu)
2822                         found = 1;
2823         }
2824
2825         if (found == 0) {
2826                 unsigned long tmp_flags;
2827                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2828                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2829                 domain->iommu_count--;
2830                 domain_update_iommu_coherency(domain);
2831                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2832         }
2833
2834         spin_unlock_irqrestore(&device_domain_lock, flags);
2835 }
2836
2837 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2838 {
2839         struct device_domain_info *info;
2840         struct intel_iommu *iommu;
2841         unsigned long flags1, flags2;
2842
2843         spin_lock_irqsave(&device_domain_lock, flags1);
2844         while (!list_empty(&domain->devices)) {
2845                 info = list_entry(domain->devices.next,
2846                         struct device_domain_info, link);
2847                 list_del(&info->link);
2848                 list_del(&info->global);
2849                 if (info->dev)
2850                         info->dev->dev.archdata.iommu = NULL;
2851
2852                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2853
2854                 iommu = device_to_iommu(info->bus, info->devfn);
2855                 iommu_detach_dev(iommu, info->bus, info->devfn);
2856
2857                 /* clear this iommu in iommu_bmp, update iommu count
2858                  * and coherency
2859                  */
2860                 spin_lock_irqsave(&domain->iommu_lock, flags2);
2861                 if (test_and_clear_bit(iommu->seq_id,
2862                                        &domain->iommu_bmp)) {
2863                         domain->iommu_count--;
2864                         domain_update_iommu_coherency(domain);
2865                 }
2866                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2867
2868                 free_devinfo_mem(info);
2869                 spin_lock_irqsave(&device_domain_lock, flags1);
2870         }
2871         spin_unlock_irqrestore(&device_domain_lock, flags1);
2872 }
2873
2874 /* domain id for virtual machine domains; it is never programmed into a context entry */
2875 static unsigned long vm_domid;
2876
2877 static int vm_domain_min_agaw(struct dmar_domain *domain)
2878 {
2879         int i;
2880         int min_agaw = domain->agaw;
2881
2882         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2883         for (; i < g_num_of_iommus; ) {
2884                 if (min_agaw > g_iommus[i]->agaw)
2885                         min_agaw = g_iommus[i]->agaw;
2886
2887                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2888         }
2889
2890         return min_agaw;
2891 }
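
/*
 * vm_domain_min_agaw() walks the domain's iommu_bmp and returns the
 * smallest agaw among the IOMMUs the domain is attached to; the generic
 * map path below (intel_iommu_map_range) uses it to refuse mappings that
 * would exceed what the weakest attached IOMMU can address.
 */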
2892
2893 static struct dmar_domain *iommu_alloc_vm_domain(void)
2894 {
2895         struct dmar_domain *domain;
2896
2897         domain = alloc_domain_mem();
2898         if (!domain)
2899                 return NULL;
2900
2901         domain->id = vm_domid++;
2902         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2903         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2904
2905         return domain;
2906 }
2907
2908 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2909 {
2910         int adjust_width;
2911
2912         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2913         spin_lock_init(&domain->mapping_lock);
2914         spin_lock_init(&domain->iommu_lock);
2915
2916         domain_reserve_special_ranges(domain);
2917
2918         /* calculate AGAW */
2919         domain->gaw = guest_width;
2920         adjust_width = guestwidth_to_adjustwidth(guest_width);
2921         domain->agaw = width_to_agaw(adjust_width);
2922
2923         INIT_LIST_HEAD(&domain->devices);
2924
2925         domain->iommu_count = 0;
2926         domain->iommu_coherency = 0;
2927         domain->max_addr = 0;
2928
2929         /* always allocate the top pgd */
2930         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2931         if (!domain->pgd)
2932                 return -ENOMEM;
2933         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2934         return 0;
2935 }
2936
2937 static void iommu_free_vm_domain(struct dmar_domain *domain)
2938 {
2939         unsigned long flags;
2940         struct dmar_drhd_unit *drhd;
2941         struct intel_iommu *iommu;
2942         unsigned long i;
2943         unsigned long ndomains;
2944
2945         for_each_drhd_unit(drhd) {
2946                 if (drhd->ignored)
2947                         continue;
2948                 iommu = drhd->iommu;
2949
2950                 ndomains = cap_ndoms(iommu->cap);
2951                 i = find_first_bit(iommu->domain_ids, ndomains);
2952                 for (; i < ndomains; ) {
2953                         if (iommu->domains[i] == domain) {
2954                                 spin_lock_irqsave(&iommu->lock, flags);
2955                                 clear_bit(i, iommu->domain_ids);
2956                                 iommu->domains[i] = NULL;
2957                                 spin_unlock_irqrestore(&iommu->lock, flags);
2958                                 break;
2959                         }
2960                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
2961                 }
2962         }
2963 }
2964
2965 static void vm_domain_exit(struct dmar_domain *domain)
2966 {
2967         u64 end;
2968
2969         /* Domain 0 is reserved, so don't process it */
2970         if (!domain)
2971                 return;
2972
2973         vm_domain_remove_all_dev_info(domain);
2974         /* destroy iovas */
2975         put_iova_domain(&domain->iovad);
2976         end = DOMAIN_MAX_ADDR(domain->gaw);
2977         end = end & (~VTD_PAGE_MASK);
2978
2979         /* clear ptes */
2980         dma_pte_clear_range(domain, 0, end);
2981
2982         /* free page tables */
2983         dma_pte_free_pagetable(domain, 0, end);
2984
2985         iommu_free_vm_domain(domain);
2986         free_domain_mem(domain);
2987 }
2988
2989 static int intel_iommu_domain_init(struct iommu_domain *domain)
2990 {
2991         struct dmar_domain *dmar_domain;
2992
2993         dmar_domain = iommu_alloc_vm_domain();
2994         if (!dmar_domain) {
2995                 printk(KERN_ERR
2996                         "intel_iommu_domain_init: dmar_domain == NULL\n");
2997                 return -ENOMEM;
2998         }
2999         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3000                 printk(KERN_ERR
3001                         "intel_iommu_domain_init() failed\n");
3002                 vm_domain_exit(dmar_domain);
3003                 return -ENOMEM;
3004         }
3005         domain->priv = dmar_domain;
3006
3007         return 0;
3008 }
3009
3010 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3011 {
3012         struct dmar_domain *dmar_domain = domain->priv;
3013
3014         domain->priv = NULL;
3015         vm_domain_exit(dmar_domain);
3016 }
3017
3018 static int intel_iommu_attach_device(struct iommu_domain *domain,
3019                                      struct device *dev)
3020 {
3021         struct dmar_domain *dmar_domain = domain->priv;
3022         struct pci_dev *pdev = to_pci_dev(dev);
3023         struct intel_iommu *iommu;
3024         int addr_width;
3025         u64 end;
3026         int ret;
3027
3028         /* normally pdev is not mapped */
3029         if (unlikely(domain_context_mapped(pdev))) {
3030                 struct dmar_domain *old_domain;
3031
3032                 old_domain = find_domain(pdev);
3033                 if (old_domain) {
3034                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3035                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3036                         else
3037                                 domain_remove_dev_info(old_domain);
3038                 }
3039         }
3040
3041         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3042         if (!iommu)
3043                 return -ENODEV;
3044
3045         /* check if this iommu agaw is sufficient for max mapped address */
3046         addr_width = agaw_to_width(iommu->agaw);
3047         end = DOMAIN_MAX_ADDR(addr_width);
3048         end = end & VTD_PAGE_MASK;
3049         if (end < dmar_domain->max_addr) {
3050                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3051                        "sufficient for the mapped address (%llx)\n",
3052                        __func__, iommu->agaw, dmar_domain->max_addr);
3053                 return -EFAULT;
3054         }
3055
3056         ret = domain_context_mapping(dmar_domain, pdev);
3057         if (ret)
3058                 return ret;
3059
3060         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3061         return ret;
3062 }
3063
3064 static void intel_iommu_detach_device(struct iommu_domain *domain,
3065                                       struct device *dev)
3066 {
3067         struct dmar_domain *dmar_domain = domain->priv;
3068         struct pci_dev *pdev = to_pci_dev(dev);
3069
3070         vm_domain_remove_one_dev_info(dmar_domain, pdev);
3071 }
3072
3073 static int intel_iommu_map_range(struct iommu_domain *domain,
3074                                  unsigned long iova, phys_addr_t hpa,
3075                                  size_t size, int iommu_prot)
3076 {
3077         struct dmar_domain *dmar_domain = domain->priv;
3078         u64 max_addr;
3079         int addr_width;
3080         int prot = 0;
3081         int ret;
3082
3083         if (iommu_prot & IOMMU_READ)
3084                 prot |= DMA_PTE_READ;
3085         if (iommu_prot & IOMMU_WRITE)
3086                 prot |= DMA_PTE_WRITE;
3087
3088         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3089         if (dmar_domain->max_addr < max_addr) {
3090                 int min_agaw;
3091                 u64 end;
3092
3093                 /* check if minimum agaw is sufficient for mapped address */
3094                 min_agaw = vm_domain_min_agaw(dmar_domain);
3095                 addr_width = agaw_to_width(min_agaw);
3096                 end = DOMAIN_MAX_ADDR(addr_width);
3097                 end = end & VTD_PAGE_MASK;
3098                 if (end < max_addr) {
3099                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3100                                "sufficient for the mapped address (%llx)\n",
3101                                __func__, min_agaw, max_addr);
3102                         return -EFAULT;
3103                 }
3104                 dmar_domain->max_addr = max_addr;
3105         }
3106
3107         ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3108         return ret;
3109 }
3110
3111 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3112                                     unsigned long iova, size_t size)
3113 {
3114         struct dmar_domain *dmar_domain = domain->priv;
3115         dma_addr_t base;
3116
3117         /* The address might not be aligned */
3118         base = iova & VTD_PAGE_MASK;
3119         size = VTD_PAGE_ALIGN(size);
3120         dma_pte_clear_range(dmar_domain, base, base + size);
3121
3122         if (dmar_domain->max_addr == base + size)
3123                 dmar_domain->max_addr = base;
3124 }
3125
3126 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3127                                             unsigned long iova)
3128 {
3129         struct dmar_domain *dmar_domain = domain->priv;
3130         struct dma_pte *pte;
3131         u64 phys = 0;
3132
3133         pte = addr_to_dma_pte(dmar_domain, iova);
3134         if (pte)
3135                 phys = dma_pte_addr(pte);
3136
3137         return phys;
3138 }
3139
3140 static struct iommu_ops intel_iommu_ops = {
3141         .domain_init    = intel_iommu_domain_init,
3142         .domain_destroy = intel_iommu_domain_destroy,
3143         .attach_dev     = intel_iommu_attach_device,
3144         .detach_dev     = intel_iommu_detach_device,
3145         .map            = intel_iommu_map_range,
3146         .unmap          = intel_iommu_unmap_range,
3147         .iova_to_phys   = intel_iommu_iova_to_phys,
3148 };
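
/*
 * Hedged usage sketch: once register_iommu(&intel_iommu_ops) has run in
 * intel_iommu_init(), a caller such as KVM device assignment is expected
 * to reach the callbacks above through the generic IOMMU API of this era
 * (iommu_domain_alloc(), iommu_attach_device(), iommu_map_range(), ...),
 * roughly:
 *
 *   struct iommu_domain *dom = iommu_domain_alloc(); // -> intel_iommu_domain_init()
 *   iommu_attach_device(dom, &pdev->dev);            // -> intel_iommu_attach_device()
 *   iommu_map_range(dom, gpa, hpa, size,
 *                   IOMMU_READ | IOMMU_WRITE);       // -> intel_iommu_map_range()
 *   ...
 *   iommu_unmap_range(dom, gpa, size);               // -> intel_iommu_unmap_range()
 *   iommu_detach_device(dom, &pdev->dev);            // -> intel_iommu_detach_device()
 *   iommu_domain_free(dom);                          // -> intel_iommu_domain_destroy()
 *
 * The wrapper names belong to the generic IOMMU layer, not to this file,
 * so treat the snippet as an illustration of the ops table rather than a
 * reference for that API.
 */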