PCI: iova: lockdep false alarm fix
[linux-2.6.git] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  */
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/slab.h>
26 #include <linux/irq.h>
27 #include <linux/interrupt.h>
28 #include <linux/sysdev.h>
29 #include <linux/spinlock.h>
30 #include <linux/pci.h>
31 #include <linux/dmar.h>
32 #include <linux/dma-mapping.h>
33 #include <linux/mempool.h>
34 #include "iova.h"
35 #include "intel-iommu.h"
36 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
37 #include <asm/cacheflush.h>
38 #include <asm/gart.h>
39 #include "pci.h"
40
/*
 * PCI class-code predicates.  The argument is any expression yielding a
 * pointer to an object with a `class' member; it is parenthesized so that
 * expressions such as `devs + 1' expand correctly.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)

#define IOAPIC_RANGE_START      (0xfee00000)
#define IOAPIC_RANGE_END        (0xfeefffff)
#define IOVA_START_ADDR         (0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */

/* All-ones value that is `gaw' bits wide (gaw may be any integer expression). */
#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << (gaw)) - 1)
53
54 static void domain_remove_dev_info(struct dmar_domain *domain);
55
56 static int dmar_disabled;
57 static int __initdata dmar_map_gfx = 1;
58 static int dmar_forcedac;
59
60 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
61 static DEFINE_SPINLOCK(device_domain_lock);
62 static LIST_HEAD(device_domain_list);
63
64 static int __init intel_iommu_setup(char *str)
65 {
66         if (!str)
67                 return -EINVAL;
68         while (*str) {
69                 if (!strncmp(str, "off", 3)) {
70                         dmar_disabled = 1;
71                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
72                 } else if (!strncmp(str, "igfx_off", 8)) {
73                         dmar_map_gfx = 0;
74                         printk(KERN_INFO
75                                 "Intel-IOMMU: disable GFX device mapping\n");
76                 } else if (!strncmp(str, "forcedac", 8)) {
77                         printk (KERN_INFO
78                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
79                         dmar_forcedac = 1;
80                 }
81
82                 str += strcspn(str, ",");
83                 while (*str == ',')
84                         str++;
85         }
86         return 0;
87 }
88 __setup("intel_iommu=", intel_iommu_setup);
89
/* Caches used by the domain, device-info and iova alloc/free helpers. */
static struct kmem_cache *iommu_domain_cache, *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;
93
94 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
95 {
96         unsigned int flags;
97         void *vaddr;
98
99         /* trying to avoid low memory issues */
100         flags = current->flags & PF_MEMALLOC;
101         current->flags |= PF_MEMALLOC;
102         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
103         current->flags &= (~PF_MEMALLOC | flags);
104         return vaddr;
105 }
106
107
108 static inline void *alloc_pgtable_page(void)
109 {
110         unsigned int flags;
111         void *vaddr;
112
113         /* trying to avoid low memory issues */
114         flags = current->flags & PF_MEMALLOC;
115         current->flags |= PF_MEMALLOC;
116         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
117         current->flags &= (~PF_MEMALLOC | flags);
118         return vaddr;
119 }
120
/* Pass the page at @vaddr to free_page(). */
static inline void free_pgtable_page(void *vaddr)
{
        unsigned long page = (unsigned long)vaddr;

        free_page(page);
}
125
126 static inline void *alloc_domain_mem(void)
127 {
128         return iommu_kmem_cache_alloc(iommu_domain_cache);
129 }
130
131 static inline void free_domain_mem(void *vaddr)
132 {
133         kmem_cache_free(iommu_domain_cache, vaddr);
134 }
135
136 static inline void * alloc_devinfo_mem(void)
137 {
138         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
139 }
140
141 static inline void free_devinfo_mem(void *vaddr)
142 {
143         kmem_cache_free(iommu_devinfo_cache, vaddr);
144 }
145
146 struct iova *alloc_iova_mem(void)
147 {
148         return iommu_kmem_cache_alloc(iommu_iova_cache);
149 }
150
151 void free_iova_mem(struct iova *iova)
152 {
153         kmem_cache_free(iommu_iova_cache, iova);
154 }
155
156 static inline void __iommu_flush_cache(
157         struct intel_iommu *iommu, void *addr, int size)
158 {
159         if (!ecap_coherent(iommu->ecap))
160                 clflush_cache_range(addr, size);
161 }
162
163 /* Gets context entry for a given bus and devfn */
164 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
165                 u8 bus, u8 devfn)
166 {
167         struct root_entry *root;
168         struct context_entry *context;
169         unsigned long phy_addr;
170         unsigned long flags;
171
172         spin_lock_irqsave(&iommu->lock, flags);
173         root = &iommu->root_entry[bus];
174         context = get_context_addr_from_root(root);
175         if (!context) {
176                 context = (struct context_entry *)alloc_pgtable_page();
177                 if (!context) {
178                         spin_unlock_irqrestore(&iommu->lock, flags);
179                         return NULL;
180                 }
181                 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
182                 phy_addr = virt_to_phys((void *)context);
183                 set_root_value(root, phy_addr);
184                 set_root_present(root);
185                 __iommu_flush_cache(iommu, root, sizeof(*root));
186         }
187         spin_unlock_irqrestore(&iommu->lock, flags);
188         return &context[devfn];
189 }
190
191 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
192 {
193         struct root_entry *root;
194         struct context_entry *context;
195         int ret;
196         unsigned long flags;
197
198         spin_lock_irqsave(&iommu->lock, flags);
199         root = &iommu->root_entry[bus];
200         context = get_context_addr_from_root(root);
201         if (!context) {
202                 ret = 0;
203                 goto out;
204         }
205         ret = context_present(context[devfn]);
206 out:
207         spin_unlock_irqrestore(&iommu->lock, flags);
208         return ret;
209 }
210
211 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
212 {
213         struct root_entry *root;
214         struct context_entry *context;
215         unsigned long flags;
216
217         spin_lock_irqsave(&iommu->lock, flags);
218         root = &iommu->root_entry[bus];
219         context = get_context_addr_from_root(root);
220         if (context) {
221                 context_clear_entry(context[devfn]);
222                 __iommu_flush_cache(iommu, &context[devfn], \
223                         sizeof(*context));
224         }
225         spin_unlock_irqrestore(&iommu->lock, flags);
226 }
227
228 static void free_context_table(struct intel_iommu *iommu)
229 {
230         struct root_entry *root;
231         int i;
232         unsigned long flags;
233         struct context_entry *context;
234
235         spin_lock_irqsave(&iommu->lock, flags);
236         if (!iommu->root_entry) {
237                 goto out;
238         }
239         for (i = 0; i < ROOT_ENTRY_NR; i++) {
240                 root = &iommu->root_entry[i];
241                 context = get_context_addr_from_root(root);
242                 if (context)
243                         free_pgtable_page(context);
244         }
245         free_pgtable_page(iommu->root_entry);
246         iommu->root_entry = NULL;
247 out:
248         spin_unlock_irqrestore(&iommu->lock, flags);
249 }
250
/* page table handling */

/* Number of address bits consumed per level, and the matching index mask. */
#define LEVEL_STRIDE            (9)
#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
254
/* Convert an agaw value to a level count: agaw plus two. */
static inline int agaw_to_level(int agaw)
{
        int levels = agaw + 2;

        return levels;
}
259
260 static inline int agaw_to_width(int agaw)
261 {
262         return 30 + agaw * LEVEL_STRIDE;
263
264 }
265
266 static inline int width_to_agaw(int width)
267 {
268         return (width - 30) / LEVEL_STRIDE;
269 }
270
271 static inline unsigned int level_to_offset_bits(int level)
272 {
273         return (12 + (level - 1) * LEVEL_STRIDE);
274 }
275
276 static inline int address_level_offset(u64 addr, int level)
277 {
278         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
279 }
280
281 static inline u64 level_mask(int level)
282 {
283         return ((u64)-1 << level_to_offset_bits(level));
284 }
285
286 static inline u64 level_size(int level)
287 {
288         return ((u64)1 << level_to_offset_bits(level));
289 }
290
291 static inline u64 align_to_level(u64 addr, int level)
292 {
293         return ((addr + level_size(level) - 1) & level_mask(level));
294 }
295
296 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
297 {
298         int addr_width = agaw_to_width(domain->agaw);
299         struct dma_pte *parent, *pte = NULL;
300         int level = agaw_to_level(domain->agaw);
301         int offset;
302         unsigned long flags;
303
304         BUG_ON(!domain->pgd);
305
306         addr &= (((u64)1) << addr_width) - 1;
307         parent = domain->pgd;
308
309         spin_lock_irqsave(&domain->mapping_lock, flags);
310         while (level > 0) {
311                 void *tmp_page;
312
313                 offset = address_level_offset(addr, level);
314                 pte = &parent[offset];
315                 if (level == 1)
316                         break;
317
318                 if (!dma_pte_present(*pte)) {
319                         tmp_page = alloc_pgtable_page();
320
321                         if (!tmp_page) {
322                                 spin_unlock_irqrestore(&domain->mapping_lock,
323                                         flags);
324                                 return NULL;
325                         }
326                         __iommu_flush_cache(domain->iommu, tmp_page,
327                                         PAGE_SIZE_4K);
328                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
329                         /*
330                          * high level table always sets r/w, last level page
331                          * table control read/write
332                          */
333                         dma_set_pte_readable(*pte);
334                         dma_set_pte_writable(*pte);
335                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
336                 }
337                 parent = phys_to_virt(dma_pte_addr(*pte));
338                 level--;
339         }
340
341         spin_unlock_irqrestore(&domain->mapping_lock, flags);
342         return pte;
343 }
344
345 /* return address's pte at specific level */
346 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
347                 int level)
348 {
349         struct dma_pte *parent, *pte = NULL;
350         int total = agaw_to_level(domain->agaw);
351         int offset;
352
353         parent = domain->pgd;
354         while (level <= total) {
355                 offset = address_level_offset(addr, total);
356                 pte = &parent[offset];
357                 if (level == total)
358                         return pte;
359
360                 if (!dma_pte_present(*pte))
361                         break;
362                 parent = phys_to_virt(dma_pte_addr(*pte));
363                 total--;
364         }
365         return NULL;
366 }
367
368 /* clear one page's page table */
369 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
370 {
371         struct dma_pte *pte = NULL;
372
373         /* get last level pte */
374         pte = dma_addr_level_pte(domain, addr, 1);
375
376         if (pte) {
377                 dma_clear_pte(*pte);
378                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
379         }
380 }
381
382 /* clear last level pte, a tlb flush should be followed */
383 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
384 {
385         int addr_width = agaw_to_width(domain->agaw);
386
387         start &= (((u64)1) << addr_width) - 1;
388         end &= (((u64)1) << addr_width) - 1;
389         /* in case it's partial page */
390         start = PAGE_ALIGN_4K(start);
391         end &= PAGE_MASK_4K;
392
393         /* we don't need lock here, nobody else touches the iova range */
394         while (start < end) {
395                 dma_pte_clear_one(domain, start);
396                 start += PAGE_SIZE_4K;
397         }
398 }
399
400 /* free page table pages. last level pte should already be cleared */
401 static void dma_pte_free_pagetable(struct dmar_domain *domain,
402         u64 start, u64 end)
403 {
404         int addr_width = agaw_to_width(domain->agaw);
405         struct dma_pte *pte;
406         int total = agaw_to_level(domain->agaw);
407         int level;
408         u64 tmp;
409
410         start &= (((u64)1) << addr_width) - 1;
411         end &= (((u64)1) << addr_width) - 1;
412
413         /* we don't need lock here, nobody else touches the iova range */
414         level = 2;
415         while (level <= total) {
416                 tmp = align_to_level(start, level);
417                 if (tmp >= end || (tmp + level_size(level) > end))
418                         return;
419
420                 while (tmp < end) {
421                         pte = dma_addr_level_pte(domain, tmp, level);
422                         if (pte) {
423                                 free_pgtable_page(
424                                         phys_to_virt(dma_pte_addr(*pte)));
425                                 dma_clear_pte(*pte);
426                                 __iommu_flush_cache(domain->iommu,
427                                                 pte, sizeof(*pte));
428                         }
429                         tmp += level_size(level);
430                 }
431                 level++;
432         }
433         /* free pgd */
434         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
435                 free_pgtable_page(domain->pgd);
436                 domain->pgd = NULL;
437         }
438 }
439
440 /* iommu handling */
441 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
442 {
443         struct root_entry *root;
444         unsigned long flags;
445
446         root = (struct root_entry *)alloc_pgtable_page();
447         if (!root)
448                 return -ENOMEM;
449
450         __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
451
452         spin_lock_irqsave(&iommu->lock, flags);
453         iommu->root_entry = root;
454         spin_unlock_irqrestore(&iommu->lock, flags);
455
456         return 0;
457 }
458
/*
 * Poll a register until a condition becomes true.
 *
 * @iommu:  pointer expression whose ->reg is the register base
 * @offset: offset added to that base
 * @op:     accessor invoked as op(address), e.g. readl or dmar_readq
 * @cond:   expression, normally written in terms of @sts, that ends the wait
 * @sts:    lvalue that receives each value read
 *
 * Calls panic() if @cond is still false once more than
 * DMAR_OPERATION_TIMEOUT jiffies have elapsed since the wait began.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed
 * by ';' forms exactly one statement (safe in an unbraced if/else), and
 * @iommu / @offset are parenthesized against operator-precedence surprises.
 */
#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
do {\
        unsigned long start_time = jiffies;\
        while (1) {\
                sts = op ((iommu)->reg + (offset));\
                if (cond)\
                        break;\
                if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
                        panic("DMAR hardware is malfunctioning\n");\
                cpu_relax();\
        }\
} while (0)
471
472 static void iommu_set_root_entry(struct intel_iommu *iommu)
473 {
474         void *addr;
475         u32 cmd, sts;
476         unsigned long flag;
477
478         addr = iommu->root_entry;
479
480         spin_lock_irqsave(&iommu->register_lock, flag);
481         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
482
483         cmd = iommu->gcmd | DMA_GCMD_SRTP;
484         writel(cmd, iommu->reg + DMAR_GCMD_REG);
485
486         /* Make sure hardware complete it */
487         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
488                 readl, (sts & DMA_GSTS_RTPS), sts);
489
490         spin_unlock_irqrestore(&iommu->register_lock, flag);
491 }
492
493 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
494 {
495         u32 val;
496         unsigned long flag;
497
498         if (!cap_rwbf(iommu->cap))
499                 return;
500         val = iommu->gcmd | DMA_GCMD_WBF;
501
502         spin_lock_irqsave(&iommu->register_lock, flag);
503         writel(val, iommu->reg + DMAR_GCMD_REG);
504
505         /* Make sure hardware complete it */
506         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
507                         readl, (!(val & DMA_GSTS_WBFS)), val);
508
509         spin_unlock_irqrestore(&iommu->register_lock, flag);
510 }
511
512 /* return value determine if we need a write buffer flush */
513 static int __iommu_flush_context(struct intel_iommu *iommu,
514         u16 did, u16 source_id, u8 function_mask, u64 type,
515         int non_present_entry_flush)
516 {
517         u64 val = 0;
518         unsigned long flag;
519
520         /*
521          * In the non-present entry flush case, if hardware doesn't cache
522          * non-present entry we do nothing and if hardware cache non-present
523          * entry, we flush entries of domain 0 (the domain id is used to cache
524          * any non-present entries)
525          */
526         if (non_present_entry_flush) {
527                 if (!cap_caching_mode(iommu->cap))
528                         return 1;
529                 else
530                         did = 0;
531         }
532
533         switch (type) {
534         case DMA_CCMD_GLOBAL_INVL:
535                 val = DMA_CCMD_GLOBAL_INVL;
536                 break;
537         case DMA_CCMD_DOMAIN_INVL:
538                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
539                 break;
540         case DMA_CCMD_DEVICE_INVL:
541                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
542                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
543                 break;
544         default:
545                 BUG();
546         }
547         val |= DMA_CCMD_ICC;
548
549         spin_lock_irqsave(&iommu->register_lock, flag);
550         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
551
552         /* Make sure hardware complete it */
553         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
554                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
555
556         spin_unlock_irqrestore(&iommu->register_lock, flag);
557
558         /* flush context entry will implictly flush write buffer */
559         return 0;
560 }
561
562 static int inline iommu_flush_context_global(struct intel_iommu *iommu,
563         int non_present_entry_flush)
564 {
565         return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
566                 non_present_entry_flush);
567 }
568
569 static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
570         int non_present_entry_flush)
571 {
572         return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
573                 non_present_entry_flush);
574 }
575
576 static int inline iommu_flush_context_device(struct intel_iommu *iommu,
577         u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
578 {
579         return __iommu_flush_context(iommu, did, source_id, function_mask,
580                 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
581 }
582
583 /* return value determine if we need a write buffer flush */
584 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
585         u64 addr, unsigned int size_order, u64 type,
586         int non_present_entry_flush)
587 {
588         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
589         u64 val = 0, val_iva = 0;
590         unsigned long flag;
591
592         /*
593          * In the non-present entry flush case, if hardware doesn't cache
594          * non-present entry we do nothing and if hardware cache non-present
595          * entry, we flush entries of domain 0 (the domain id is used to cache
596          * any non-present entries)
597          */
598         if (non_present_entry_flush) {
599                 if (!cap_caching_mode(iommu->cap))
600                         return 1;
601                 else
602                         did = 0;
603         }
604
605         switch (type) {
606         case DMA_TLB_GLOBAL_FLUSH:
607                 /* global flush doesn't need set IVA_REG */
608                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
609                 break;
610         case DMA_TLB_DSI_FLUSH:
611                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
612                 break;
613         case DMA_TLB_PSI_FLUSH:
614                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
615                 /* Note: always flush non-leaf currently */
616                 val_iva = size_order | addr;
617                 break;
618         default:
619                 BUG();
620         }
621         /* Note: set drain read/write */
622 #if 0
623         /*
624          * This is probably to be super secure.. Looks like we can
625          * ignore it without any impact.
626          */
627         if (cap_read_drain(iommu->cap))
628                 val |= DMA_TLB_READ_DRAIN;
629 #endif
630         if (cap_write_drain(iommu->cap))
631                 val |= DMA_TLB_WRITE_DRAIN;
632
633         spin_lock_irqsave(&iommu->register_lock, flag);
634         /* Note: Only uses first TLB reg currently */
635         if (val_iva)
636                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
637         dmar_writeq(iommu->reg + tlb_offset + 8, val);
638
639         /* Make sure hardware complete it */
640         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
641                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
642
643         spin_unlock_irqrestore(&iommu->register_lock, flag);
644
645         /* check IOTLB invalidation granularity */
646         if (DMA_TLB_IAIG(val) == 0)
647                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
648         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
649                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
650                         DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
651         /* flush context entry will implictly flush write buffer */
652         return 0;
653 }
654
655 static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
656         int non_present_entry_flush)
657 {
658         return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
659                 non_present_entry_flush);
660 }
661
662 static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
663         int non_present_entry_flush)
664 {
665         return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
666                 non_present_entry_flush);
667 }
668
669 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
670         u64 addr, unsigned int pages, int non_present_entry_flush)
671 {
672         unsigned int mask;
673
674         BUG_ON(addr & (~PAGE_MASK_4K));
675         BUG_ON(pages == 0);
676
677         /* Fallback to domain selective flush if no PSI support */
678         if (!cap_pgsel_inv(iommu->cap))
679                 return iommu_flush_iotlb_dsi(iommu, did,
680                         non_present_entry_flush);
681
682         /*
683          * PSI requires page size to be 2 ^ x, and the base address is naturally
684          * aligned to the size
685          */
686         mask = ilog2(__roundup_pow_of_two(pages));
687         /* Fallback to domain selective flush if size is too big */
688         if (mask > cap_max_amask_val(iommu->cap))
689                 return iommu_flush_iotlb_dsi(iommu, did,
690                         non_present_entry_flush);
691
692         return __iommu_flush_iotlb(iommu, did, addr, mask,
693                 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
694 }
695
696 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
697 {
698         u32 pmen;
699         unsigned long flags;
700
701         spin_lock_irqsave(&iommu->register_lock, flags);
702         pmen = readl(iommu->reg + DMAR_PMEN_REG);
703         pmen &= ~DMA_PMEN_EPM;
704         writel(pmen, iommu->reg + DMAR_PMEN_REG);
705
706         /* wait for the protected region status bit to clear */
707         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
708                 readl, !(pmen & DMA_PMEN_PRS), pmen);
709
710         spin_unlock_irqrestore(&iommu->register_lock, flags);
711 }
712
713 static int iommu_enable_translation(struct intel_iommu *iommu)
714 {
715         u32 sts;
716         unsigned long flags;
717
718         spin_lock_irqsave(&iommu->register_lock, flags);
719         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
720
721         /* Make sure hardware complete it */
722         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
723                 readl, (sts & DMA_GSTS_TES), sts);
724
725         iommu->gcmd |= DMA_GCMD_TE;
726         spin_unlock_irqrestore(&iommu->register_lock, flags);
727         return 0;
728 }
729
730 static int iommu_disable_translation(struct intel_iommu *iommu)
731 {
732         u32 sts;
733         unsigned long flag;
734
735         spin_lock_irqsave(&iommu->register_lock, flag);
736         iommu->gcmd &= ~DMA_GCMD_TE;
737         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
738
739         /* Make sure hardware complete it */
740         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
741                 readl, (!(sts & DMA_GSTS_TES)), sts);
742
743         spin_unlock_irqrestore(&iommu->register_lock, flag);
744         return 0;
745 }
746
/* iommu interrupt handling. Most of it is MSI-like. */

/*
 * Human-readable text for each fault reason code, indexed by the code.
 * Both the strings and the pointer array are read-only.
 */
static const char * const fault_reason_strings[] =
{
        "Software",
        "Present bit in root entry is clear",
        "Present bit in context entry is clear",
        "Invalid context entry",
        "Access beyond MGAW",
        "PTE Write access is not set",
        "PTE Read access is not set",
        "Next page table ptr is invalid",
        "Root table address invalid",
        "Context table ptr is invalid",
        "non-zero reserved fields in RTP",
        "non-zero reserved fields in CTP",
        "non-zero reserved fields in PTE",
};
/* Largest index that is valid for fault_reason_strings[]. */
#define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
766
767 const char *dmar_get_fault_reason(u8 fault_reason)
768 {
769         if (fault_reason > MAX_FAULT_REASON_IDX)
770                 return "Unknown";
771         else
772                 return fault_reason_strings[fault_reason];
773 }
774
775 void dmar_msi_unmask(unsigned int irq)
776 {
777         struct intel_iommu *iommu = get_irq_data(irq);
778         unsigned long flag;
779
780         /* unmask it */
781         spin_lock_irqsave(&iommu->register_lock, flag);
782         writel(0, iommu->reg + DMAR_FECTL_REG);
783         /* Read a reg to force flush the post write */
784         readl(iommu->reg + DMAR_FECTL_REG);
785         spin_unlock_irqrestore(&iommu->register_lock, flag);
786 }
787
788 void dmar_msi_mask(unsigned int irq)
789 {
790         unsigned long flag;
791         struct intel_iommu *iommu = get_irq_data(irq);
792
793         /* mask it */
794         spin_lock_irqsave(&iommu->register_lock, flag);
795         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
796         /* Read a reg to force flush the post write */
797         readl(iommu->reg + DMAR_FECTL_REG);
798         spin_unlock_irqrestore(&iommu->register_lock, flag);
799 }
800
801 void dmar_msi_write(int irq, struct msi_msg *msg)
802 {
803         struct intel_iommu *iommu = get_irq_data(irq);
804         unsigned long flag;
805
806         spin_lock_irqsave(&iommu->register_lock, flag);
807         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
808         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
809         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
810         spin_unlock_irqrestore(&iommu->register_lock, flag);
811 }
812
813 void dmar_msi_read(int irq, struct msi_msg *msg)
814 {
815         struct intel_iommu *iommu = get_irq_data(irq);
816         unsigned long flag;
817
818         spin_lock_irqsave(&iommu->register_lock, flag);
819         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
820         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
821         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
822         spin_unlock_irqrestore(&iommu->register_lock, flag);
823 }
824
825 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
826                 u8 fault_reason, u16 source_id, u64 addr)
827 {
828         const char *reason;
829
830         reason = dmar_get_fault_reason(fault_reason);
831
832         printk(KERN_ERR
833                 "DMAR:[%s] Request device [%02x:%02x.%d] "
834                 "fault addr %llx \n"
835                 "DMAR:[fault reason %02d] %s\n",
836                 (type ? "DMA Read" : "DMA Write"),
837                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
838                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
839         return 0;
840 }
841
842 #define PRIMARY_FAULT_REG_LEN (16)
843 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
844 {
845         struct intel_iommu *iommu = dev_id;
846         int reg, fault_index;
847         u32 fault_status;
848         unsigned long flag;
849
850         spin_lock_irqsave(&iommu->register_lock, flag);
851         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
852
853         /* TBD: ignore advanced fault log currently */
854         if (!(fault_status & DMA_FSTS_PPF))
855                 goto clear_overflow;
856
857         fault_index = dma_fsts_fault_record_index(fault_status);
858         reg = cap_fault_reg_offset(iommu->cap);
859         while (1) {
860                 u8 fault_reason;
861                 u16 source_id;
862                 u64 guest_addr;
863                 int type;
864                 u32 data;
865
866                 /* highest 32 bits */
867                 data = readl(iommu->reg + reg +
868                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
869                 if (!(data & DMA_FRCD_F))
870                         break;
871
872                 fault_reason = dma_frcd_fault_reason(data);
873                 type = dma_frcd_type(data);
874
875                 data = readl(iommu->reg + reg +
876                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
877                 source_id = dma_frcd_source_id(data);
878
879                 guest_addr = dmar_readq(iommu->reg + reg +
880                                 fault_index * PRIMARY_FAULT_REG_LEN);
881                 guest_addr = dma_frcd_page_addr(guest_addr);
882                 /* clear the fault */
883                 writel(DMA_FRCD_F, iommu->reg + reg +
884                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
885
886                 spin_unlock_irqrestore(&iommu->register_lock, flag);
887
888                 iommu_page_fault_do_one(iommu, type, fault_reason,
889                                 source_id, guest_addr);
890
891                 fault_index++;
892                 if (fault_index > cap_num_fault_regs(iommu->cap))
893                         fault_index = 0;
894                 spin_lock_irqsave(&iommu->register_lock, flag);
895         }
896 clear_overflow:
897         /* clear primary fault overflow */
898         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
899         if (fault_status & DMA_FSTS_PFO)
900                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
901
902         spin_unlock_irqrestore(&iommu->register_lock, flag);
903         return IRQ_HANDLED;
904 }
905
906 int dmar_set_interrupt(struct intel_iommu *iommu)
907 {
908         int irq, ret;
909
910         irq = create_irq();
911         if (!irq) {
912                 printk(KERN_ERR "IOMMU: no free vectors\n");
913                 return -EINVAL;
914         }
915
916         set_irq_data(irq, iommu);
917         iommu->irq = irq;
918
919         ret = arch_setup_dmar_msi(irq);
920         if (ret) {
921                 set_irq_data(irq, NULL);
922                 iommu->irq = 0;
923                 destroy_irq(irq);
924                 return 0;
925         }
926
927         /* Force fault register is cleared */
928         iommu_page_fault(irq, iommu);
929
930         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
931         if (ret)
932                 printk(KERN_ERR "IOMMU: can't request irq\n");
933         return ret;
934 }
935
936 static int iommu_init_domains(struct intel_iommu *iommu)
937 {
938         unsigned long ndomains;
939         unsigned long nlongs;
940
941         ndomains = cap_ndoms(iommu->cap);
942         pr_debug("Number of Domains supportd <%ld>\n", ndomains);
943         nlongs = BITS_TO_LONGS(ndomains);
944
945         /* TBD: there might be 64K domains,
946          * consider other allocation for future chip
947          */
948         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
949         if (!iommu->domain_ids) {
950                 printk(KERN_ERR "Allocating domain id array failed\n");
951                 return -ENOMEM;
952         }
953         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
954                         GFP_KERNEL);
955         if (!iommu->domains) {
956                 printk(KERN_ERR "Allocating domain array failed\n");
957                 kfree(iommu->domain_ids);
958                 return -ENOMEM;
959         }
960
961         /*
962          * if Caching mode is set, then invalid translations are tagged
963          * with domainid 0. Hence we need to pre-allocate it.
964          */
965         if (cap_caching_mode(iommu->cap))
966                 set_bit(0, iommu->domain_ids);
967         return 0;
968 }
969
970 static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
971 {
972         struct intel_iommu *iommu;
973         int ret;
974         int map_size;
975         u32 ver;
976
977         iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
978         if (!iommu)
979                 return NULL;
980         iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
981         if (!iommu->reg) {
982                 printk(KERN_ERR "IOMMU: can't map the region\n");
983                 goto error;
984         }
985         iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
986         iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
987
988         /* the registers might be more than one page */
989         map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
990                 cap_max_fault_reg_offset(iommu->cap));
991         map_size = PAGE_ALIGN_4K(map_size);
992         if (map_size > PAGE_SIZE_4K) {
993                 iounmap(iommu->reg);
994                 iommu->reg = ioremap(drhd->reg_base_addr, map_size);
995                 if (!iommu->reg) {
996                         printk(KERN_ERR "IOMMU: can't map the region\n");
997                         goto error;
998                 }
999         }
1000
1001         ver = readl(iommu->reg + DMAR_VER_REG);
1002         pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
1003                 drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
1004                 iommu->cap, iommu->ecap);
1005         ret = iommu_init_domains(iommu);
1006         if (ret)
1007                 goto error_unmap;
1008         spin_lock_init(&iommu->lock);
1009         spin_lock_init(&iommu->register_lock);
1010
1011         drhd->iommu = iommu;
1012         return iommu;
1013 error_unmap:
1014         iounmap(iommu->reg);
1015 error:
1016         kfree(iommu);
1017         return NULL;
1018 }
1019
1020 static void domain_exit(struct dmar_domain *domain);
1021 static void free_iommu(struct intel_iommu *iommu)
1022 {
1023         struct dmar_domain *domain;
1024         int i;
1025
1026         if (!iommu)
1027                 return;
1028
1029         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1030         for (; i < cap_ndoms(iommu->cap); ) {
1031                 domain = iommu->domains[i];
1032                 clear_bit(i, iommu->domain_ids);
1033                 domain_exit(domain);
1034                 i = find_next_bit(iommu->domain_ids,
1035                         cap_ndoms(iommu->cap), i+1);
1036         }
1037
1038         if (iommu->gcmd & DMA_GCMD_TE)
1039                 iommu_disable_translation(iommu);
1040
1041         if (iommu->irq) {
1042                 set_irq_data(iommu->irq, NULL);
1043                 /* This will mask the irq */
1044                 free_irq(iommu->irq, iommu);
1045                 destroy_irq(iommu->irq);
1046         }
1047
1048         kfree(iommu->domains);
1049         kfree(iommu->domain_ids);
1050
1051         /* free context mapping */
1052         free_context_table(iommu);
1053
1054         if (iommu->reg)
1055                 iounmap(iommu->reg);
1056         kfree(iommu);
1057 }
1058
1059 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1060 {
1061         unsigned long num;
1062         unsigned long ndomains;
1063         struct dmar_domain *domain;
1064         unsigned long flags;
1065
1066         domain = alloc_domain_mem();
1067         if (!domain)
1068                 return NULL;
1069
1070         ndomains = cap_ndoms(iommu->cap);
1071
1072         spin_lock_irqsave(&iommu->lock, flags);
1073         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1074         if (num >= ndomains) {
1075                 spin_unlock_irqrestore(&iommu->lock, flags);
1076                 free_domain_mem(domain);
1077                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1078                 return NULL;
1079         }
1080
1081         set_bit(num, iommu->domain_ids);
1082         domain->id = num;
1083         domain->iommu = iommu;
1084         iommu->domains[num] = domain;
1085         spin_unlock_irqrestore(&iommu->lock, flags);
1086
1087         return domain;
1088 }
1089
1090 static void iommu_free_domain(struct dmar_domain *domain)
1091 {
1092         unsigned long flags;
1093
1094         spin_lock_irqsave(&domain->iommu->lock, flags);
1095         clear_bit(domain->id, domain->iommu->domain_ids);
1096         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1097 }
1098
1099 static struct iova_domain reserved_iova_list;
1100 static struct lock_class_key reserved_alloc_key;
1101 static struct lock_class_key reserved_rbtree_key;
1102
1103 static void dmar_init_reserved_ranges(void)
1104 {
1105         struct pci_dev *pdev = NULL;
1106         struct iova *iova;
1107         int i;
1108         u64 addr, size;
1109
1110         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1111
1112         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1113                 &reserved_alloc_key);
1114         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1115                 &reserved_rbtree_key);
1116
1117         /* IOAPIC ranges shouldn't be accessed by DMA */
1118         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1119                 IOVA_PFN(IOAPIC_RANGE_END));
1120         if (!iova)
1121                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1122
1123         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1124         for_each_pci_dev(pdev) {
1125                 struct resource *r;
1126
1127                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1128                         r = &pdev->resource[i];
1129                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1130                                 continue;
1131                         addr = r->start;
1132                         addr &= PAGE_MASK_4K;
1133                         size = r->end - addr;
1134                         size = PAGE_ALIGN_4K(size);
1135                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1136                                 IOVA_PFN(size + addr) - 1);
1137                         if (!iova)
1138                                 printk(KERN_ERR "Reserve iova failed\n");
1139                 }
1140         }
1141
1142 }
1143
1144 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1145 {
1146         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1147 }
1148
/*
 * guestwidth_to_adjustwidth - round a guest address width up to a table step
 * @gaw: guest address width in bits
 *
 * Returns the smallest width >= @gaw for which (width - 12) is a multiple
 * of 9, capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = gaw;

	/* not on a 9-bit step: move up to the next one */
	if (rem != 0)
		agaw += 9 - rem;

	return agaw > 64 ? 64 : agaw;
}
1162
1163 static int domain_init(struct dmar_domain *domain, int guest_width)
1164 {
1165         struct intel_iommu *iommu;
1166         int adjust_width, agaw;
1167         unsigned long sagaw;
1168
1169         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1170         spin_lock_init(&domain->mapping_lock);
1171
1172         domain_reserve_special_ranges(domain);
1173
1174         /* calculate AGAW */
1175         iommu = domain->iommu;
1176         if (guest_width > cap_mgaw(iommu->cap))
1177                 guest_width = cap_mgaw(iommu->cap);
1178         domain->gaw = guest_width;
1179         adjust_width = guestwidth_to_adjustwidth(guest_width);
1180         agaw = width_to_agaw(adjust_width);
1181         sagaw = cap_sagaw(iommu->cap);
1182         if (!test_bit(agaw, &sagaw)) {
1183                 /* hardware doesn't support it, choose a bigger one */
1184                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1185                 agaw = find_next_bit(&sagaw, 5, agaw);
1186                 if (agaw >= 5)
1187                         return -ENODEV;
1188         }
1189         domain->agaw = agaw;
1190         INIT_LIST_HEAD(&domain->devices);
1191
1192         /* always allocate the top pgd */
1193         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1194         if (!domain->pgd)
1195                 return -ENOMEM;
1196         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1197         return 0;
1198 }
1199
1200 static void domain_exit(struct dmar_domain *domain)
1201 {
1202         u64 end;
1203
1204         /* Domain 0 is reserved, so dont process it */
1205         if (!domain)
1206                 return;
1207
1208         domain_remove_dev_info(domain);
1209         /* destroy iovas */
1210         put_iova_domain(&domain->iovad);
1211         end = DOMAIN_MAX_ADDR(domain->gaw);
1212         end = end & (~PAGE_MASK_4K);
1213
1214         /* clear ptes */
1215         dma_pte_clear_range(domain, 0, end);
1216
1217         /* free page tables */
1218         dma_pte_free_pagetable(domain, 0, end);
1219
1220         iommu_free_domain(domain);
1221         free_domain_mem(domain);
1222 }
1223
1224 static int domain_context_mapping_one(struct dmar_domain *domain,
1225                 u8 bus, u8 devfn)
1226 {
1227         struct context_entry *context;
1228         struct intel_iommu *iommu = domain->iommu;
1229         unsigned long flags;
1230
1231         pr_debug("Set context mapping for %02x:%02x.%d\n",
1232                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1233         BUG_ON(!domain->pgd);
1234         context = device_to_context_entry(iommu, bus, devfn);
1235         if (!context)
1236                 return -ENOMEM;
1237         spin_lock_irqsave(&iommu->lock, flags);
1238         if (context_present(*context)) {
1239                 spin_unlock_irqrestore(&iommu->lock, flags);
1240                 return 0;
1241         }
1242
1243         context_set_domain_id(*context, domain->id);
1244         context_set_address_width(*context, domain->agaw);
1245         context_set_address_root(*context, virt_to_phys(domain->pgd));
1246         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1247         context_set_fault_enable(*context);
1248         context_set_present(*context);
1249         __iommu_flush_cache(iommu, context, sizeof(*context));
1250
1251         /* it's a non-present to present mapping */
1252         if (iommu_flush_context_device(iommu, domain->id,
1253                         (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1254                 iommu_flush_write_buffer(iommu);
1255         else
1256                 iommu_flush_iotlb_dsi(iommu, 0, 0);
1257         spin_unlock_irqrestore(&iommu->lock, flags);
1258         return 0;
1259 }
1260
1261 static int
1262 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1263 {
1264         int ret;
1265         struct pci_dev *tmp, *parent;
1266
1267         ret = domain_context_mapping_one(domain, pdev->bus->number,
1268                 pdev->devfn);
1269         if (ret)
1270                 return ret;
1271
1272         /* dependent device mapping */
1273         tmp = pci_find_upstream_pcie_bridge(pdev);
1274         if (!tmp)
1275                 return 0;
1276         /* Secondary interface's bus number and devfn 0 */
1277         parent = pdev->bus->self;
1278         while (parent != tmp) {
1279                 ret = domain_context_mapping_one(domain, parent->bus->number,
1280                         parent->devfn);
1281                 if (ret)
1282                         return ret;
1283                 parent = parent->bus->self;
1284         }
1285         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1286                 return domain_context_mapping_one(domain,
1287                         tmp->subordinate->number, 0);
1288         else /* this is a legacy PCI bridge */
1289                 return domain_context_mapping_one(domain,
1290                         tmp->bus->number, tmp->devfn);
1291 }
1292
1293 static int domain_context_mapped(struct dmar_domain *domain,
1294         struct pci_dev *pdev)
1295 {
1296         int ret;
1297         struct pci_dev *tmp, *parent;
1298
1299         ret = device_context_mapped(domain->iommu,
1300                 pdev->bus->number, pdev->devfn);
1301         if (!ret)
1302                 return ret;
1303         /* dependent device mapping */
1304         tmp = pci_find_upstream_pcie_bridge(pdev);
1305         if (!tmp)
1306                 return ret;
1307         /* Secondary interface's bus number and devfn 0 */
1308         parent = pdev->bus->self;
1309         while (parent != tmp) {
1310                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1311                         parent->devfn);
1312                 if (!ret)
1313                         return ret;
1314                 parent = parent->bus->self;
1315         }
1316         if (tmp->is_pcie)
1317                 return device_context_mapped(domain->iommu,
1318                         tmp->subordinate->number, 0);
1319         else
1320                 return device_context_mapped(domain->iommu,
1321                         tmp->bus->number, tmp->devfn);
1322 }
1323
1324 static int
1325 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1326                         u64 hpa, size_t size, int prot)
1327 {
1328         u64 start_pfn, end_pfn;
1329         struct dma_pte *pte;
1330         int index;
1331
1332         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1333                 return -EINVAL;
1334         iova &= PAGE_MASK_4K;
1335         start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1336         end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1337         index = 0;
1338         while (start_pfn < end_pfn) {
1339                 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1340                 if (!pte)
1341                         return -ENOMEM;
1342                 /* We don't need lock here, nobody else
1343                  * touches the iova range
1344                  */
1345                 BUG_ON(dma_pte_addr(*pte));
1346                 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1347                 dma_set_pte_prot(*pte, prot);
1348                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1349                 start_pfn++;
1350                 index++;
1351         }
1352         return 0;
1353 }
1354
1355 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1356 {
1357         clear_context_table(domain->iommu, bus, devfn);
1358         iommu_flush_context_global(domain->iommu, 0);
1359         iommu_flush_iotlb_global(domain->iommu, 0);
1360 }
1361
1362 static void domain_remove_dev_info(struct dmar_domain *domain)
1363 {
1364         struct device_domain_info *info;
1365         unsigned long flags;
1366
1367         spin_lock_irqsave(&device_domain_lock, flags);
1368         while (!list_empty(&domain->devices)) {
1369                 info = list_entry(domain->devices.next,
1370                         struct device_domain_info, link);
1371                 list_del(&info->link);
1372                 list_del(&info->global);
1373                 if (info->dev)
1374                         info->dev->dev.archdata.iommu = NULL;
1375                 spin_unlock_irqrestore(&device_domain_lock, flags);
1376
1377                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1378                 free_devinfo_mem(info);
1379
1380                 spin_lock_irqsave(&device_domain_lock, flags);
1381         }
1382         spin_unlock_irqrestore(&device_domain_lock, flags);
1383 }
1384
1385 /*
1386  * find_domain
1387  * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1388  */
1389 struct dmar_domain *
1390 find_domain(struct pci_dev *pdev)
1391 {
1392         struct device_domain_info *info;
1393
1394         /* No lock here, assumes no domain exit in normal case */
1395         info = pdev->dev.archdata.iommu;
1396         if (info)
1397                 return info->domain;
1398         return NULL;
1399 }
1400
1401 static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
1402      struct pci_dev *dev)
1403 {
1404         int index;
1405
1406         while (dev) {
1407                 for (index = 0; index < cnt; index ++)
1408                         if (dev == devices[index])
1409                                 return 1;
1410
1411                 /* Check our parent */
1412                 dev = dev->bus->self;
1413         }
1414
1415         return 0;
1416 }
1417
1418 static struct dmar_drhd_unit *
1419 dmar_find_matched_drhd_unit(struct pci_dev *dev)
1420 {
1421         struct dmar_drhd_unit *drhd = NULL;
1422
1423         list_for_each_entry(drhd, &dmar_drhd_units, list) {
1424                 if (drhd->include_all || dmar_pci_device_match(drhd->devices,
1425                                                 drhd->devices_cnt, dev))
1426                         return drhd;
1427         }
1428
1429         return NULL;
1430 }
1431
1432 /* domain is initialized */
1433 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1434 {
1435         struct dmar_domain *domain, *found = NULL;
1436         struct intel_iommu *iommu;
1437         struct dmar_drhd_unit *drhd;
1438         struct device_domain_info *info, *tmp;
1439         struct pci_dev *dev_tmp;
1440         unsigned long flags;
1441         int bus = 0, devfn = 0;
1442
1443         domain = find_domain(pdev);
1444         if (domain)
1445                 return domain;
1446
1447         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1448         if (dev_tmp) {
1449                 if (dev_tmp->is_pcie) {
1450                         bus = dev_tmp->subordinate->number;
1451                         devfn = 0;
1452                 } else {
1453                         bus = dev_tmp->bus->number;
1454                         devfn = dev_tmp->devfn;
1455                 }
1456                 spin_lock_irqsave(&device_domain_lock, flags);
1457                 list_for_each_entry(info, &device_domain_list, global) {
1458                         if (info->bus == bus && info->devfn == devfn) {
1459                                 found = info->domain;
1460                                 break;
1461                         }
1462                 }
1463                 spin_unlock_irqrestore(&device_domain_lock, flags);
1464                 /* pcie-pci bridge already has a domain, uses it */
1465                 if (found) {
1466                         domain = found;
1467                         goto found_domain;
1468                 }
1469         }
1470
1471         /* Allocate new domain for the device */
1472         drhd = dmar_find_matched_drhd_unit(pdev);
1473         if (!drhd) {
1474                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1475                         pci_name(pdev));
1476                 return NULL;
1477         }
1478         iommu = drhd->iommu;
1479
1480         domain = iommu_alloc_domain(iommu);
1481         if (!domain)
1482                 goto error;
1483
1484         if (domain_init(domain, gaw)) {
1485                 domain_exit(domain);
1486                 goto error;
1487         }
1488
1489         /* register pcie-to-pci device */
1490         if (dev_tmp) {
1491                 info = alloc_devinfo_mem();
1492                 if (!info) {
1493                         domain_exit(domain);
1494                         goto error;
1495                 }
1496                 info->bus = bus;
1497                 info->devfn = devfn;
1498                 info->dev = NULL;
1499                 info->domain = domain;
1500                 /* This domain is shared by devices under p2p bridge */
1501                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1502
1503                 /* pcie-to-pci bridge already has a domain, uses it */
1504                 found = NULL;
1505                 spin_lock_irqsave(&device_domain_lock, flags);
1506                 list_for_each_entry(tmp, &device_domain_list, global) {
1507                         if (tmp->bus == bus && tmp->devfn == devfn) {
1508                                 found = tmp->domain;
1509                                 break;
1510                         }
1511                 }
1512                 if (found) {
1513                         free_devinfo_mem(info);
1514                         domain_exit(domain);
1515                         domain = found;
1516                 } else {
1517                         list_add(&info->link, &domain->devices);
1518                         list_add(&info->global, &device_domain_list);
1519                 }
1520                 spin_unlock_irqrestore(&device_domain_lock, flags);
1521         }
1522
1523 found_domain:
1524         info = alloc_devinfo_mem();
1525         if (!info)
1526                 goto error;
1527         info->bus = pdev->bus->number;
1528         info->devfn = pdev->devfn;
1529         info->dev = pdev;
1530         info->domain = domain;
1531         spin_lock_irqsave(&device_domain_lock, flags);
1532         /* somebody is fast */
1533         found = find_domain(pdev);
1534         if (found != NULL) {
1535                 spin_unlock_irqrestore(&device_domain_lock, flags);
1536                 if (found != domain) {
1537                         domain_exit(domain);
1538                         domain = found;
1539                 }
1540                 free_devinfo_mem(info);
1541                 return domain;
1542         }
1543         list_add(&info->link, &domain->devices);
1544         list_add(&info->global, &device_domain_list);
1545         pdev->dev.archdata.iommu = info;
1546         spin_unlock_irqrestore(&device_domain_lock, flags);
1547         return domain;
1548 error:
1549         /* recheck it here, maybe others set it */
1550         return find_domain(pdev);
1551 }
1552
1553 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1554 {
1555         struct dmar_domain *domain;
1556         unsigned long size;
1557         u64 base;
1558         int ret;
1559
1560         printk(KERN_INFO
1561                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1562                 pci_name(pdev), start, end);
1563         /* page table init */
1564         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1565         if (!domain)
1566                 return -ENOMEM;
1567
1568         /* The address might not be aligned */
1569         base = start & PAGE_MASK_4K;
1570         size = end - base;
1571         size = PAGE_ALIGN_4K(size);
1572         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1573                         IOVA_PFN(base + size) - 1)) {
1574                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1575                 ret = -ENOMEM;
1576                 goto error;
1577         }
1578
1579         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1580                 size, base, pci_name(pdev));
1581         /*
1582          * RMRR range might have overlap with physical memory range,
1583          * clear it first
1584          */
1585         dma_pte_clear_range(domain, base, base + size);
1586
1587         ret = domain_page_mapping(domain, base, base, size,
1588                 DMA_PTE_READ|DMA_PTE_WRITE);
1589         if (ret)
1590                 goto error;
1591
1592         /* context entry init */
1593         ret = domain_context_mapping(domain, pdev);
1594         if (!ret)
1595                 return 0;
1596 error:
1597         domain_exit(domain);
1598         return ret;
1599
1600 }
1601
1602 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1603         struct pci_dev *pdev)
1604 {
1605         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1606                 return 0;
1607         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1608                 rmrr->end_address + 1);
1609 }
1610
#ifdef CONFIG_DMAR_GFX_WA
extern int arch_get_ram_range(int slot, u64 *addr, u64 *size);

/*
 * iommu_prepare_gfx_mapping - identity-map all RAM for graphics devices
 *
 * For every display-class PCI device that is not tagged with
 * DUMMY_DEVICE_DOMAIN_INFO, each RAM range reported by arch_get_ram_range()
 * is identity mapped.  The first failure for a device is logged and ends
 * the work for that device; the scan then continues with the next one.
 */
static void __init iommu_prepare_gfx_mapping(void)
{
	struct pci_dev *pdev = NULL;
	u64 base, size;
	int slot;
	int ret;

	for_each_pci_dev(pdev) {
		if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
			continue;
		if (!IS_GFX_DEVICE(pdev))
			continue;

		printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
			pci_name(pdev));

		ret = 0;
		for (slot = arch_get_ram_range(0, &base, &size);
		     slot >= 0;
		     slot = arch_get_ram_range(slot, &base, &size)) {
			ret = iommu_prepare_identity_map(pdev,
					base, base + size);
			if (ret)
				break;
		}

		if (ret)
			printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
	}
}
#endif
1640
#ifdef CONFIG_DMAR_FLOPPY_WA
/*
 * iommu_prepare_isa - identity-map the low 16MB for the ISA/LPC bridge
 *
 * Looks up the first ISA bridge class device and identity maps
 * [0, 16MB) for it.  Nothing happens when no such device exists; a mapping
 * failure is only logged.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);

	/* the range in the message matches what was requested above */
	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
			"floppy might not work\n");

}
#else
/* Floppy workaround disabled: nothing to prepare. */
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_DMAR_FLOPPY_WA */
1665
1666 int __init init_dmars(void)
1667 {
1668         struct dmar_drhd_unit *drhd;
1669         struct dmar_rmrr_unit *rmrr;
1670         struct pci_dev *pdev;
1671         struct intel_iommu *iommu;
1672         int ret, unit = 0;
1673
1674         /*
1675          * for each drhd
1676          *    allocate root
1677          *    initialize and program root entry to not present
1678          * endfor
1679          */
1680         for_each_drhd_unit(drhd) {
1681                 if (drhd->ignored)
1682                         continue;
1683                 iommu = alloc_iommu(drhd);
1684                 if (!iommu) {
1685                         ret = -ENOMEM;
1686                         goto error;
1687                 }
1688
1689                 /*
1690                  * TBD:
1691                  * we could share the same root & context tables
1692                  * amoung all IOMMU's. Need to Split it later.
1693                  */
1694                 ret = iommu_alloc_root_entry(iommu);
1695                 if (ret) {
1696                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1697                         goto error;
1698                 }
1699         }
1700
1701         /*
1702          * For each rmrr
1703          *   for each dev attached to rmrr
1704          *   do
1705          *     locate drhd for dev, alloc domain for dev
1706          *     allocate free domain
1707          *     allocate page table entries for rmrr
1708          *     if context not allocated for bus
1709          *           allocate and init context
1710          *           set present in root table for this bus
1711          *     init context with domain, translation etc
1712          *    endfor
1713          * endfor
1714          */
1715         for_each_rmrr_units(rmrr) {
1716                 int i;
1717                 for (i = 0; i < rmrr->devices_cnt; i++) {
1718                         pdev = rmrr->devices[i];
1719                         /* some BIOS lists non-exist devices in DMAR table */
1720                         if (!pdev)
1721                                 continue;
1722                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1723                         if (ret)
1724                                 printk(KERN_ERR
1725                                  "IOMMU: mapping reserved region failed\n");
1726                 }
1727         }
1728
1729         iommu_prepare_gfx_mapping();
1730
1731         iommu_prepare_isa();
1732
1733         /*
1734          * for each drhd
1735          *   enable fault log
1736          *   global invalidate context cache
1737          *   global invalidate iotlb
1738          *   enable translation
1739          */
1740         for_each_drhd_unit(drhd) {
1741                 if (drhd->ignored)
1742                         continue;
1743                 iommu = drhd->iommu;
1744                 sprintf (iommu->name, "dmar%d", unit++);
1745
1746                 iommu_flush_write_buffer(iommu);
1747
1748                 ret = dmar_set_interrupt(iommu);
1749                 if (ret)
1750                         goto error;
1751
1752                 iommu_set_root_entry(iommu);
1753
1754                 iommu_flush_context_global(iommu, 0);
1755                 iommu_flush_iotlb_global(iommu, 0);
1756
1757                 iommu_disable_protect_mem_regions(iommu);
1758
1759                 ret = iommu_enable_translation(iommu);
1760                 if (ret)
1761                         goto error;
1762         }
1763
1764         return 0;
1765 error:
1766         for_each_drhd_unit(drhd) {
1767                 if (drhd->ignored)
1768                         continue;
1769                 iommu = drhd->iommu;
1770                 free_iommu(iommu);
1771         }
1772         return ret;
1773 }
1774
1775 static inline u64 aligned_size(u64 host_addr, size_t size)
1776 {
1777         u64 addr;
1778         addr = (host_addr & (~PAGE_MASK_4K)) + size;
1779         return PAGE_ALIGN_4K(addr);
1780 }
1781
1782 struct iova *
1783 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1784 {
1785         struct iova *piova;
1786
1787         /* Make sure it's in range */
1788         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1789         if (!size || (IOVA_START_ADDR + size > end))
1790                 return NULL;
1791
1792         piova = alloc_iova(&domain->iovad,
1793                         size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1794         return piova;
1795 }
1796
1797 static struct iova *
1798 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1799                 size_t size)
1800 {
1801         struct pci_dev *pdev = to_pci_dev(dev);
1802         struct iova *iova = NULL;
1803
1804         if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1805                 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1806         } else  {
1807                 /*
1808                  * First try to allocate an io virtual address in
1809                  * DMA_32BIT_MASK and if that fails then try allocating
1810                  * from higher range
1811                  */
1812                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1813                 if (!iova)
1814                         iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1815         }
1816
1817         if (!iova) {
1818                 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
1819                 return NULL;
1820         }
1821
1822         return iova;
1823 }
1824
1825 static struct dmar_domain *
1826 get_valid_domain_for_dev(struct pci_dev *pdev)
1827 {
1828         struct dmar_domain *domain;
1829         int ret;
1830
1831         domain = get_domain_for_dev(pdev,
1832                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1833         if (!domain) {
1834                 printk(KERN_ERR
1835                         "Allocating domain for %s failed", pci_name(pdev));
1836                 return NULL;
1837         }
1838
1839         /* make sure context mapping is ok */
1840         if (unlikely(!domain_context_mapped(domain, pdev))) {
1841                 ret = domain_context_mapping(domain, pdev);
1842                 if (ret) {
1843                         printk(KERN_ERR
1844                                 "Domain context map for %s failed",
1845                                 pci_name(pdev));
1846                         return NULL;
1847                 }
1848         }
1849
1850         return domain;
1851 }
1852
1853 static dma_addr_t intel_map_single(struct device *hwdev, void *addr,
1854         size_t size, int dir)
1855 {
1856         struct pci_dev *pdev = to_pci_dev(hwdev);
1857         int ret;
1858         struct dmar_domain *domain;
1859         unsigned long start_addr;
1860         struct iova *iova;
1861         int prot = 0;
1862
1863         BUG_ON(dir == DMA_NONE);
1864         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1865                 return virt_to_bus(addr);
1866
1867         domain = get_valid_domain_for_dev(pdev);
1868         if (!domain)
1869                 return 0;
1870
1871         addr = (void *)virt_to_phys(addr);
1872         size = aligned_size((u64)addr, size);
1873
1874         iova = __intel_alloc_iova(hwdev, domain, size);
1875         if (!iova)
1876                 goto error;
1877
1878         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1879
1880         /*
1881          * Check if DMAR supports zero-length reads on write only
1882          * mappings..
1883          */
1884         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
1885                         !cap_zlr(domain->iommu->cap))
1886                 prot |= DMA_PTE_READ;
1887         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1888                 prot |= DMA_PTE_WRITE;
1889         /*
1890          * addr - (addr + size) might be partial page, we should map the whole
1891          * page.  Note: if two part of one page are separately mapped, we
1892          * might have two guest_addr mapping to the same host addr, but this
1893          * is not a big problem
1894          */
1895         ret = domain_page_mapping(domain, start_addr,
1896                 ((u64)addr) & PAGE_MASK_4K, size, prot);
1897         if (ret)
1898                 goto error;
1899
1900         pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1901                 pci_name(pdev), size, (u64)addr,
1902                 size, (u64)start_addr, dir);
1903
1904         /* it's a non-present to present mapping */
1905         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1906                         start_addr, size >> PAGE_SHIFT_4K, 1);
1907         if (ret)
1908                 iommu_flush_write_buffer(domain->iommu);
1909
1910         return (start_addr + ((u64)addr & (~PAGE_MASK_4K)));
1911
1912 error:
1913         if (iova)
1914                 __free_iova(&domain->iovad, iova);
1915         printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
1916                 pci_name(pdev), size, (u64)addr, dir);
1917         return 0;
1918 }
1919
1920 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1921         size_t size, int dir)
1922 {
1923         struct pci_dev *pdev = to_pci_dev(dev);
1924         struct dmar_domain *domain;
1925         unsigned long start_addr;
1926         struct iova *iova;
1927
1928         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1929                 return;
1930         domain = find_domain(pdev);
1931         BUG_ON(!domain);
1932
1933         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1934         if (!iova)
1935                 return;
1936
1937         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1938         size = aligned_size((u64)dev_addr, size);
1939
1940         pr_debug("Device %s unmapping: %lx@%llx\n",
1941                 pci_name(pdev), size, (u64)start_addr);
1942
1943         /*  clear the whole page */
1944         dma_pte_clear_range(domain, start_addr, start_addr + size);
1945         /* free page tables */
1946         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1947
1948         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
1949                         size >> PAGE_SHIFT_4K, 0))
1950                 iommu_flush_write_buffer(domain->iommu);
1951
1952         /* free iova */
1953         __free_iova(&domain->iovad, iova);
1954 }
1955
1956 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
1957                        dma_addr_t *dma_handle, gfp_t flags)
1958 {
1959         void *vaddr;
1960         int order;
1961
1962         size = PAGE_ALIGN_4K(size);
1963         order = get_order(size);
1964         flags &= ~(GFP_DMA | GFP_DMA32);
1965
1966         vaddr = (void *)__get_free_pages(flags, order);
1967         if (!vaddr)
1968                 return NULL;
1969         memset(vaddr, 0, size);
1970
1971         *dma_handle = intel_map_single(hwdev, vaddr, size, DMA_BIDIRECTIONAL);
1972         if (*dma_handle)
1973                 return vaddr;
1974         free_pages((unsigned long)vaddr, order);
1975         return NULL;
1976 }
1977
1978 static void intel_free_coherent(struct device *hwdev, size_t size,
1979         void *vaddr, dma_addr_t dma_handle)
1980 {
1981         int order;
1982
1983         size = PAGE_ALIGN_4K(size);
1984         order = get_order(size);
1985
1986         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
1987         free_pages((unsigned long)vaddr, order);
1988 }
1989
1990 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
1991 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
1992         int nelems, int dir)
1993 {
1994         int i;
1995         struct pci_dev *pdev = to_pci_dev(hwdev);
1996         struct dmar_domain *domain;
1997         unsigned long start_addr;
1998         struct iova *iova;
1999         size_t size = 0;
2000         void *addr;
2001         struct scatterlist *sg;
2002
2003         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2004                 return;
2005
2006         domain = find_domain(pdev);
2007
2008         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2009         if (!iova)
2010                 return;
2011         for_each_sg(sglist, sg, nelems, i) {
2012                 addr = SG_ENT_VIRT_ADDRESS(sg);
2013                 size += aligned_size((u64)addr, sg->length);
2014         }
2015
2016         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2017
2018         /*  clear the whole page */
2019         dma_pte_clear_range(domain, start_addr, start_addr + size);
2020         /* free page tables */
2021         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2022
2023         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2024                         size >> PAGE_SHIFT_4K, 0))
2025                 iommu_flush_write_buffer(domain->iommu);
2026
2027         /* free iova */
2028         __free_iova(&domain->iovad, iova);
2029 }
2030
2031 static int intel_nontranslate_map_sg(struct device *hddev,
2032         struct scatterlist *sglist, int nelems, int dir)
2033 {
2034         int i;
2035         struct scatterlist *sg;
2036
2037         for_each_sg(sglist, sg, nelems, i) {
2038                 BUG_ON(!sg_page(sg));
2039                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2040                 sg->dma_length = sg->length;
2041         }
2042         return nelems;
2043 }
2044
2045 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2046                                 int nelems, int dir)
2047 {
2048         void *addr;
2049         int i;
2050         struct pci_dev *pdev = to_pci_dev(hwdev);
2051         struct dmar_domain *domain;
2052         size_t size = 0;
2053         int prot = 0;
2054         size_t offset = 0;
2055         struct iova *iova = NULL;
2056         int ret;
2057         struct scatterlist *sg;
2058         unsigned long start_addr;
2059
2060         BUG_ON(dir == DMA_NONE);
2061         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2062                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2063
2064         domain = get_valid_domain_for_dev(pdev);
2065         if (!domain)
2066                 return 0;
2067
2068         for_each_sg(sglist, sg, nelems, i) {
2069                 addr = SG_ENT_VIRT_ADDRESS(sg);
2070                 addr = (void *)virt_to_phys(addr);
2071                 size += aligned_size((u64)addr, sg->length);
2072         }
2073
2074         iova = __intel_alloc_iova(hwdev, domain, size);
2075         if (!iova) {
2076                 sglist->dma_length = 0;
2077                 return 0;
2078         }
2079
2080         /*
2081          * Check if DMAR supports zero-length reads on write only
2082          * mappings..
2083          */
2084         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2085                         !cap_zlr(domain->iommu->cap))
2086                 prot |= DMA_PTE_READ;
2087         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2088                 prot |= DMA_PTE_WRITE;
2089
2090         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2091         offset = 0;
2092         for_each_sg(sglist, sg, nelems, i) {
2093                 addr = SG_ENT_VIRT_ADDRESS(sg);
2094                 addr = (void *)virt_to_phys(addr);
2095                 size = aligned_size((u64)addr, sg->length);
2096                 ret = domain_page_mapping(domain, start_addr + offset,
2097                         ((u64)addr) & PAGE_MASK_4K,
2098                         size, prot);
2099                 if (ret) {
2100                         /*  clear the page */
2101                         dma_pte_clear_range(domain, start_addr,
2102                                   start_addr + offset);
2103                         /* free page tables */
2104                         dma_pte_free_pagetable(domain, start_addr,
2105                                   start_addr + offset);
2106                         /* free iova */
2107                         __free_iova(&domain->iovad, iova);
2108                         return 0;
2109                 }
2110                 sg->dma_address = start_addr + offset +
2111                                 ((u64)addr & (~PAGE_MASK_4K));
2112                 sg->dma_length = sg->length;
2113                 offset += size;
2114         }
2115
2116         /* it's a non-present to present mapping */
2117         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2118                         start_addr, offset >> PAGE_SHIFT_4K, 1))
2119                 iommu_flush_write_buffer(domain->iommu);
2120         return nelems;
2121 }
2122
2123 static struct dma_mapping_ops intel_dma_ops = {
2124         .alloc_coherent = intel_alloc_coherent,
2125         .free_coherent = intel_free_coherent,
2126         .map_single = intel_map_single,
2127         .unmap_single = intel_unmap_single,
2128         .map_sg = intel_map_sg,
2129         .unmap_sg = intel_unmap_sg,
2130 };
2131
2132 static inline int iommu_domain_cache_init(void)
2133 {
2134         int ret = 0;
2135
2136         iommu_domain_cache = kmem_cache_create("iommu_domain",
2137                                          sizeof(struct dmar_domain),
2138                                          0,
2139                                          SLAB_HWCACHE_ALIGN,
2140
2141                                          NULL);
2142         if (!iommu_domain_cache) {
2143                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2144                 ret = -ENOMEM;
2145         }
2146
2147         return ret;
2148 }
2149
2150 static inline int iommu_devinfo_cache_init(void)
2151 {
2152         int ret = 0;
2153
2154         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2155                                          sizeof(struct device_domain_info),
2156                                          0,
2157                                          SLAB_HWCACHE_ALIGN,
2158
2159                                          NULL);
2160         if (!iommu_devinfo_cache) {
2161                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2162                 ret = -ENOMEM;
2163         }
2164
2165         return ret;
2166 }
2167
2168 static inline int iommu_iova_cache_init(void)
2169 {
2170         int ret = 0;
2171
2172         iommu_iova_cache = kmem_cache_create("iommu_iova",
2173                                          sizeof(struct iova),
2174                                          0,
2175                                          SLAB_HWCACHE_ALIGN,
2176
2177                                          NULL);
2178         if (!iommu_iova_cache) {
2179                 printk(KERN_ERR "Couldn't create iova cache\n");
2180                 ret = -ENOMEM;
2181         }
2182
2183         return ret;
2184 }
2185
2186 static int __init iommu_init_mempool(void)
2187 {
2188         int ret;
2189         ret = iommu_iova_cache_init();
2190         if (ret)
2191                 return ret;
2192
2193         ret = iommu_domain_cache_init();
2194         if (ret)
2195                 goto domain_error;
2196
2197         ret = iommu_devinfo_cache_init();
2198         if (!ret)
2199                 return ret;
2200
2201         kmem_cache_destroy(iommu_domain_cache);
2202 domain_error:
2203         kmem_cache_destroy(iommu_iova_cache);
2204
2205         return -ENOMEM;
2206 }
2207
2208 static void __init iommu_exit_mempool(void)
2209 {
2210         kmem_cache_destroy(iommu_devinfo_cache);
2211         kmem_cache_destroy(iommu_domain_cache);
2212         kmem_cache_destroy(iommu_iova_cache);
2213
2214 }
2215
2216 void __init detect_intel_iommu(void)
2217 {
2218         if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2219                 return;
2220         if (early_dmar_detect()) {
2221                 iommu_detected = 1;
2222         }
2223 }
2224
2225 static void __init init_no_remapping_devices(void)
2226 {
2227         struct dmar_drhd_unit *drhd;
2228
2229         for_each_drhd_unit(drhd) {
2230                 if (!drhd->include_all) {
2231                         int i;
2232                         for (i = 0; i < drhd->devices_cnt; i++)
2233                                 if (drhd->devices[i] != NULL)
2234                                         break;
2235                         /* ignore DMAR unit if no pci devices exist */
2236                         if (i == drhd->devices_cnt)
2237                                 drhd->ignored = 1;
2238                 }
2239         }
2240
2241         if (dmar_map_gfx)
2242                 return;
2243
2244         for_each_drhd_unit(drhd) {
2245                 int i;
2246                 if (drhd->ignored || drhd->include_all)
2247                         continue;
2248
2249                 for (i = 0; i < drhd->devices_cnt; i++)
2250                         if (drhd->devices[i] &&
2251                                 !IS_GFX_DEVICE(drhd->devices[i]))
2252                                 break;
2253
2254                 if (i < drhd->devices_cnt)
2255                         continue;
2256
2257                 /* bypass IOMMU if it is just for gfx devices */
2258                 drhd->ignored = 1;
2259                 for (i = 0; i < drhd->devices_cnt; i++) {
2260                         if (!drhd->devices[i])
2261                                 continue;
2262                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2263                 }
2264         }
2265 }
2266
2267 int __init intel_iommu_init(void)
2268 {
2269         int ret = 0;
2270
2271         if (no_iommu || swiotlb || dmar_disabled)
2272                 return -ENODEV;
2273
2274         if (dmar_table_init())
2275                 return  -ENODEV;
2276
2277         iommu_init_mempool();
2278         dmar_init_reserved_ranges();
2279
2280         init_no_remapping_devices();
2281
2282         ret = init_dmars();
2283         if (ret) {
2284                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2285                 put_iova_domain(&reserved_iova_list);
2286                 iommu_exit_mempool();
2287                 return ret;
2288         }
2289         printk(KERN_INFO
2290         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2291
2292         force_iommu = 1;
2293         dma_ops = &intel_dma_ops;
2294         return 0;
2295 }
2296