/*
 * Commit context: "KVM: MMU: filter out the mmio pfn from the fault pfn"
 * File: arch/x86/kvm/mmu.c (linux-2.6.git)
 */
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * MMU support
8  *
9  * Copyright (C) 2006 Qumranet, Inc.
10  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11  *
12  * Authors:
13  *   Yaniv Kamay  <yaniv@qumranet.com>
14  *   Avi Kivity   <avi@qumranet.com>
15  *
16  * This work is licensed under the terms of the GNU GPL, version 2.  See
17  * the COPYING file in the top-level directory.
18  *
19  */
20
#include "irq.h"
#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
26
27 #include <linux/kvm_host.h>
28 #include <linux/types.h>
29 #include <linux/string.h>
30 #include <linux/mm.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/swap.h>
34 #include <linux/hugetlb.h>
35 #include <linux/compiler.h>
36 #include <linux/srcu.h>
37 #include <linux/slab.h>
38 #include <linux/uaccess.h>
39
40 #include <asm/page.h>
41 #include <asm/cmpxchg.h>
42 #include <asm/io.h>
43 #include <asm/vmx.h>
44
45 /*
46  * When setting this variable to true it enables Two-Dimensional-Paging
47  * where the hardware walks 2 page tables:
48  * 1. the guest-virtual to guest-physical
49  * 2. while doing 1. it walks guest-physical to host-physical
50  * If the hardware supports that we don't need to do shadow paging.
51  */
52 bool tdp_enabled = false;
53
/* Hook points at which the (optional) MMU auditor can be invoked. */
enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC
};
62
/* Human-readable names indexed by the AUDIT_* enum above. */
char *audit_point_name[] = {
	"pre page fault",
	"post page fault",
	"pre pte write",
	"post pte write",
	"pre sync",
	"post sync"
};
71
72 #undef MMU_DEBUG
73
74 #ifdef MMU_DEBUG
75
76 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
77 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
78
79 #else
80
81 #define pgprintk(x...) do { } while (0)
82 #define rmap_printk(x...) do { } while (0)
83
84 #endif
85
#ifdef MMU_DEBUG
/*
 * Runtime gate for pgprintk()/rmap_printk().  Declared bool because it is
 * exposed with module_param(..., bool, ...): registering a bool parameter
 * backed by an int is a type mismatch that newer kernels reject.
 */
static bool dbg = 0;
module_param(dbg, bool, 0644);
#endif
90
91 static int oos_shadow = 1;
92 module_param(oos_shadow, bool, 0644);
93
94 #ifndef MMU_DEBUG
95 #define ASSERT(x) do { } while (0)
96 #else
97 #define ASSERT(x)                                                       \
98         if (!(x)) {                                                     \
99                 printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
100                        __FILE__, __LINE__, #x);                         \
101         }
102 #endif
103
104 #define PTE_PREFETCH_NUM                8
105
106 #define PT_FIRST_AVAIL_BITS_SHIFT 9
107 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
108
109 #define PT64_LEVEL_BITS 9
110
111 #define PT64_LEVEL_SHIFT(level) \
112                 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
113
114 #define PT64_INDEX(address, level)\
115         (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
116
117
118 #define PT32_LEVEL_BITS 10
119
120 #define PT32_LEVEL_SHIFT(level) \
121                 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
122
123 #define PT32_LVL_OFFSET_MASK(level) \
124         (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
125                                                 * PT32_LEVEL_BITS))) - 1))
126
127 #define PT32_INDEX(address, level)\
128         (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
129
130
131 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
132 #define PT64_DIR_BASE_ADDR_MASK \
133         (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
134 #define PT64_LVL_ADDR_MASK(level) \
135         (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
136                                                 * PT64_LEVEL_BITS))) - 1))
137 #define PT64_LVL_OFFSET_MASK(level) \
138         (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
139                                                 * PT64_LEVEL_BITS))) - 1))
140
141 #define PT32_BASE_ADDR_MASK PAGE_MASK
142 #define PT32_DIR_BASE_ADDR_MASK \
143         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
144 #define PT32_LVL_ADDR_MASK(level) \
145         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
146                                             * PT32_LEVEL_BITS))) - 1))
147
148 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
149                         | PT64_NX_MASK)
150
151 #define PTE_LIST_EXT 4
152
153 #define ACC_EXEC_MASK    1
154 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
155 #define ACC_USER_MASK    PT_USER_MASK
156 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
157
158 #include <trace/events/kvm.h>
159
160 #define CREATE_TRACE_POINTS
161 #include "mmutrace.h"
162
163 #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
164
165 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
166
/*
 * One node of a pte list (e.g. an rmap chain): up to PTE_LIST_EXT spte
 * pointers, chained through ->more when a gfn has more mappings.
 */
struct pte_list_desc {
	u64 *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};
171
/* Cursor state for walking the shadow page table, one level at a time. */
struct kvm_shadow_walk_iterator {
	u64 addr;		/* guest address being translated */
	hpa_t shadow_addr;	/* host-physical address of current table */
	int level;		/* current paging level */
	u64 *sptep;		/* pointer to the current spte */
	unsigned index;		/* index of sptep within its table */
};
179
180 #define for_each_shadow_entry(_vcpu, _addr, _walker)    \
181         for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
182              shadow_walk_okay(&(_walker));                      \
183              shadow_walk_next(&(_walker)))
184
185 static struct kmem_cache *pte_list_desc_cache;
186 static struct kmem_cache *mmu_page_header_cache;
187 static struct percpu_counter kvm_total_used_mmu_pages;
188
189 static u64 __read_mostly shadow_nx_mask;
190 static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
191 static u64 __read_mostly shadow_user_mask;
192 static u64 __read_mostly shadow_accessed_mask;
193 static u64 __read_mostly shadow_dirty_mask;
194
195 static inline u64 rsvd_bits(int s, int e)
196 {
197         return ((1ULL << (e - s + 1)) - 1) << s;
198 }
199
/*
 * Install the spte bitmasks used by this MMU.  Called by vendor code
 * (VMX/SVM) since e.g. EPT uses different accessed/dirty/exec encodings.
 * x_mask and nx_mask are mutually exclusive.
 */
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
		u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
	shadow_user_mask = user_mask;
	shadow_accessed_mask = accessed_mask;
	shadow_dirty_mask = dirty_mask;
	shadow_nx_mask = nx_mask;
	shadow_x_mask = x_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
210
/* Whether the CPU reports PSE-36; hard-wired to "yes" here. */
static int is_cpuid_PSE36(void)
{
	return 1;
}
215
/* Non-zero if the guest has NX enabled in EFER. */
static int is_nx(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.efer & EFER_NX;
}
220
/* Non-zero if the spte has its present bit set. */
static int is_shadow_present_pte(u64 pte)
{
	return pte & PT_PRESENT_MASK;
}
225
/* Non-zero if the pte maps a large page (PS bit set). */
static int is_large_pte(u64 pte)
{
	return pte & PT_PAGE_SIZE_MASK;
}
230
/* Non-zero if the guest pte has its dirty bit set. */
static int is_dirty_gpte(unsigned long pte)
{
	return pte & PT_DIRTY_MASK;
}
235
/* An spte participates in the rmap iff it is shadow-present. */
static int is_rmap_spte(u64 pte)
{
	return is_shadow_present_pte(pte);
}
240
241 static int is_last_spte(u64 pte, int level)
242 {
243         if (level == PT_PAGE_TABLE_LEVEL)
244                 return 1;
245         if (is_large_pte(pte))
246                 return 1;
247         return 0;
248 }
249
/* Extract the host page frame number from an spte. */
static pfn_t spte_to_pfn(u64 pte)
{
	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
}
254
/*
 * For PSE-36, bits 13-20 of a large-page gpte hold physical address bits
 * 32-39; convert them into a gfn offset.
 */
static gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}
261
/* Atomically store a 64-bit spte (atomic even on 32-bit hosts). */
static void __set_spte(u64 *sptep, u64 spte)
{
	set_64bit(sptep, spte);
}
266
/*
 * Atomically exchange an spte, returning the old value.  On 32-bit hosts
 * there is no 64-bit xchg, so emulate it with a cmpxchg64 retry loop.
 */
static u64 __xchg_spte(u64 *sptep, u64 new_spte)
{
#ifdef CONFIG_X86_64
	return xchg(sptep, new_spte);
#else
	u64 old_spte;

	do {
		old_spte = *sptep;
	} while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);

	return old_spte;
#endif
}
281
/*
 * True if hardware may still flip bits (accessed/dirty) in this spte
 * concurrently, meaning it must be updated with an atomic exchange rather
 * than a plain store.
 */
static bool spte_has_volatile_bits(u64 spte)
{
	/* No hardware accessed bit (e.g. EPT without A/D) => nothing volatile. */
	if (!shadow_accessed_mask)
		return false;

	if (!is_shadow_present_pte(spte))
		return false;

	/*
	 * Accessed already set, and dirty either already set or impossible
	 * (not writable): hardware has no further bits to set.
	 */
	if ((spte & shadow_accessed_mask) &&
	      (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
		return false;

	return true;
}
296
297 static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
298 {
299         return (old_spte & bit_mask) && !(new_spte & bit_mask);
300 }
301
/*
 * Replace *sptep with new_spte, preserving the hardware-set dirty bit and
 * propagating lost accessed/dirty information to the backing struct page.
 * new_spte must still be an rmap (present) spte.
 */
static void update_spte(u64 *sptep, u64 new_spte)
{
	u64 mask, old_spte = *sptep;

	WARN_ON(!is_rmap_spte(new_spte));

	/* Never lose a dirty bit the hardware may have set meanwhile. */
	new_spte |= old_spte & shadow_dirty_mask;

	mask = shadow_accessed_mask;
	if (is_writable_pte(old_spte))
		mask |= shadow_dirty_mask;

	/*
	 * If nothing can change under us (or new_spte already has all the
	 * volatile bits set), a plain store suffices; otherwise exchange
	 * atomically so we observe the final hardware-set bits.
	 */
	if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
		__set_spte(sptep, new_spte);
	else
		old_spte = __xchg_spte(sptep, new_spte);

	if (!shadow_accessed_mask)
		return;

	/* Transfer any accessed/dirty state we just cleared to the page. */
	if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
	if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
}
327
/*
 * Ensure the per-vcpu object cache holds at least @min objects, filling it
 * to capacity from @base_cache.  Returns 0 or -ENOMEM.  Pre-filling lets
 * later MMU code allocate while holding the mmu lock without sleeping.
 */
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  struct kmem_cache *base_cache, int min)
{
	void *obj;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
		if (!obj)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = obj;
	}
	return 0;
}
343
/* Release every object still held by the cache back to its kmem_cache. */
static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
				  struct kmem_cache *cache)
{
	while (mc->nobjs)
		kmem_cache_free(cache, mc->objects[--mc->nobjs]);
}
350
/*
 * Like mmu_topup_memory_cache() but the cached objects are whole pages
 * from the page allocator.  Returns 0 or -ENOMEM.
 */
static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
				       int min)
{
	void *page;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		page = (void *)__get_free_page(GFP_KERNEL);
		if (!page)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = page;
	}
	return 0;
}
366
367 static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
368 {
369         while (mc->nobjs)
370                 free_page((unsigned long)mc->objects[--mc->nobjs]);
371 }
372
/*
 * Top up all three per-vcpu MMU caches (pte-list descriptors, shadow page
 * tables, page headers) before taking the mmu lock.  Returns 0 or -ENOMEM.
 */
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
{
	int r;

	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				   pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
	if (r)
		goto out;
	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
	if (r)
		goto out;
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
				   mmu_page_header_cache, 4);
out:
	return r;
}
389
/* Drain all three per-vcpu MMU caches (counterpart of the topup above). */
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				pte_list_desc_cache);
	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
				mmu_page_header_cache);
}
398
/*
 * Pop one pre-allocated object from the cache.  The caller must have
 * topped the cache up beforehand; running dry here is a bug.
 * NOTE(review): @size is unused — all objects in a given cache are the
 * same size; kept only for the existing call signature.
 */
static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
				    size_t size)
{
	void *p;

	BUG_ON(!mc->nobjs);
	p = mc->objects[--mc->nobjs];
	return p;
}
408
/* Take one pte_list_desc from the vcpu's pre-filled cache. */
static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
{
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache,
				      sizeof(struct pte_list_desc));
}
414
/* Return a pte_list_desc directly to its kmem_cache (not the vcpu cache). */
static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
{
	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
}
419
/*
 * Return the gfn mapped by slot @index of shadow page @sp.  Direct pages
 * map a contiguous gfn range, so the gfn is computed from sp->gfn; indirect
 * pages record each gfn explicitly in sp->gfns[].
 */
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
	if (!sp->role.direct)
		return sp->gfns[index];

	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
}
427
/*
 * Record the gfn for slot @index of @sp.  For direct pages the gfn is
 * implied by position, so only sanity-check it; otherwise store it.
 */
static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
	if (sp->role.direct)
		BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
	else
		sp->gfns[index] = gfn;
}
435
/*
 * Return the pointer to the large page information for a given gfn,
 * handling slots that are not large page aligned.
 *
 * @level is the paging level (>= PT_DIRECTORY_LEVEL); lpage_info[0]
 * corresponds to the directory level, hence the "level - 2" index.
 */
static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
					      struct kvm_memory_slot *slot,
					      int level)
{
	unsigned long idx;

	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
	      (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
	return &slot->lpage_info[level - 2][idx];
}
450
/*
 * Note that @gfn is now covered by an indirect shadow page: bump the
 * write_count of every huge-page region containing it (blocking large
 * mappings there) and the global indirect shadow page counter.
 */
static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;
	struct kvm_lpage_info *linfo;
	int i;

	slot = gfn_to_memslot(kvm, gfn);
	for (i = PT_DIRECTORY_LEVEL;
	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
		linfo = lpage_info_slot(gfn, slot, i);
		linfo->write_count += 1;
	}
	kvm->arch.indirect_shadow_pages++;
}
465
/* Reverse of account_shadowed(): drop the per-level write counts. */
static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;
	struct kvm_lpage_info *linfo;
	int i;

	slot = gfn_to_memslot(kvm, gfn);
	for (i = PT_DIRECTORY_LEVEL;
	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
		linfo = lpage_info_slot(gfn, slot, i);
		linfo->write_count -= 1;
		WARN_ON(linfo->write_count < 0);
	}
	kvm->arch.indirect_shadow_pages--;
}
481
/*
 * Non-zero if @gfn's region at @level contains write-protected (shadowed)
 * pages, i.e. a large mapping at that level is not allowed.  A gfn with no
 * memslot is treated as write-protected.
 */
static int has_wrprotected_page(struct kvm *kvm,
				gfn_t gfn,
				int level)
{
	struct kvm_memory_slot *slot;
	struct kvm_lpage_info *linfo;

	slot = gfn_to_memslot(kvm, gfn);
	if (slot) {
		linfo = lpage_info_slot(gfn, slot, level);
		return linfo->write_count;
	}

	return 1;
}
497
/*
 * Largest paging level at which the host maps @gfn, derived from the host
 * page size backing it (levels are tried smallest to largest, keeping the
 * biggest that still fits).
 */
static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
	unsigned long page_size;
	int i, ret = 0;

	page_size = kvm_host_page_size(kvm, gfn);

	for (i = PT_PAGE_TABLE_LEVEL;
	     i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
		if (page_size >= KVM_HPAGE_SIZE(i))
			ret = i;
		else
			break;
	}

	return ret;
}
515
/*
 * Resolve @gfn to its memslot, returning NULL for missing or invalid
 * slots — and, when @no_dirty_log is set, also for slots with dirty
 * logging enabled (callers then avoid mappings that bypass the log).
 */
static struct kvm_memory_slot *
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
			    bool no_dirty_log)
{
	struct kvm_memory_slot *slot;

	slot = gfn_to_memslot(vcpu->kvm, gfn);
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
	      (no_dirty_log && slot->dirty_bitmap))
		slot = NULL;

	return slot;
}
529
/* True if @large_gfn cannot be mapped large (no usable slot w/o dirty log). */
static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
	return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
}
534
/*
 * Choose the paging level at which to map @large_gfn: the minimum of the
 * host mapping level and the hardware's max large-page level, lowered
 * further past any level whose region contains write-protected pages.
 */
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
	int host_level, level, max_level;

	host_level = host_mapping_level(vcpu->kvm, large_gfn);

	if (host_level == PT_PAGE_TABLE_LEVEL)
		return host_level;

	max_level = kvm_x86_ops->get_lpage_level() < host_level ?
		kvm_x86_ops->get_lpage_level() : host_level;

	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
		if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
			break;

	/* level is the first unusable level; map one below it. */
	return level - 1;
}
553
/*
 * Pte mapping structures:
 *
 * If pte_list bit zero is zero, then pte_list point to the spte.
 *
 * If pte_list bit zero is one, (then pte_list & ~1) points to a struct
 * pte_list_desc containing more mappings.
 *
 * Returns the number of pte entries before the spte was added or zero if
 * the spte was not added.
 *
 */
static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
			unsigned long *pte_list)
{
	struct pte_list_desc *desc;
	int i, count = 0;

	if (!*pte_list) {
		/* Empty list: store the lone spte inline, no desc needed. */
		rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
		*pte_list = (unsigned long)spte;
	} else if (!(*pte_list & 1)) {
		/* One inline entry: promote to a desc holding both sptes. */
		rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
		desc = mmu_alloc_pte_list_desc(vcpu);
		desc->sptes[0] = (u64 *)*pte_list;
		desc->sptes[1] = spte;
		*pte_list = (unsigned long)desc | 1;
		++count;
	} else {
		/* Walk to the last desc, extending the chain if it is full. */
		rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
		desc = (struct pte_list_desc *)(*pte_list & ~1ul);
		while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
			desc = desc->more;
			count += PTE_LIST_EXT;
		}
		if (desc->sptes[PTE_LIST_EXT-1]) {
			desc->more = mmu_alloc_pte_list_desc(vcpu);
			desc = desc->more;
		}
		for (i = 0; desc->sptes[i]; ++i)
			++count;
		desc->sptes[i] = spte;
	}
	return count;
}
599
/*
 * Iterate a pte list: return the entry after @spte, or the first entry
 * when @spte is NULL.  Returns NULL when the list is exhausted (or empty).
 */
static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
{
	struct pte_list_desc *desc;
	u64 *prev_spte;
	int i;

	if (!*pte_list)
		return NULL;
	else if (!(*pte_list & 1)) {
		/* Single inline entry: it is "first", and it has no successor. */
		if (!spte)
			return (u64 *)*pte_list;
		return NULL;
	}
	desc = (struct pte_list_desc *)(*pte_list & ~1ul);
	prev_spte = NULL;
	while (desc) {
		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
			if (prev_spte == spte)
				return desc->sptes[i];
			prev_spte = desc->sptes[i];
		}
		desc = desc->more;
	}
	return NULL;
}
625
/*
 * Remove slot @i from @desc by swapping in the last used slot, then free
 * the desc if it became empty — collapsing back to an inline entry when
 * only one spte remains in a single desc.
 */
static void
pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
			   int i, struct pte_list_desc *prev_desc)
{
	int j;

	/* Find the last occupied slot j (>= i) in this desc. */
	for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
		;
	desc->sptes[i] = desc->sptes[j];
	desc->sptes[j] = NULL;
	if (j != 0)
		return;
	/* Desc now empty: unlink it (or demote list back to inline form). */
	if (!prev_desc && !desc->more)
		*pte_list = (unsigned long)desc->sptes[0];
	else
		if (prev_desc)
			prev_desc->more = desc->more;
		else
			*pte_list = (unsigned long)desc->more | 1;
	mmu_free_pte_list_desc(desc);
}
647
/*
 * Remove @spte from the pte list.  The spte must be present; a miss in
 * any of the three list shapes is a fatal consistency bug (BUG()).
 */
static void pte_list_remove(u64 *spte, unsigned long *pte_list)
{
	struct pte_list_desc *desc;
	struct pte_list_desc *prev_desc;
	int i;

	if (!*pte_list) {
		printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
		BUG();
	} else if (!(*pte_list & 1)) {
		/* Single inline entry: it had better be the one asked for. */
		rmap_printk("pte_list_remove:  %p 1->0\n", spte);
		if ((u64 *)*pte_list != spte) {
			printk(KERN_ERR "pte_list_remove:  %p 1->BUG\n", spte);
			BUG();
		}
		*pte_list = 0;
	} else {
		rmap_printk("pte_list_remove:  %p many->many\n", spte);
		desc = (struct pte_list_desc *)(*pte_list & ~1ul);
		prev_desc = NULL;
		while (desc) {
			for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
				if (desc->sptes[i] == spte) {
					pte_list_desc_remove_entry(pte_list,
							       desc, i,
							       prev_desc);
					return;
				}
			prev_desc = desc;
			desc = desc->more;
		}
		pr_err("pte_list_remove: %p many->many\n", spte);
		BUG();
	}
}
683
/* Invoke @fn on every spte pointer in the pte list. */
typedef void (*pte_list_walk_fn) (u64 *spte);
static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
{
	struct pte_list_desc *desc;
	int i;

	if (!*pte_list)
		return;

	if (!(*pte_list & 1))
		return fn((u64 *)*pte_list);

	desc = (struct pte_list_desc *)(*pte_list & ~1ul);
	while (desc) {
		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
			fn(desc->sptes[i]);
		desc = desc->more;
	}
}
703
/*
 * Take gfn and return the reverse mapping to it.
 *
 * 4K mappings live in the memslot's rmap array; large mappings live in the
 * per-level lpage_info entry.
 */
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
{
	struct kvm_memory_slot *slot;
	struct kvm_lpage_info *linfo;

	slot = gfn_to_memslot(kvm, gfn);
	if (likely(level == PT_PAGE_TABLE_LEVEL))
		return &slot->rmap[gfn - slot->base_gfn];

	linfo = lpage_info_slot(gfn, slot, level);

	return &linfo->rmap_pde;
}
720
/*
 * Register @spte (which maps @gfn) in the rmap.  Also records the gfn in
 * the spte's shadow page.  Returns the pte_list_add() count.
 */
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	unsigned long *rmapp;

	sp = page_header(__pa(spte));
	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
	return pte_list_add(vcpu, spte, rmapp);
}
731
/* rmap iteration: thin wrapper over pte_list_next(); @kvm is unused. */
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
{
	return pte_list_next(rmapp, spte);
}
736
/*
 * Unregister @spte from the rmap of the gfn it maps (recovered from the
 * spte's shadow page).
 */
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
	struct kvm_mmu_page *sp;
	gfn_t gfn;
	unsigned long *rmapp;

	sp = page_header(__pa(spte));
	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
	pte_list_remove(spte, rmapp);
}
748
/*
 * Overwrite *sptep with @new_spte and propagate the old spte's
 * accessed/dirty state to the backing page.  Returns 1 if the old spte
 * was an rmap (present) spte — i.e. the caller must update the rmap.
 */
static int set_spte_track_bits(u64 *sptep, u64 new_spte)
{
	pfn_t pfn;
	u64 old_spte = *sptep;

	/* Exchange atomically only if hardware may still flip bits in it. */
	if (!spte_has_volatile_bits(old_spte))
		__set_spte(sptep, new_spte);
	else
		old_spte = __xchg_spte(sptep, new_spte);

	if (!is_rmap_spte(old_spte))
		return 0;

	pfn = spte_to_pfn(old_spte);
	/* Without hardware A/D bits, conservatively assume accessed/dirty. */
	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
		kvm_set_pfn_accessed(pfn);
	if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
		kvm_set_pfn_dirty(pfn);
	return 1;
}
769
/* Clear an spte and, if it was present, remove it from the rmap. */
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
	if (set_spte_track_bits(sptep, 0ull))
		rmap_remove(kvm, sptep);
}
775
/*
 * Write-protect all sptes mapping @gfn.  4K sptes have their writable bit
 * cleared; writable huge-page sptes are dropped entirely (they will be
 * refaulted at 4K granularity).  Returns non-zero if anything changed,
 * telling the caller a TLB flush is needed.
 */
static int rmap_write_protect(struct kvm *kvm, u64 gfn)
{
	unsigned long *rmapp;
	u64 *spte;
	int i, write_protected = 0;

	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);

	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
		BUG_ON(!spte);
		BUG_ON(!(*spte & PT_PRESENT_MASK));
		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
		if (is_writable_pte(*spte)) {
			update_spte(spte, *spte & ~PT_WRITABLE_MASK);
			write_protected = 1;
		}
		spte = rmap_next(kvm, rmapp, spte);
	}

	/* check for huge page mappings */
	for (i = PT_DIRECTORY_LEVEL;
	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
		rmapp = gfn_to_rmap(kvm, gfn, i);
		spte = rmap_next(kvm, rmapp, NULL);
		while (spte) {
			BUG_ON(!spte);
			BUG_ON(!(*spte & PT_PRESENT_MASK));
			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
			if (is_writable_pte(*spte)) {
				drop_spte(kvm, spte);
				--kvm->stat.lpages;
				/* spte was removed; restart from list head. */
				spte = NULL;
				write_protected = 1;
			}
			spte = rmap_next(kvm, rmapp, spte);
		}
	}

	return write_protected;
}
818
/*
 * mmu-notifier handler: drop every spte on this rmap chain.  Always takes
 * the head because drop_spte() mutates the list.  Returns non-zero if a
 * TLB flush is needed; @data is unused.
 */
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
			   unsigned long data)
{
	u64 *spte;
	int need_tlb_flush = 0;

	while ((spte = rmap_next(kvm, rmapp, NULL))) {
		BUG_ON(!(*spte & PT_PRESENT_MASK));
		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
		drop_spte(kvm, spte);
		need_tlb_flush = 1;
	}
	return need_tlb_flush;
}
833
/*
 * mmu-notifier change_pte handler: @data points to the new host pte.  For
 * a writable new pte just drop the sptes (refault later); otherwise
 * repoint each spte at the new pfn, stripped of write permission and the
 * accessed bit.  Flushes the TLB if anything changed; always returns 0.
 */
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
			     unsigned long data)
{
	int need_flush = 0;
	u64 *spte, new_spte;
	pte_t *ptep = (pte_t *)data;
	pfn_t new_pfn;

	WARN_ON(pte_huge(*ptep));
	new_pfn = pte_pfn(*ptep);
	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
		BUG_ON(!is_shadow_present_pte(*spte));
		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
		need_flush = 1;
		if (pte_write(*ptep)) {
			/* drop_spte() unlinks the entry: restart from head. */
			drop_spte(kvm, spte);
			spte = rmap_next(kvm, rmapp, NULL);
		} else {
			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
			new_spte |= (u64)new_pfn << PAGE_SHIFT;

			new_spte &= ~PT_WRITABLE_MASK;
			new_spte &= ~SPTE_HOST_WRITEABLE;
			new_spte &= ~shadow_accessed_mask;
			set_spte_track_bits(spte, new_spte);
			spte = rmap_next(kvm, rmapp, spte);
		}
	}
	if (need_flush)
		kvm_flush_remote_tlbs(kvm);

	return 0;
}
868
/*
 * Dispatch an mmu-notifier event for host virtual address @hva: find every
 * memslot containing it and run @handler on the 4K rmap plus each
 * large-page rmap covering the gfn.  Returns the OR of handler results.
 */
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  unsigned long data,
			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
					 unsigned long data))
{
	int i, j;
	int ret;
	int retval = 0;
	struct kvm_memslots *slots;

	slots = kvm_memslots(kvm);

	for (i = 0; i < slots->nmemslots; i++) {
		struct kvm_memory_slot *memslot = &slots->memslots[i];
		unsigned long start = memslot->userspace_addr;
		unsigned long end;

		end = start + (memslot->npages << PAGE_SHIFT);
		if (hva >= start && hva < end) {
			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
			gfn_t gfn = memslot->base_gfn + gfn_offset;

			ret = handler(kvm, &memslot->rmap[gfn_offset], data);

			/* Also visit each huge-page rmap covering this gfn. */
			for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
				struct kvm_lpage_info *linfo;

				linfo = lpage_info_slot(gfn, memslot,
							PT_DIRECTORY_LEVEL + j);
				ret |= handler(kvm, &linfo->rmap_pde, data);
			}
			trace_kvm_age_page(hva, memslot, ret);
			retval |= ret;
		}
	}

	return retval;
}
907
/* mmu-notifier entry: unmap all sptes backed by @hva. */
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
}
912
/* mmu-notifier entry: the host pte for @hva changed to @pte. */
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
}
917
/*
 * Page-aging handler: test-and-clear the accessed bit on every spte in the
 * chain.  Returns non-zero if any spte was recently accessed.
 */
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
			 unsigned long data)
{
	u64 *spte;
	int young = 0;

	/*
	 * Emulate the accessed bit for EPT, by checking if this page has
	 * an EPT mapping, and clearing it if it does. On the next access,
	 * a new EPT mapping will be established.
	 * This has some overhead, but not as much as the cost of swapping
	 * out actively used pages or breaking up actively used hugepages.
	 */
	if (!shadow_accessed_mask)
		return kvm_unmap_rmapp(kvm, rmapp, data);

	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
		int _young;
		u64 _spte = *spte;
		BUG_ON(!(_spte & PT_PRESENT_MASK));
		_young = _spte & PT_ACCESSED_MASK;
		if (_young) {
			young = 1;
			/* Atomic clear: hardware may set bits concurrently. */
			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
		}
		spte = rmap_next(kvm, rmapp, spte);
	}
	return young;
}
948
949 static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
950                               unsigned long data)
951 {
952         u64 *spte;
953         int young = 0;
954
955         /*
956          * If there's no access bit in the secondary pte set by the
957          * hardware it's up to gup-fast/gup to set the access bit in
958          * the primary pte or in the page structure.
959          */
960         if (!shadow_accessed_mask)
961                 goto out;
962
963         spte = rmap_next(kvm, rmapp, NULL);
964         while (spte) {
965                 u64 _spte = *spte;
966                 BUG_ON(!(_spte & PT_PRESENT_MASK));
967                 young = _spte & PT_ACCESSED_MASK;
968                 if (young) {
969                         young = 1;
970                         break;
971                 }
972                 spte = rmap_next(kvm, rmapp, spte);
973         }
974 out:
975         return young;
976 }
977
/* Rmap chain length at which callers recycle the chain -- NOTE(review):
 * the call site is outside this chunk; confirm against rmap_add(). */
#define RMAP_RECYCLE_THRESHOLD 1000

/*
 * Zap every spte on @gfn's rmap chain at the level of the shadow page
 * that contains @spte, then flush remote TLBs so stale translations
 * disappear everywhere.
 */
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	unsigned long *rmapp;
	struct kvm_mmu_page *sp;

	/* The owning shadow page tells us which level's rmap to use. */
	sp = page_header(__pa(spte));

	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);

	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
	kvm_flush_remote_tlbs(vcpu->kvm);
}
992
/* MMU-notifier clear_flush_young hook: age the sptes mapping @hva. */
int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
}
997
/* MMU-notifier test_young hook: non-destructively test ages for @hva. */
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
}
1002
1003 #ifdef MMU_DEBUG
1004 static int is_empty_shadow_page(u64 *spt)
1005 {
1006         u64 *pos;
1007         u64 *end;
1008
1009         for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
1010                 if (is_shadow_present_pte(*pos)) {
1011                         printk(KERN_ERR "%s: %p %llx\n", __func__,
1012                                pos, *pos);
1013                         return 0;
1014                 }
1015         return 1;
1016 }
1017 #endif
1018
/*
 * This value is the sum of all of the kvm instances's
 * kvm->arch.n_used_mmu_pages values.  We need a global,
 * aggregate version in order to make the slab shrinker
 * faster
 */
static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
{
	/* @nr may be negative; keep the global shrinker counter in step. */
	kvm->arch.n_used_mmu_pages += nr;
	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}
1030
/*
 * Remove the sp from shadow page cache, after call it,
 * we can not find this sp from the cache, and the shadow
 * page table is still valid.
 * It should be under the protection of mmu lock.
 */
static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
{
	ASSERT(is_empty_shadow_page(sp->spt));
	hlist_del(&sp->hash_link);
	/* Only indirect pages carry a gfns reverse-map array. */
	if (!sp->role.direct)
		free_page((unsigned long)sp->gfns);
}
1044
/*
 * Free the shadow page table and the sp, we can do it
 * out of the protection of mmu lock.
 */
static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
{
	list_del(&sp->link);
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}
1055
1056 static unsigned kvm_page_table_hashfn(gfn_t gfn)
1057 {
1058         return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
1059 }
1060
/*
 * Link @parent_pte into @sp's parent-pte list.  A NULL parent (root
 * pages) is silently ignored.
 */
static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp, u64 *parent_pte)
{
	if (!parent_pte)
		return;

	pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
}
1069
/* Unlink @parent_pte from @sp's parent-pte list (spte left untouched). */
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
				       u64 *parent_pte)
{
	pte_list_remove(parent_pte, &sp->parent_ptes);
}
1075
/*
 * Unlink @parent_pte from @sp and clear the spte itself, so the parent
 * table no longer points at @sp.
 */
static void drop_parent_pte(struct kvm_mmu_page *sp,
			    u64 *parent_pte)
{
	mmu_page_remove_parent_pte(sp, parent_pte);
	__set_spte(parent_pte, 0ull);
}
1082
/*
 * Allocate a new shadow page (header + spt page, plus a gfns page for
 * indirect pages), link it on the active list, and attach @parent_pte.
 * Allocations come from per-vcpu caches, presumably pre-topped-up by
 * the fault path -- no NULL checks here; confirm against callers.
 */
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
					       u64 *parent_pte, int direct)
{
	struct kvm_mmu_page *sp;
	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache,
					sizeof *sp);
	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
	if (!direct)
		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
						  PAGE_SIZE);
	/* Let page_header() find the sp from any spte inside spt. */
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
	sp->parent_ptes = 0;
	mmu_page_add_parent_pte(vcpu, sp, parent_pte);
	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
	return sp;
}
1101
static void mark_unsync(u64 *spte);
/* Mark every parent pte of @sp unsync (recursively, via mark_unsync). */
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
	pte_list_walk(&sp->parent_ptes, mark_unsync);
}
1107
/*
 * Record that the child behind @spte has gone unsync: set the child's
 * bit in the owning page's unsync_child_bitmap and bump the counter.
 * Recursion toward the root stops as soon as a page already had the
 * bit set or already had unsync children, so the walk is O(changed).
 */
static void mark_unsync(u64 *spte)
{
	struct kvm_mmu_page *sp;
	unsigned int index;

	sp = page_header(__pa(spte));
	index = spte - sp->spt;
	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
		return;
	/* Only the 0 -> 1 transition needs to propagate upward. */
	if (sp->unsync_children++)
		return;
	kvm_mmu_mark_parents_unsync(sp);
}
1121
/* sync_page stub for the nonpaging mmu: nothing to sync, report failure
 * so the caller zaps the page (matches __kvm_sync_page()'s contract). */
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
			       struct kvm_mmu_page *sp)
{
	return 1;
}
1127
/* invlpg stub: no guest page tables to invalidate when paging is off. */
static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
}
1131
/* update_pte stub: must never be reached with the nonpaging mmu. */
static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
				 struct kvm_mmu_page *sp, u64 *spte,
				 const void *pte)
{
	WARN_ON(1);
}
1138
#define KVM_PAGE_ARRAY_NR 16

/*
 * Fixed-size batch of shadow pages collected by mmu_unsync_walk(),
 * each paired with the index through which it was reached in its
 * parent table.
 */
struct kvm_mmu_pages {
	struct mmu_page_and_offset {
		struct kvm_mmu_page *sp;
		unsigned int idx;	/* slot of sp in its parent's spt */
	} page[KVM_PAGE_ARRAY_NR];
	unsigned int nr;		/* entries used in page[] */
};
1148
/*
 * Iterate over set bits of a page's unsync_child_bitmap; 512 is the
 * number of 64-bit entries in one page table page.
 */
#define for_each_unsync_children(bitmap, idx)           \
	for (idx = find_first_bit(bitmap, 512);         \
	     idx < 512;                                 \
	     idx = find_next_bit(bitmap, 512, idx+1))
1153
1154 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1155                          int idx)
1156 {
1157         int i;
1158
1159         if (sp->unsync)
1160                 for (i=0; i < pvec->nr; i++)
1161                         if (pvec->page[i].sp == sp)
1162                                 return 0;
1163
1164         pvec->page[pvec->nr].sp = sp;
1165         pvec->page[pvec->nr].idx = idx;
1166         pvec->nr++;
1167         return (pvec->nr == KVM_PAGE_ARRAY_NR);
1168 }
1169
/*
 * Recursively collect into @pvec every page below @sp that is unsync
 * or has unsync descendants.  Returns the number of unsync leaves
 * found, or -ENOSPC when @pvec fills up (caller drains and restarts).
 * Bitmap bits found stale along the way (entry gone, large, or child
 * fully synced) are cleared as we go.
 */
static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_mmu_pages *pvec)
{
	int i, ret, nr_unsync_leaf = 0;

	for_each_unsync_children(sp->unsync_child_bitmap, i) {
		struct kvm_mmu_page *child;
		u64 ent = sp->spt[i];

		/* Non-present or large entries cannot have child pages. */
		if (!is_shadow_present_pte(ent) || is_large_pte(ent))
			goto clear_child_bitmap;

		child = page_header(ent & PT64_BASE_ADDR_MASK);

		if (child->unsync_children) {
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;

			ret = __mmu_unsync_walk(child, pvec);
			if (!ret)
				/* stale: nothing unsync left below child */
				goto clear_child_bitmap;
			else if (ret > 0)
				nr_unsync_leaf += ret;
			else
				return ret;	/* -ENOSPC from recursion */
		} else if (child->unsync) {
			nr_unsync_leaf++;
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;
		} else
			 goto clear_child_bitmap;

		continue;

clear_child_bitmap:
		__clear_bit(i, sp->unsync_child_bitmap);
		sp->unsync_children--;
		WARN_ON((int)sp->unsync_children < 0);
	}


	return nr_unsync_leaf;
}
1213
/*
 * Entry point for the unsync walk: seed @pvec with @parent itself
 * (idx 0) and descend.  Returns 0 immediately when nothing below
 * @parent is unsync.
 */
static int mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_mmu_pages *pvec)
{
	if (!sp->unsync_children)
		return 0;

	mmu_pages_add(pvec, sp, 0);
	return __mmu_unsync_walk(sp, pvec);
}
1223
/* Clear @sp's unsync state and keep the mmu_unsync statistic in step. */
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	WARN_ON(!sp->unsync);
	trace_kvm_mmu_sync_page(sp);
	sp->unsync = 0;
	--kvm->stat.mmu_unsync;
}
1231
static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				    struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list);

/* Walk every shadow page hashed under @gfn. */
#define for_each_gfn_sp(kvm, sp, gfn, pos)                              \
  hlist_for_each_entry(sp, pos,                                         \
   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)   \
	if ((sp)->gfn != (gfn)) {} else

/* Like for_each_gfn_sp(), but skip direct and invalid pages. */
#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)               \
  hlist_for_each_entry(sp, pos,                                         \
   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)   \
		if ((sp)->gfn != (gfn) || (sp)->role.direct ||          \
			(sp)->role.invalid) {} else
1247
/* @sp->gfn should be write-protected at the call site */
/*
 * Resynchronize an unsync shadow page with its guest page table.
 * Returns 1 (and queues @sp for zapping) when the page is unusable:
 * PAE mode changed, or the mmu's sync_page callback failed.  Returns 0
 * after a successful sync (local TLB flushed).  @clear_unsync chooses
 * whether the unsync flag is dropped before syncing.
 */
static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			   struct list_head *invalid_list, bool clear_unsync)
{
	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
		return 1;
	}

	if (clear_unsync)
		kvm_unlink_unsync_page(vcpu->kvm, sp);

	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
		return 1;
	}

	kvm_mmu_flush_tlb(vcpu);
	return 0;
}
1268
1269 static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
1270                                    struct kvm_mmu_page *sp)
1271 {
1272         LIST_HEAD(invalid_list);
1273         int ret;
1274
1275         ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
1276         if (ret)
1277                 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1278
1279         return ret;
1280 }
1281
/* Full sync: clear the unsync flag and resynchronize @sp. */
static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			 struct list_head *invalid_list)
{
	return __kvm_sync_page(vcpu, sp, invalid_list, true);
}
1287
/* @gfn should be write-protected at the call site */
/*
 * Sync every unsync last-level shadow page for @gfn; pages that fail
 * to sync (or whose PAE mode no longer matches) are zapped in a batch.
 * The TLB is flushed once at the end if anything was synced.
 */
static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
{
	struct kvm_mmu_page *s;
	struct hlist_node *node;
	LIST_HEAD(invalid_list);
	bool flush = false;

	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
		if (!s->unsync)
			continue;

		/* Only leaf pages can be unsync. */
		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
		kvm_unlink_unsync_page(vcpu->kvm, s);
		if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
			(vcpu->arch.mmu.sync_page(vcpu, s))) {
			kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
			continue;
		}
		flush = true;
	}

	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
	if (flush)
		kvm_mmu_flush_tlb(vcpu);
}
1314
/*
 * Ancestor cursor used when walking unsync children: parent[l] is the
 * shadow page recorded for level l+2, idx[l] the slot taken within it.
 */
struct mmu_page_path {
	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
	unsigned int idx[PT64_ROOT_LEVEL-1];
};
1319
/*
 * Iterate over the leaf-level pages in @pvec, keeping @parents in step
 * via mmu_pages_next(); intermediate pages are recorded, not visited.
 */
#define for_each_sp(pvec, sp, parents, i)                       \
		for (i = mmu_pages_next(&pvec, &parents, -1),   \
			sp = pvec.page[i].sp;                   \
			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
			i = mmu_pages_next(&pvec, &parents, i))
1325
/*
 * Advance past position @i in @pvec to the next leaf-level page,
 * recording each intermediate page and its child index in @parents.
 * Returns the leaf's position, or pvec->nr when the vector is done.
 */
static int mmu_pages_next(struct kvm_mmu_pages *pvec,
			  struct mmu_page_path *parents,
			  int i)
{
	int n;

	for (n = i+1; n < pvec->nr; n++) {
		struct kvm_mmu_page *sp = pvec->page[n].sp;

		if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
			parents->idx[0] = pvec->page[n].idx;
			return n;
		}

		/* Non-leaf: remember it so the leaf's ancestry is known. */
		parents->parent[sp->role.level-2] = sp;
		parents->idx[sp->role.level-1] = pvec->page[n].idx;
	}

	return n;
}
1346
/*
 * After a collected leaf has been processed, clear its bit in each
 * recorded ancestor and decrement their unsync_children counts,
 * walking upward while an ancestor runs out of unsync children.
 */
static void mmu_pages_clear_parents(struct mmu_page_path *parents)
{
	struct kvm_mmu_page *sp;
	unsigned int level = 0;

	do {
		unsigned int idx = parents->idx[level];

		sp = parents->parent[level];
		if (!sp)
			return;

		--sp->unsync_children;
		WARN_ON((int)sp->unsync_children < 0);
		__clear_bit(idx, sp->unsync_child_bitmap);
		level++;
	} while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
}
1365
/* Reset walk state before (re)scanning @parent's unsync children. */
static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
			       struct mmu_page_path *parents,
			       struct kvm_mmu_pages *pvec)
{
	parents->parent[parent->role.level-1] = NULL;
	pvec->nr = 0;
}
1373
/*
 * Sync all unsync descendants of @parent in batches.  Each batch is
 * first write-protected (one remote flush if anything changed), then
 * synced page by page.  The mmu_lock may be dropped between batches
 * via cond_resched_lock(), hence the walk state is re-initialized
 * every iteration.
 */
static void mmu_sync_children(struct kvm_vcpu *vcpu,
			      struct kvm_mmu_page *parent)
{
	int i;
	struct kvm_mmu_page *sp;
	struct mmu_page_path parents;
	struct kvm_mmu_pages pages;
	LIST_HEAD(invalid_list);

	kvm_mmu_pages_init(parent, &parents, &pages);
	while (mmu_unsync_walk(parent, &pages)) {
		int protected = 0;

		for_each_sp(pages, sp, parents, i)
			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);

		if (protected)
			kvm_flush_remote_tlbs(vcpu->kvm);

		for_each_sp(pages, sp, parents, i) {
			kvm_sync_page(vcpu, sp, &invalid_list);
			mmu_pages_clear_parents(&parents);
		}
		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
		cond_resched_lock(&vcpu->kvm->mmu_lock);
		kvm_mmu_pages_init(parent, &parents, &pages);
	}
}
1402
1403 static void init_shadow_page_table(struct kvm_mmu_page *sp)
1404 {
1405         int i;
1406
1407         for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1408                 sp->spt[i] = 0ull;
1409 }
1410
/*
 * Look up -- or allocate -- the shadow page for @gfn with the role
 * derived from the current mmu context (@level, @direct, @access, and
 * for 32-bit guests the quadrant encoded from @gaddr).  A cache hit
 * gains @parent_pte as an extra parent and may require a sync; a miss
 * allocates, hashes, and (for indirect pages) write-protects and
 * accounts the gfn.  Called under mmu_lock.
 */
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
					     gfn_t gfn,
					     gva_t gaddr,
					     unsigned level,
					     int direct,
					     unsigned access,
					     u64 *parent_pte)
{
	union kvm_mmu_page_role role;
	unsigned quadrant;
	struct kvm_mmu_page *sp;
	struct hlist_node *node;
	bool need_sync = false;

	role = vcpu->arch.mmu.base_role;
	role.level = level;
	role.direct = direct;
	if (role.direct)
		role.cr4_pae = 0;
	role.access = access;
	if (!vcpu->arch.mmu.direct_map
	    && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
		/*
		 * A 32-bit guest table covers less address space than a
		 * 64-bit shadow page; the quadrant disambiguates which
		 * part of @gaddr this shadow page represents.
		 */
		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
		role.quadrant = quadrant;
	}
	for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
		/* Any unsync page for this gfn forces a sync on a miss. */
		if (!need_sync && sp->unsync)
			need_sync = true;

		if (sp->role.word != role.word)
			continue;

		if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
			break;

		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
		if (sp->unsync_children) {
			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
			kvm_mmu_mark_parents_unsync(sp);
		} else if (sp->unsync)
			kvm_mmu_mark_parents_unsync(sp);

		trace_kvm_mmu_get_page(sp, false);
		return sp;
	}
	++vcpu->kvm->stat.mmu_cache_miss;
	sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
	if (!sp)
		return sp;
	sp->gfn = gfn;
	sp->role = role;
	hlist_add_head(&sp->hash_link,
		&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
	if (!direct) {
		if (rmap_write_protect(vcpu->kvm, gfn))
			kvm_flush_remote_tlbs(vcpu->kvm);
		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
			kvm_sync_pages(vcpu, gfn);

		account_shadowed(vcpu->kvm, gfn);
	}
	init_shadow_page_table(sp);
	trace_kvm_mmu_get_page(sp, true);
	return sp;
}
1477
/*
 * Start a shadow page table walk for @addr at the current root.  For a
 * PAE guest under a 64-bit shadow root the top level is skipped, and
 * for a PAE root the correct pae_root entry is selected by bits 31:30
 * of @addr (a missing entry terminates the walk immediately).
 */
static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
			     struct kvm_vcpu *vcpu, u64 addr)
{
	iterator->addr = addr;
	iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
	iterator->level = vcpu->arch.mmu.shadow_root_level;

	if (iterator->level == PT64_ROOT_LEVEL &&
	    vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
	    !vcpu->arch.mmu.direct_map)
		--iterator->level;

	if (iterator->level == PT32E_ROOT_LEVEL) {
		iterator->shadow_addr
			= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
		iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
		--iterator->level;
		if (!iterator->shadow_addr)
			iterator->level = 0;
	}
}
1499
/*
 * True while the walk is still inside the page tables; on success the
 * iterator's index and sptep are set for the current level.
 */
static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
{
	if (iterator->level < PT_PAGE_TABLE_LEVEL)
		return false;

	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
	iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
	return true;
}
1509
/*
 * Descend one level, or end the walk (level = 0) when the current spte
 * is a last-level mapping.
 */
static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
{
	if (is_last_spte(*iterator->sptep, iterator->level)) {
		iterator->level = 0;
		return;
	}

	iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
	--iterator->level;
}
1520
/*
 * Point a non-leaf spte at shadow page @sp.  Intermediate levels get
 * full permissions; the real access control lives in the leaf sptes.
 */
static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
{
	u64 spte;

	spte = __pa(sp->spt)
		| PT_PRESENT_MASK | PT_ACCESSED_MASK
		| PT_WRITABLE_MASK | PT_USER_MASK;
	__set_spte(sptep, spte);
}
1530
/*
 * If @sptep currently holds a large-page mapping, drop it (and flush
 * remote TLBs) so the slot can be relinked to a page table instead.
 */
static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
	if (is_large_pte(*sptep)) {
		drop_spte(vcpu->kvm, sptep);
		kvm_flush_remote_tlbs(vcpu->kvm);
	}
}
1538
/*
 * Ensure a non-leaf spte points at a direct child page whose access
 * matches @direct_access; if not, detach the child so a correctly
 * permissioned page gets linked in by the caller.
 */
static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
				   unsigned direct_access)
{
	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
		struct kvm_mmu_page *child;

		/*
		 * For the direct sp, if the guest pte's dirty bit
		 * changed form clean to dirty, it will corrupt the
		 * sp's access: allow writable in the read-only sp,
		 * so we should update the spte at this point to get
		 * a new sp with the correct access.
		 */
		child = page_header(*sptep & PT64_BASE_ADDR_MASK);
		if (child->role.access == direct_access)
			return;

		drop_parent_pte(child, sptep);
		kvm_flush_remote_tlbs(vcpu->kvm);
	}
}
1560
/*
 * Clear one spte of shadow page @sp: a leaf mapping is dropped (rmap
 * updated), a non-leaf entry is detached from its child page.  Large
 * mappings keep the lpages statistic in step.
 */
static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
			     u64 *spte)
{
	u64 pte;
	struct kvm_mmu_page *child;

	pte = *spte;
	if (is_shadow_present_pte(pte)) {
		if (is_last_spte(pte, sp->role.level))
			drop_spte(kvm, spte);
		else {
			child = page_header(pte & PT64_BASE_ADDR_MASK);
			drop_parent_pte(child, spte);
		}
	}

	if (is_large_pte(pte))
		--kvm->stat.lpages;
}
1580
1581 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1582                                          struct kvm_mmu_page *sp)
1583 {
1584         unsigned i;
1585
1586         for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1587                 mmu_page_zap_pte(kvm, sp, sp->spt + i);
1588 }
1589
/* Drop one parent reference to @sp (the spte itself is left alone). */
static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
{
	mmu_page_remove_parent_pte(sp, parent_pte);
}
1594
/*
 * Invalidate every vcpu's cached last_pte_updated pointer -- it may
 * reference a shadow page that is about to be zapped.
 */
static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
{
	int i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm)
		vcpu->arch.last_pte_updated = NULL;
}
1603
/* Detach @sp from every parent table by clearing each parent spte. */
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 *parent_pte;

	while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL)))
		drop_parent_pte(sp, parent_pte);
}
1611
/*
 * Zap all unsync descendants of @parent (batch-walked the same way as
 * mmu_sync_children()) and queue them on @invalid_list.  Returns the
 * number of pages zapped; leaf-level parents have no children.
 */
static int mmu_zap_unsync_children(struct kvm *kvm,
				   struct kvm_mmu_page *parent,
				   struct list_head *invalid_list)
{
	int i, zapped = 0;
	struct mmu_page_path parents;
	struct kvm_mmu_pages pages;

	if (parent->role.level == PT_PAGE_TABLE_LEVEL)
		return 0;

	kvm_mmu_pages_init(parent, &parents, &pages);
	while (mmu_unsync_walk(parent, &pages)) {
		struct kvm_mmu_page *sp;

		for_each_sp(pages, sp, parents, i) {
			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
			mmu_pages_clear_parents(&parents);
			zapped++;
		}
		kvm_mmu_pages_init(parent, &parents, &pages);
	}

	return zapped;
}
1637
/*
 * First half of zapping @sp: detach it (and any unsync descendants)
 * from the page tables and queue unreferenced pages on @invalid_list
 * for kvm_mmu_commit_zap_page().  A page still pinned as a root is
 * only marked invalid; remote mmus are reloaded so they drop their
 * reference.  Returns the number of pages zapped (including @sp).
 */
static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				    struct list_head *invalid_list)
{
	int ret;

	trace_kvm_mmu_prepare_zap_page(sp);
	++kvm->stat.mmu_shadow_zapped;
	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
	kvm_mmu_page_unlink_children(kvm, sp);
	kvm_mmu_unlink_parents(kvm, sp);
	if (!sp->role.invalid && !sp->role.direct)
		unaccount_shadowed(kvm, sp->gfn);
	if (sp->unsync)
		kvm_unlink_unsync_page(kvm, sp);
	if (!sp->root_count) {
		/* Count self */
		ret++;
		list_move(&sp->link, invalid_list);
		kvm_mod_used_mmu_pages(kvm, -1);
	} else {
		list_move(&sp->link, &kvm->arch.active_mmu_pages);
		kvm_reload_remote_mmus(kvm);
	}

	sp->role.invalid = 1;
	kvm_mmu_reset_last_pte_updated(kvm);
	return ret;
}
1666
/*
 * Second half of zapping: after a single remote TLB flush, unhash and
 * free every page queued on @invalid_list.  A no-op on an empty list,
 * so callers can invoke it unconditionally.
 */
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list)
{
	struct kvm_mmu_page *sp;

	if (list_empty(invalid_list))
		return;

	/* One flush covers the whole batch. */
	kvm_flush_remote_tlbs(kvm);

	do {
		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
		WARN_ON(!sp->role.invalid || sp->root_count);
		kvm_mmu_isolate_page(sp);
		kvm_mmu_free_page(sp);
	} while (!list_empty(invalid_list));

}
1685
/*
 * Change the number of mmu pages allocated to the vm.
 * Note: if goal_nr_mmu_pages is too small, you will get deadlock.
 */
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
{
	LIST_HEAD(invalid_list);
	/*
	 * If the new limit is smaller than the number of pages
	 * currently in use, free shadow pages (oldest first, from the
	 * tail of the active list) until we fit, then clamp the goal
	 * to what was actually reached.
	 */

	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
		while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
			!list_empty(&kvm->arch.active_mmu_pages)) {
			struct kvm_mmu_page *page;

			page = container_of(kvm->arch.active_mmu_pages.prev,
					    struct kvm_mmu_page, link);
			kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
		}
		kvm_mmu_commit_zap_page(kvm, &invalid_list);
		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
	}

	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
}
1714
/*
 * Zap every indirect valid shadow page for @gfn so the guest may write
 * the page without faulting on write protection.  Returns 1 when at
 * least one page was zapped, 0 otherwise.
 */
static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	struct hlist_node *node;
	LIST_HEAD(invalid_list);
	int r;

	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
	r = 0;

	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
			 sp->role.word);
		r = 1;
		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
	}
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	return r;
}
1734
/* Drop every indirect valid shadow page for @gfn (no result reported). */
static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	struct hlist_node *node;
	LIST_HEAD(invalid_list);

	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
		pgprintk("%s: zap %llx %x\n",
			 __func__, gfn, sp->role.word);
		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
	}
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
}
1748
/*
 * Note in the owning shadow page's slot bitmap that it maps a gfn from
 * @gfn's memslot (used when a slot's pages must be found later).
 */
static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
{
	int slot = memslot_id(kvm, gfn);
	struct kvm_mmu_page *sp = page_header(__pa(pte));

	__set_bit(slot, sp->slot_bitmap);
}
1756
/*
 * The function is based on mtrr_type_lookup() in
 * arch/x86/kernel/cpu/mtrr/generic.c
 *
 * Resolve the guest MTRR memory type for [start, end).  Returns the
 * type, 0xFF when MTRRs are disabled, or 0xFE when the range straddles
 * a variable-range boundary (caller must split).
 */
static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
			 u64 start, u64 end)
{
	int i;
	u64 base, mask;
	u8 prev_match, curr_match;
	int num_var_ranges = KVM_NR_VAR_MTRR;

	if (!mtrr_state->enabled)
		return 0xFF;

	/* Make end inclusive end, instead of exclusive */
	end--;

	/* Look in fixed ranges. Just return the type as per start */
	if (mtrr_state->have_fixed && (start < 0x100000)) {
		int idx;

		/* 64K granularity below 512K, 16K up to 768K, 4K to 1M. */
		if (start < 0x80000) {
			idx = 0;
			idx += (start >> 16);
			return mtrr_state->fixed_ranges[idx];
		} else if (start < 0xC0000) {
			idx = 1 * 8;
			idx += ((start - 0x80000) >> 14);
			return mtrr_state->fixed_ranges[idx];
		} else if (start < 0x1000000) {
			idx = 3 * 8;
			idx += ((start - 0xC0000) >> 12);
			return mtrr_state->fixed_ranges[idx];
		}
	}

	/*
	 * Look in variable ranges
	 * Look of multiple ranges matching this address and pick type
	 * as per MTRR precedence
	 */
	if (!(mtrr_state->enabled & 2))
		return mtrr_state->def_type;

	prev_match = 0xFF;
	for (i = 0; i < num_var_ranges; ++i) {
		unsigned short start_state, end_state;

		/* Bit 11 of mask_lo is the range-valid bit. */
		if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
			continue;

		base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
		       (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
		mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
		       (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);

		/* Range partially covered: caller must split it. */
		start_state = ((start & mask) == (base & mask));
		end_state = ((end & mask) == (base & mask));
		if (start_state != end_state)
			return 0xFE;

		if ((start & mask) != (base & mask))
			continue;

		curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
		if (prev_match == 0xFF) {
			prev_match = curr_match;
			continue;
		}

		/* Precedence: UC beats everything ... */
		if (prev_match == MTRR_TYPE_UNCACHABLE ||
		    curr_match == MTRR_TYPE_UNCACHABLE)
			return MTRR_TYPE_UNCACHABLE;

		/* ... and WT beats WB when both match. */
		if ((prev_match == MTRR_TYPE_WRBACK &&
		     curr_match == MTRR_TYPE_WRTHROUGH) ||
		    (prev_match == MTRR_TYPE_WRTHROUGH &&
		     curr_match == MTRR_TYPE_WRBACK)) {
			prev_match = MTRR_TYPE_WRTHROUGH;
			curr_match = MTRR_TYPE_WRTHROUGH;
		}

		if (prev_match != curr_match)
			return MTRR_TYPE_UNCACHABLE;
	}

	if (prev_match != 0xFF)
		return prev_match;

	return mtrr_state->def_type;
}
1849
1850 u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1851 {
1852         u8 mtrr;
1853
1854         mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
1855                              (gfn << PAGE_SHIFT) + PAGE_SIZE);
1856         if (mtrr == 0xfe || mtrr == 0xff)
1857                 mtrr = MTRR_TYPE_WRBACK;
1858         return mtrr;
1859 }
1860 EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1861
/*
 * Mark @sp as unsynchronized and propagate the unsync state up through
 * its parent shadow pages.
 */
static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	trace_kvm_mmu_unsync_page(sp);
	++vcpu->kvm->stat.mmu_unsync;
	sp->unsync = 1;

	kvm_mmu_mark_parents_unsync(sp);
}
1870
/*
 * Unsync every valid indirect shadow page mapping @gfn that is not
 * already unsync.  Only last-level pages are expected here (warned on
 * otherwise); higher-level pages must be write-protected instead.
 */
static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
{
	struct kvm_mmu_page *s;
	struct hlist_node *node;

	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
		if (s->unsync)
			continue;
		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
		__kvm_unsync_page(vcpu, s);
	}
}
1883
/*
 * Decide whether @gfn, about to be mapped writable, must be kept
 * write-protected.  Returns 1 if write protection is required (the gfn
 * is shadowed and unsyncing is not possible/allowed); returns 0
 * otherwise, after unsyncing any shadow pages that need it.
 */
static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
				  bool can_unsync)
{
	struct kvm_mmu_page *s;
	struct hlist_node *node;
	bool need_unsync = false;

	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
		if (!can_unsync)
			return 1;

		/* Only last-level pages can be unsynced. */
		if (s->role.level != PT_PAGE_TABLE_LEVEL)
			return 1;

		if (!need_unsync && !s->unsync) {
			/* oos_shadow module param disables out-of-sync pages. */
			if (!oos_shadow)
				return 1;
			need_unsync = true;
		}
	}
	if (need_unsync)
		kvm_unsync_pages(vcpu, gfn);
	return 0;
}
1908
/*
 * Construct and install a shadow pte at *sptep for @gfn -> @pfn.
 *
 * Returns 1 when the caller should flush the TLB / emulate: either a
 * large mapping was refused because the range contains a
 * write-protected page, or the gfn had to stay write-protected.
 * Returns 0 on success.
 */
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
		    unsigned pte_access, int user_fault,
		    int write_fault, int level,
		    gfn_t gfn, pfn_t pfn, bool speculative,
		    bool can_unsync, bool host_writable)
{
	u64 spte, entry = *sptep;
	int ret = 0;

	/*
	 * We don't set the accessed bit, since we sometimes want to see
	 * whether the guest actually used the pte (in order to detect
	 * demand paging).
	 */
	spte = PT_PRESENT_MASK;
	if (!speculative)
		spte |= shadow_accessed_mask;

	if (pte_access & ACC_EXEC_MASK)
		spte |= shadow_x_mask;
	else
		spte |= shadow_nx_mask;
	if (pte_access & ACC_USER_MASK)
		spte |= shadow_user_mask;
	if (level > PT_PAGE_TABLE_LEVEL)
		spte |= PT_PAGE_SIZE_MASK;
	/* With tdp, vendor code supplies the memory-type bits. */
	if (tdp_enabled)
		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
			kvm_is_mmio_pfn(pfn));

	if (host_writable)
		spte |= SPTE_HOST_WRITEABLE;
	else
		pte_access &= ~ACC_WRITE_MASK;

	spte |= (u64)pfn << PAGE_SHIFT;

	if ((pte_access & ACC_WRITE_MASK)
	    || (!vcpu->arch.mmu.direct_map && write_fault
		&& !is_write_protection(vcpu) && !user_fault)) {

		/*
		 * Refuse a writable large mapping over a range that
		 * contains a write-protected (shadowed) page.
		 */
		if (level > PT_PAGE_TABLE_LEVEL &&
		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
			ret = 1;
			drop_spte(vcpu->kvm, sptep);
			goto done;
		}

		spte |= PT_WRITABLE_MASK;

		if (!vcpu->arch.mmu.direct_map
		    && !(pte_access & ACC_WRITE_MASK)) {
			spte &= ~PT_USER_MASK;
			/*
			 * If we converted a user page to a kernel page,
			 * so that the kernel can write to it when cr0.wp=0,
			 * then we should prevent the kernel from executing it
			 * if SMEP is enabled.
			 */
			if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
				spte |= PT64_NX_MASK;
		}

		/*
		 * Optimization: for pte sync, if spte was writable the hash
		 * lookup is unnecessary (and expensive). Write protection
		 * is responsibility of mmu_get_page / kvm_sync_page.
		 * Same reasoning can be applied to dirty page accounting.
		 */
		if (!can_unsync && is_writable_pte(*sptep))
			goto set_pte;

		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
			pgprintk("%s: found shadow page for %llx, marking ro\n",
				 __func__, gfn);
			ret = 1;
			pte_access &= ~ACC_WRITE_MASK;
			if (is_writable_pte(spte))
				spte &= ~PT_WRITABLE_MASK;
		}
	}

	if (pte_access & ACC_WRITE_MASK)
		mark_page_dirty(vcpu->kvm, gfn);

set_pte:
	update_spte(sptep, spte);
	/*
	 * If we overwrite a writable spte with a read-only one we
	 * should flush remote TLBs. Otherwise rmap_write_protect
	 * will find a read-only spte, even though the writable spte
	 * might be cached on a CPU's TLB.
	 */
	if (is_writable_pte(entry) && !is_writable_pte(*sptep))
		kvm_flush_remote_tlbs(vcpu->kvm);
done:
	return ret;
}
2007
/*
 * Install a spte via set_spte() and take care of all the bookkeeping:
 * dropping an old mapping when the pfn or page size changed, rmap
 * maintenance, lpage statistics and releasing the pfn reference.
 * *emulate (may be NULL when write_fault is 0) is set to 1 when a
 * write fault could not be fixed and must be emulated.
 */
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
			 unsigned pt_access, unsigned pte_access,
			 int user_fault, int write_fault,
			 int *emulate, int level, gfn_t gfn,
			 pfn_t pfn, bool speculative,
			 bool host_writable)
{
	int was_rmapped = 0;
	int rmap_count;

	pgprintk("%s: spte %llx access %x write_fault %d"
		 " user_fault %d gfn %llx\n",
		 __func__, *sptep, pt_access,
		 write_fault, user_fault, gfn);

	if (is_rmap_spte(*sptep)) {
		/*
		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
		 * the parent of the now unreachable PTE.
		 */
		if (level > PT_PAGE_TABLE_LEVEL &&
		    !is_large_pte(*sptep)) {
			struct kvm_mmu_page *child;
			u64 pte = *sptep;

			child = page_header(pte & PT64_BASE_ADDR_MASK);
			drop_parent_pte(child, sptep);
			kvm_flush_remote_tlbs(vcpu->kvm);
		} else if (pfn != spte_to_pfn(*sptep)) {
			/* Same slot but a different pfn: drop the old mapping. */
			pgprintk("hfn old %llx new %llx\n",
				 spte_to_pfn(*sptep), pfn);
			drop_spte(vcpu->kvm, sptep);
			kvm_flush_remote_tlbs(vcpu->kvm);
		} else
			was_rmapped = 1;
	}

	if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
		      level, gfn, pfn, speculative, true,
		      host_writable)) {
		if (write_fault)
			*emulate = 1;
		kvm_mmu_flush_tlb(vcpu);
	}

	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
		 is_large_pte(*sptep)? "2MB" : "4kB",
		 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
		 *sptep, sptep);
	if (!was_rmapped && is_large_pte(*sptep))
		++vcpu->kvm->stat.lpages;

	if (is_shadow_present_pte(*sptep)) {
		page_header_update_slot(vcpu->kvm, sptep, gfn);
		if (!was_rmapped) {
			rmap_count = rmap_add(vcpu, sptep, gfn);
			if (rmap_count > RMAP_RECYCLE_THRESHOLD)
				rmap_recycle(vcpu, sptep, gfn);
		}
	}
	/* The pfn reference taken by the fault path is dropped here. */
	kvm_release_pfn_clean(pfn);
	if (speculative) {
		vcpu->arch.last_pte_updated = sptep;
		vcpu->arch.last_pte_gfn = gfn;
	}
}
2075
/*
 * new_cr3 callback for the nonpaging mode: with paging disabled there
 * is no guest page-table state tied to cr3, so nothing to do.
 */
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
2079
/*
 * Translate @gfn to a host pfn for pte prefetch without sleeping.
 * When the gfn has no usable memslot, return the special fault page
 * (with an extra reference taken) instead.
 */
static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
				     bool no_dirty_log)
{
	struct kvm_memory_slot *slot;
	unsigned long hva;

	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
	if (!slot) {
		get_page(fault_page);
		return page_to_pfn(fault_page);
	}

	hva = gfn_to_hva_memslot(slot, gfn);

	/* Atomic variant: prefetch runs under mmu_lock and must not sleep. */
	return hva_to_pfn_atomic(vcpu->kvm, hva);
}
2096
/*
 * Speculatively map the consecutive sptes in [start, end) to the gfns
 * they cover.  Returns 0 on success, -1 when the gfns have no suitable
 * memslot or the pages could not be fetched atomically.
 */
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp,
				    u64 *start, u64 *end)
{
	struct page *pages[PTE_PREFETCH_NUM];
	unsigned access = sp->role.access;
	int i, ret;
	gfn_t gfn;

	gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
	if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
		return -1;

	ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
	if (ret <= 0)
		return -1;

	/* speculative=true, no write fault: *emulate is never written. */
	for (i = 0; i < ret; i++, gfn++, start++)
		mmu_set_spte(vcpu, start, ACC_ALL,
			     access, 0, 0, NULL,
			     sp->role.level, gfn,
			     page_to_pfn(pages[i]), true, true);

	return 0;
}
2122
/*
 * Prefetch the sptes surrounding @sptep within its PTE_PREFETCH_NUM
 * aligned window: consecutive runs of not-present sptes (excluding
 * @sptep itself) are handed to direct_pte_prefetch_many() in batches.
 */
static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp, u64 *sptep)
{
	u64 *spte, *start = NULL;
	int i;

	WARN_ON(!sp->role.direct);

	/* Align down to the start of the prefetch window. */
	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
	spte = sp->spt + i;

	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
		if (is_shadow_present_pte(*spte) || spte == sptep) {
			if (!start)
				continue;
			if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
				break;
			start = NULL;
		} else if (!start)
			start = spte;
	}
}
2145
/*
 * Opportunistically prefetch neighbouring mappings after installing a
 * last-level spte on a direct-mapped shadow page.
 */
static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
{
	struct kvm_mmu_page *sp;

	/*
	 * Since it's no accessed bit on EPT, it's no way to
	 * distinguish between actually accessed translations
	 * and prefetched, so disable pte prefetch if EPT is
	 * enabled.
	 */
	if (!shadow_accessed_mask)
		return;

	sp = page_header(__pa(sptep));
	/* Only prefetch on last-level pages. */
	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
		return;

	__direct_pte_prefetch(vcpu, sp, sptep);
}
2165
/*
 * Walk the shadow page table for gpa @v and install a mapping of @pfn
 * at @level, allocating intermediate shadow pages as needed.  Returns
 * the emulate flag produced by mmu_set_spte() (0 or 1), or -ENOMEM if
 * a shadow page could not be allocated.
 */
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
			int map_writable, int level, gfn_t gfn, pfn_t pfn,
			bool prefault)
{
	struct kvm_shadow_walk_iterator iterator;
	struct kvm_mmu_page *sp;
	int emulate = 0;
	gfn_t pseudo_gfn;

	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
		if (iterator.level == level) {
			/* Reached the target level: install the final spte. */
			unsigned pte_access = ACC_ALL;

			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
				     0, write, &emulate,
				     level, gfn, pfn, prefault, map_writable);
			direct_pte_prefetch(vcpu, iterator.sptep);
			++vcpu->stat.pf_fixed;
			break;
		}

		if (!is_shadow_present_pte(*iterator.sptep)) {
			/* Missing intermediate level: allocate a shadow page. */
			u64 base_addr = iterator.addr;

			base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
			pseudo_gfn = base_addr >> PAGE_SHIFT;
			sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
					      iterator.level - 1,
					      1, ACC_ALL, iterator.sptep);
			if (!sp) {
				pgprintk("nonpaging_map: ENOMEM\n");
				kvm_release_pfn_clean(pfn);
				return -ENOMEM;
			}

			__set_spte(iterator.sptep,
				   __pa(sp->spt)
				   | PT_PRESENT_MASK | PT_WRITABLE_MASK
				   | shadow_user_mask | shadow_x_mask
				   | shadow_accessed_mask);
		}
	}
	return emulate;
}
2210
2211 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2212 {
2213         siginfo_t info;
2214
2215         info.si_signo   = SIGBUS;
2216         info.si_errno   = 0;
2217         info.si_code    = BUS_MCEERR_AR;
2218         info.si_addr    = (void __user *)address;
2219         info.si_addr_lsb = PAGE_SHIFT;
2220
2221         send_sig_info(SIGBUS, &info, tsk);
2222 }
2223
/*
 * Handle an error pfn returned by the gfn->pfn translation.  Releases
 * the pfn reference and returns 0 after signalling a hwpoisoned page,
 * -EFAULT for a hard fault, or 1 after caching the access as mmio.
 */
static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gva_t gva,
			       unsigned access, gfn_t gfn, pfn_t pfn)
{
	kvm_release_pfn_clean(pfn);
	if (is_hwpoison_pfn(pfn)) {
		kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
		return 0;
	} else if (is_fault_pfn(pfn))
		return -EFAULT;

	/* Neither poisoned nor a fault: treat as mmio. */
	vcpu_cache_mmio_info(vcpu, gva, gfn, access);
	return 1;
}
2237
/*
 * If the faulting pfn is part of a transparent hugepage and a 2M
 * mapping is permitted, adjust *gfnp/*pfnp/*levelp in place so the
 * whole huge page is mapped at PT_DIRECTORY_LEVEL.
 */
static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
					gfn_t *gfnp, pfn_t *pfnp, int *levelp)
{
	pfn_t pfn = *pfnp;
	gfn_t gfn = *gfnp;
	int level = *levelp;

	/*
	 * Check if it's a transparent hugepage. If this would be an
	 * hugetlbfs page, level wouldn't be set to
	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
	 * here.
	 */
	if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
	    level == PT_PAGE_TABLE_LEVEL &&
	    PageTransCompound(pfn_to_page(pfn)) &&
	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
		unsigned long mask;
		/*
		 * mmu_notifier_retry was successful and we hold the
		 * mmu_lock here, so the pmd can't become splitting
		 * from under us, and in turn
		 * __split_huge_page_refcount() can't run from under
		 * us and we can safely transfer the refcount from
		 * PG_tail to PG_head as we switch the pfn to tail to
		 * head.
		 */
		*levelp = level = PT_DIRECTORY_LEVEL;
		mask = KVM_PAGES_PER_HPAGE(level) - 1;
		VM_BUG_ON((gfn & mask) != (pfn & mask));
		if (pfn & mask) {
			/* Round both gfn and pfn down to the huge page head. */
			gfn &= ~mask;
			*gfnp = gfn;
			kvm_release_pfn_clean(pfn);
			pfn &= ~mask;
			if (!get_page_unless_zero(pfn_to_page(pfn)))
				BUG();
			*pfnp = pfn;
		}
	}
}
2279
2280 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2281                          gva_t gva, pfn_t *pfn, bool write, bool *writable);
2282
/*
 * Resolve a fault at @v in nonpaging mode: choose the mapping level,
 * translate gfn->pfn (possibly going async), then install the mapping
 * under mmu_lock.  The fault is retried (return 0) when an mmu
 * notifier invalidation raced with the translation.
 */
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
			 bool prefault)
{
	int r;
	int level;
	int force_pt_level;
	pfn_t pfn;
	unsigned long mmu_seq;
	bool map_writable;

	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
	if (likely(!force_pt_level)) {
		level = mapping_level(vcpu, gfn);
		/*
		 * This path builds a PAE pagetable - so we can map
		 * 2mb pages at maximum. Therefore check if the level
		 * is larger than that.
		 */
		if (level > PT_DIRECTORY_LEVEL)
			level = PT_DIRECTORY_LEVEL;

		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
	} else
		level = PT_PAGE_TABLE_LEVEL;

	/* Snapshot the notifier sequence before the pfn lookup. */
	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();

	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
		return 0;

	/* mmio */
	if (is_error_pfn(pfn))
		return kvm_handle_bad_page(vcpu, v, ACC_ALL, gfn, pfn);

	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu, mmu_seq))
		goto out_unlock;
	kvm_mmu_free_some_pages(vcpu);
	if (likely(!force_pt_level))
		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
			 prefault);
	spin_unlock(&vcpu->kvm->mmu_lock);


	return r;

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return 0;
}
2336
2337
/*
 * Drop the vcpu's references on its shadow root pages and mark
 * root_hpa invalid.  Roots whose refcount hits zero and whose role is
 * invalid are zapped.
 */
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
	int i;
	struct kvm_mmu_page *sp;
	LIST_HEAD(invalid_list);

	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
		return;
	spin_lock(&vcpu->kvm->mmu_lock);
	/* Single 64-bit root (long mode shadow or direct map). */
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
	    (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
	     vcpu->arch.mmu.direct_map)) {
		hpa_t root = vcpu->arch.mmu.root_hpa;

		sp = page_header(root);
		--sp->root_count;
		if (!sp->root_count && sp->role.invalid) {
			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
			kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
		}
		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
		spin_unlock(&vcpu->kvm->mmu_lock);
		return;
	}
	/* Otherwise four PAE roots. */
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		if (root) {
			root &= PT64_BASE_ADDR_MASK;
			sp = page_header(root);
			--sp->root_count;
			if (!sp->root_count && sp->role.invalid)
				kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
							 &invalid_list);
		}
		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
	}
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
	spin_unlock(&vcpu->kvm->mmu_lock);
	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}
2379
2380 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2381 {
2382         int ret = 0;
2383
2384         if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
2385                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2386                 ret = 1;
2387         }
2388
2389         return ret;
2390 }
2391
/*
 * Allocate shadow roots for a direct-map mmu: one PT64 root page, or
 * four PAE directory roots when the shadow level is PT32E.  Always
 * returns 0.
 */
static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;
	unsigned i;

	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		spin_lock(&vcpu->kvm->mmu_lock);
		kvm_mmu_free_some_pages(vcpu);
		sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
				      1, ACC_ALL, NULL);
		++sp->root_count;
		spin_unlock(&vcpu->kvm->mmu_lock);
		vcpu->arch.mmu.root_hpa = __pa(sp->spt);
	} else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
		for (i = 0; i < 4; ++i) {
			hpa_t root = vcpu->arch.mmu.pae_root[i];

			ASSERT(!VALID_PAGE(root));
			spin_lock(&vcpu->kvm->mmu_lock);
			kvm_mmu_free_some_pages(vcpu);
			/* Each PAE root covers a 1GB slice (i << 30). */
			sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
					      i << 30,
					      PT32_ROOT_LEVEL, 1, ACC_ALL,
					      NULL);
			root = __pa(sp->spt);
			++sp->root_count;
			spin_unlock(&vcpu->kvm->mmu_lock);
			vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
		}
		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
	} else
		BUG();

	return 0;
}
2427
/*
 * Allocate shadow roots that track the guest's own page-table roots.
 * Returns 0 on success, 1 when a root gfn is invalid (triple fault
 * already requested) or an allocation failed.
 */
static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;
	u64 pdptr, pm_mask;
	gfn_t root_gfn;
	int i;

	root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;

	if (mmu_check_root(vcpu, root_gfn))
		return 1;

	/*
	 * Do we shadow a long mode page table? If so we need to
	 * write-protect the guests page table root.
	 */
	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;

		ASSERT(!VALID_PAGE(root));

		spin_lock(&vcpu->kvm->mmu_lock);
		kvm_mmu_free_some_pages(vcpu);
		sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
				      0, ACC_ALL, NULL);
		root = __pa(sp->spt);
		++sp->root_count;
		spin_unlock(&vcpu->kvm->mmu_lock);
		vcpu->arch.mmu.root_hpa = root;
		return 0;
	}

	/*
	 * We shadow a 32 bit page table. This may be a legacy 2-level
	 * or a PAE 3-level page table. In either case we need to be aware that
	 * the shadow page table may be a PAE or a long mode page table.
	 */
	pm_mask = PT_PRESENT_MASK;
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;

	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		ASSERT(!VALID_PAGE(root));
		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
			/* PAE guest: each root tracks one guest pdpte. */
			pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
			if (!is_present_gpte(pdptr)) {
				vcpu->arch.mmu.pae_root[i] = 0;
				continue;
			}
			root_gfn = pdptr >> PAGE_SHIFT;
			if (mmu_check_root(vcpu, root_gfn))
				return 1;
		}
		spin_lock(&vcpu->kvm->mmu_lock);
		kvm_mmu_free_some_pages(vcpu);
		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
				      PT32_ROOT_LEVEL, 0,
				      ACC_ALL, NULL);
		root = __pa(sp->spt);
		++sp->root_count;
		spin_unlock(&vcpu->kvm->mmu_lock);

		vcpu->arch.mmu.pae_root[i] = root | pm_mask;
	}
	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);

	/*
	 * If we shadow a 32 bit page table with a long mode page
	 * table we enter this path.
	 */
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		if (vcpu->arch.mmu.lm_root == NULL) {
			/*
			 * The additional page necessary for this is only
			 * allocated on demand.
			 */

			u64 *lm_root;

			lm_root = (void*)get_zeroed_page(GFP_KERNEL);
			if (lm_root == NULL)
				return 1;

			lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;

			vcpu->arch.mmu.lm_root = lm_root;
		}

		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
	}

	return 0;
}
2523
2524 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2525 {
2526         if (vcpu->arch.mmu.direct_map)
2527                 return mmu_alloc_direct_roots(vcpu);
2528         else
2529                 return mmu_alloc_shadow_roots(vcpu);
2530 }
2531
/*
 * Resynchronize all unsync children of the active shadow roots with
 * the guest page tables.  A no-op for direct-map mmus, which have no
 * guest state to sync.  Caller holds mmu_lock.
 */
static void mmu_sync_roots(struct kvm_vcpu *vcpu)
{
	int i;
	struct kvm_mmu_page *sp;

	if (vcpu->arch.mmu.direct_map)
		return;

	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
		return;

	/* Cached mmio info may be stale after a sync; drop it all. */
	vcpu_clear_mmio_info(vcpu, ~0ul);
	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;
		sp = page_header(root);
		mmu_sync_children(vcpu, sp);
		trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
		return;
	}
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		if (root && VALID_PAGE(root)) {
			root &= PT64_BASE_ADDR_MASK;
			sp = page_header(root);
			mmu_sync_children(vcpu, sp);
		}
	}
	trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
}
2563
/* mmu_lock-taking wrapper around mmu_sync_roots(). */
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{
	spin_lock(&vcpu->kvm->mmu_lock);
	mmu_sync_roots(vcpu);
	spin_unlock(&vcpu->kvm->mmu_lock);
}
2570
2571 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2572                                   u32 access, struct x86_exception *exception)
2573 {
2574         if (exception)
2575                 exception->error_code = 0;
2576         return vaddr;
2577 }
2578
2579 static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2580                                          u32 access,
2581                                          struct x86_exception *exception)
2582 {
2583         if (exception)
2584                 exception->error_code = 0;
2585         return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2586 }
2587
/*
 * Page fault handler for a guest with paging disabled: gva == gpa, so the
 * faulting frame can be mapped directly.
 */
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
                                u32 error_code, bool prefault)
{
        gfn_t gfn;
        int r;

        pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
        /* Refill per-vcpu caches before any shadow page may be allocated. */
        r = mmu_topup_memory_caches(vcpu);
        if (r)
                return r;

        ASSERT(vcpu);
        ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));

        gfn = gva >> PAGE_SHIFT;

        return nonpaging_map(vcpu, gva & PAGE_MASK,
                             error_code & PFERR_WRITE_MASK, gfn, prefault);
}
2607
/*
 * Queue an async page fault for @gfn.  The token packs a per-vcpu
 * sequence id (upper bits) with the vcpu id so completions can be
 * matched back to this request.
 */
static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
{
        struct kvm_arch_async_pf arch;

        arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
        arch.gfn = gfn;
        arch.direct_map = vcpu->arch.mmu.direct_map;
        arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);

        return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
}
2619
2620 static bool can_do_async_pf(struct kvm_vcpu *vcpu)
2621 {
2622         if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
2623                      kvm_event_needs_reinjection(vcpu)))
2624                 return false;
2625
2626         return kvm_x86_ops->interrupt_allowed(vcpu);
2627 }
2628
/*
 * Get the pfn for @gfn, preferring the non-blocking path.  Returns true
 * if the fault was turned into an async page fault (or a halt request)
 * and the caller should bail out; returns false with *pfn valid when the
 * page is available (possibly after a blocking gup as last resort).
 */
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                         gva_t gva, pfn_t *pfn, bool write, bool *writable)
{
        bool async;

        *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);

        if (!async)
                return false; /* *pfn has correct page already */

        /* The async path re-resolves the page; drop the stale reference. */
        put_page(pfn_to_page(*pfn));

        if (!prefault && can_do_async_pf(vcpu)) {
                trace_kvm_try_async_get_page(gva, gfn);
                if (kvm_find_async_pf_gfn(vcpu, gfn)) {
                        /* Second fault on a gfn already in flight: halt instead. */
                        trace_kvm_async_pf_doublefault(gva, gfn);
                        kvm_make_request(KVM_REQ_APF_HALT, vcpu);
                        return true;
                } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
                        return true;
        }

        /* Async not possible; fall back to a synchronous (blocking) gup. */
        *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);

        return false;
}
2655
/*
 * Page fault handler with two-dimensional paging (EPT/NPT) enabled: the
 * faulting gpa is mapped directly, no guest page table walk is needed.
 * Returns < 0 on error, 0 when the fault was fixed or deferred to an
 * async page fault, > 0 when the access must be emulated.
 */
static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
                          bool prefault)
{
        pfn_t pfn;
        int r;
        int level;
        int force_pt_level;
        gfn_t gfn = gpa >> PAGE_SHIFT;
        unsigned long mmu_seq;
        int write = error_code & PFERR_WRITE_MASK;
        bool map_writable;

        ASSERT(vcpu);
        ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));

        r = mmu_topup_memory_caches(vcpu);
        if (r)
                return r;

        /* Use a large mapping unless dirty logging forces 4k pages. */
        force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
        if (likely(!force_pt_level)) {
                level = mapping_level(vcpu, gfn);
                gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
        } else
                level = PT_PAGE_TABLE_LEVEL;

        /*
         * Snapshot the notifier sequence before resolving the pfn; paired
         * with mmu_notifier_retry() below to detect invalidations that
         * raced with the (lock-less) gup.
         */
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();

        if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
                return 0;

        /* mmio */
        if (is_error_pfn(pfn))
                return kvm_handle_bad_page(vcpu, 0, 0, gfn, pfn);
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
        if (likely(!force_pt_level))
                transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
        r = __direct_map(vcpu, gpa, write, map_writable,
                         level, gfn, pfn, prefault);
        spin_unlock(&vcpu->kvm->mmu_lock);

        return r;

out_unlock:
        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
        return 0;
}
2708
/* mmu.free callback for the nonpaging context: just drop the roots. */
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
        mmu_free_roots(vcpu);
}
2713
/*
 * Initialize @context for a guest running without paging: direct map,
 * no guest levels (root_level = 0), shadowed with a PAE root.
 */
static int nonpaging_init_context(struct kvm_vcpu *vcpu,
                                  struct kvm_mmu *context)
{
        context->new_cr3 = nonpaging_new_cr3;
        context->page_fault = nonpaging_page_fault;
        context->gva_to_gpa = nonpaging_gva_to_gpa;
        context->free = nonpaging_free;
        context->sync_page = nonpaging_sync_page;
        context->invlpg = nonpaging_invlpg;
        context->update_pte = nonpaging_update_pte;
        context->root_level = 0;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
        context->root_hpa = INVALID_PAGE;
        context->direct_map = true;
        context->nx = false;
        return 0;
}
2731
/* Request a TLB flush on this vcpu before its next guest entry. */
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
        ++vcpu->stat.tlb_flush;
        kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
2737
/* cr3 switch with shadow paging: drop the old shadow roots. */
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
        pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
        mmu_free_roots(vcpu);
}
2743
/* get_cr3 callback: the guest's cr3 as tracked by KVM. */
static unsigned long get_cr3(struct kvm_vcpu *vcpu)
{
        return kvm_read_cr3(vcpu);
}
2748
/* Forward a fault to whichever injector the active mmu installed. */
static void inject_page_fault(struct kvm_vcpu *vcpu,
                              struct x86_exception *fault)
{
        vcpu->arch.mmu.inject_page_fault(vcpu, fault);
}
2754
/* mmu.free callback for paging contexts; same work as nonpaging. */
static void paging_free(struct kvm_vcpu *vcpu)
{
        nonpaging_free(vcpu);
}
2759
2760 static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2761 {
2762         int bit7;
2763
2764         bit7 = (gpte >> 7) & 1;
2765         return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2766 }
2767
2768 #define PTTYPE 64
2769 #include "paging_tmpl.h"
2770 #undef PTTYPE
2771
2772 #define PTTYPE 32
2773 #include "paging_tmpl.h"
2774 #undef PTTYPE
2775
/*
 * Build the per-level reserved-bit masks used to detect guest ptes with
 * reserved bits set.  rsvd_bits_mask[0][l] applies to non-large entries
 * at level l+1, rsvd_bits_mask[1][l] to large-page entries (bit 7 set).
 * When NX is disabled, bit 63 is reserved everywhere (exb_bit_rsvd).
 */
static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
                                  struct kvm_mmu *context,
                                  int level)
{
        int maxphyaddr = cpuid_maxphyaddr(vcpu);
        u64 exb_bit_rsvd = 0;

        if (!context->nx)
                exb_bit_rsvd = rsvd_bits(63, 63);
        switch (level) {
        case PT32_ROOT_LEVEL:
                /* no rsvd bits for 2 level 4K page table entries */
                context->rsvd_bits_mask[0][1] = 0;
                context->rsvd_bits_mask[0][0] = 0;
                context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];

                if (!is_pse(vcpu)) {
                        context->rsvd_bits_mask[1][1] = 0;
                        break;
                }

                if (is_cpuid_PSE36())
                        /* 36bits PSE 4MB page */
                        context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
                else
                        /* 32 bits PSE 4MB page */
                        context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
                break;
        case PT32E_ROOT_LEVEL:
                context->rsvd_bits_mask[0][2] =
                        rsvd_bits(maxphyaddr, 63) |
                        rsvd_bits(7, 8) | rsvd_bits(1, 2);      /* PDPTE */
                context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
                        rsvd_bits(maxphyaddr, 62);      /* PDE */
                context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
                        rsvd_bits(maxphyaddr, 62);      /* PTE */
                context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
                        rsvd_bits(maxphyaddr, 62) |
                        rsvd_bits(13, 20);              /* large page */
                context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
                break;
        case PT64_ROOT_LEVEL:
                context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
                        rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
                context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
                        rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
                context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
                        rsvd_bits(maxphyaddr, 51);
                context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
                        rsvd_bits(maxphyaddr, 51);
                context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
                context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
                        rsvd_bits(maxphyaddr, 51) |
                        rsvd_bits(13, 29);
                context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
                        rsvd_bits(maxphyaddr, 51) |
                        rsvd_bits(13, 20);              /* large page */
                context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
                break;
        }
}
2837
/*
 * Shared initialization for the 64-bit-format shadow contexts; @level is
 * PT64_ROOT_LEVEL for long mode or PT32E_ROOT_LEVEL for PAE.
 */
static int paging64_init_context_common(struct kvm_vcpu *vcpu,
                                        struct kvm_mmu *context,
                                        int level)
{
        context->nx = is_nx(vcpu);

        reset_rsvds_bits_mask(vcpu, context, level);

        ASSERT(is_pae(vcpu));
        context->new_cr3 = paging_new_cr3;
        context->page_fault = paging64_page_fault;
        context->gva_to_gpa = paging64_gva_to_gpa;
        context->sync_page = paging64_sync_page;
        context->invlpg = paging64_invlpg;
        context->update_pte = paging64_update_pte;
        context->free = paging_free;
        context->root_level = level;
        context->shadow_root_level = level;
        context->root_hpa = INVALID_PAGE;
        context->direct_map = false;
        return 0;
}
2860
/* Long-mode (4-level) shadow context. */
static int paging64_init_context(struct kvm_vcpu *vcpu,
                                 struct kvm_mmu *context)
{
        return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
}
2866
/*
 * 32-bit (2-level, non-PAE) guest paging: shadowed with PAE-format
 * tables, so the shadow root level is PT32E while the guest walks
 * PT32-format tables.  NX does not exist in this format.
 */
static int paging32_init_context(struct kvm_vcpu *vcpu,
                                 struct kvm_mmu *context)
{
        context->nx = false;

        reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);

        context->new_cr3 = paging_new_cr3;
        context->page_fault = paging32_page_fault;
        context->gva_to_gpa = paging32_gva_to_gpa;
        context->free = paging_free;
        context->sync_page = paging32_sync_page;
        context->invlpg = paging32_invlpg;
        context->update_pte = paging32_update_pte;
        context->root_level = PT32_ROOT_LEVEL;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
        context->root_hpa = INVALID_PAGE;
        context->direct_map = false;
        return 0;
}
2887
/* PAE (3-level) shadow context; shares the 64-bit pte format. */
static int paging32E_init_context(struct kvm_vcpu *vcpu,
                                  struct kvm_mmu *context)
{
        return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
}
2893
2894 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2895 {
2896         struct kvm_mmu *context = vcpu->arch.walk_mmu;
2897
2898         context->base_role.word = 0;
2899         context->new_cr3 = nonpaging_new_cr3;
2900         context->page_fault = tdp_page_fault;
2901         context->free = nonpaging_free;
2902         context->sync_page = nonpaging_sync_page;
2903         context->invlpg = nonpaging_invlpg;
2904         context->update_pte = nonpaging_update_pte;
2905         context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2906         context->root_hpa = INVALID_PAGE;
2907         context->direct_map = true;
2908         context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
2909         context->get_cr3 = get_cr3;
2910         context->inject_page_fault = kvm_inject_page_fault;
2911         context->nx = is_nx(vcpu);
2912
2913         if (!is_paging(vcpu)) {
2914                 context->nx = false;
2915                 context->gva_to_gpa = nonpaging_gva_to_gpa;
2916                 context->root_level = 0;
2917         } else if (is_long_mode(vcpu)) {
2918                 context->nx = is_nx(vcpu);
2919                 reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
2920                 context->gva_to_gpa = paging64_gva_to_gpa;
2921                 context->root_level = PT64_ROOT_LEVEL;
2922         } else if (is_pae(vcpu)) {
2923                 context->nx = is_nx(vcpu);
2924                 reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
2925                 context->gva_to_gpa = paging64_gva_to_gpa;
2926                 context->root_level = PT32E_ROOT_LEVEL;
2927         } else {
2928                 context->nx = false;
2929                 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2930                 context->gva_to_gpa = paging32_gva_to_gpa;
2931                 context->root_level = PT32_ROOT_LEVEL;
2932         }
2933
2934         return 0;
2935 }
2936
/*
 * Initialize @context for shadow paging, choosing the implementation
 * matching the guest's current paging mode, and derive the base page
 * role bits (cr4_pae, cr0_wp, smep_andnot_wp) used to tag shadow pages.
 */
int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
{
        int r;
        bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));

        if (!is_paging(vcpu))
                r = nonpaging_init_context(vcpu, context);
        else if (is_long_mode(vcpu))
                r = paging64_init_context(vcpu, context);
        else if (is_pae(vcpu))
                r = paging32E_init_context(vcpu, context);
        else
                r = paging32_init_context(vcpu, context);

        vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
        vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
        vcpu->arch.mmu.base_role.smep_andnot_wp
                = smep && !is_write_protection(vcpu);

        return r;
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
2961
/*
 * Software (shadow) mmu setup: init the shadow context, then install the
 * host-side cr3/fault plumbing on the walk mmu.
 */
static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
        int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);

        vcpu->arch.walk_mmu->set_cr3           = kvm_x86_ops->set_cr3;
        vcpu->arch.walk_mmu->get_cr3           = get_cr3;
        vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;

        return r;
}
2972
/*
 * Set up the L2 guest walker used when running a nested guest: it only
 * needs gva_to_gpa plus reserved-bit checking for the L2 paging mode.
 */
static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;

        g_context->get_cr3           = get_cr3;
        g_context->inject_page_fault = kvm_inject_page_fault;

        /*
         * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
         * translation of l2_gpa to l1_gpa addresses is done using the
         * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
         * functions between mmu and nested_mmu are swapped.
         */
        if (!is_paging(vcpu)) {
                g_context->nx = false;
                g_context->root_level = 0;
                g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
        } else if (is_long_mode(vcpu)) {
                g_context->nx = is_nx(vcpu);
                reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
                g_context->root_level = PT64_ROOT_LEVEL;
                g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
        } else if (is_pae(vcpu)) {
                g_context->nx = is_nx(vcpu);
                reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
                g_context->root_level = PT32E_ROOT_LEVEL;
                g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
        } else {
                g_context->nx = false;
                reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
                g_context->root_level = PT32_ROOT_LEVEL;
                g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
        }

        return 0;
}
3009
/* Pick the mmu implementation: nested walker, TDP, or shadow paging. */
static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
        if (mmu_is_nested(vcpu))
                return init_kvm_nested_mmu(vcpu);
        else if (tdp_enabled)
                return init_kvm_tdp_mmu(vcpu);
        else
                return init_kvm_softmmu(vcpu);
}
3019
/* Tear down the current mmu context, dropping its roots if any. */
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);
        if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
                /* mmu.free() should set root_hpa = INVALID_PAGE */
                vcpu->arch.mmu.free(vcpu);
}
3027
/* Rebuild the mmu context, e.g. after a guest paging-mode change. */
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
        destroy_kvm_mmu(vcpu);
        return init_kvm_mmu(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
3034
/*
 * Allocate and sync the shadow roots and point the hardware at them.
 * Returns 0 on success or a negative error from cache topup / root
 * allocation.
 */
int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
        int r;

        r = mmu_topup_memory_caches(vcpu);
        if (r)
                goto out;
        r = mmu_alloc_roots(vcpu);
        spin_lock(&vcpu->kvm->mmu_lock);
        mmu_sync_roots(vcpu);
        spin_unlock(&vcpu->kvm->mmu_lock);
        if (r)
                goto out;
        /* set_cr3() should ensure TLB has been flushed */
        vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
out:
        return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_load);
3054
/* Drop the shadow roots; the context itself stays initialized. */
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
        mmu_free_roots(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_mmu_unload);
3060
/*
 * Install a guest pte update into the corresponding spte.  Only leaf
 * (level 1) tables are updated in place; writes to higher-level tables
 * just leave the spte zapped (counted as mmu_pde_zapped).
 */
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
                                  struct kvm_mmu_page *sp, u64 *spte,
                                  const void *new)
{
        if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
                ++vcpu->kvm->stat.mmu_pde_zapped;
                return;
        }

        ++vcpu->kvm->stat.mmu_pte_updated;
        vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
}
3073
3074 static bool need_remote_flush(u64 old, u64 new)
3075 {
3076         if (!is_shadow_present_pte(old))
3077                 return false;
3078         if (!is_shadow_present_pte(new))
3079                 return true;
3080         if ((old ^ new) & PT64_BASE_ADDR_MASK)
3081                 return true;
3082         old ^= PT64_NX_MASK;
3083         new ^= PT64_NX_MASK;
3084         return (old & ~new & PT64_PERM_MASK) != 0;
3085 }
3086
3087 static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
3088                                     bool remote_flush, bool local_flush)
3089 {
3090         if (zap_page)
3091                 return;
3092
3093         if (remote_flush)
3094                 kvm_flush_remote_tlbs(vcpu->kvm);
3095         else if (local_flush)
3096                 kvm_mmu_flush_tlb(vcpu);
3097 }
3098
3099 static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
3100 {
3101         u64 *spte = vcpu->arch.last_pte_updated;
3102
3103         return !!(spte && (*spte & shadow_accessed_mask));
3104 }
3105
/*
 * Mark the last-updated spte accessed when the guest writes the gfn it
 * belongs to, so the flood detector does not mistake a live page table
 * for a recycled page.  Only applies when the hardware provides an
 * accessed bit (shadow_accessed_mask != 0).
 */
static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
{
        u64 *spte = vcpu->arch.last_pte_updated;

        if (spte
            && vcpu->arch.last_pte_gfn == gfn
            && shadow_accessed_mask
            && !(*spte & shadow_accessed_mask)
            && is_shadow_present_pte(*spte))
                set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
}
3117
/*
 * Intercept a guest write to a write-protected (shadowed) page table.
 * Updates or zaps the affected sptes so the shadow stays coherent with
 * the guest table, detects write flooding (which indicates the page is
 * no longer used as a page table), and flushes TLBs as needed.
 * @new may be NULL, in which case the written gpte is read back from
 * guest memory.
 */
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                       const u8 *new, int bytes,
                       bool guest_initiated)
{
        gfn_t gfn = gpa >> PAGE_SHIFT;
        union kvm_mmu_page_role mask = { .word = 0 };
        struct kvm_mmu_page *sp;
        struct hlist_node *node;
        LIST_HEAD(invalid_list);
        u64 entry, gentry, *spte;
        unsigned pte_size, page_offset, misaligned, quadrant, offset;
        int level, npte, invlpg_counter, r, flooded = 0;
        bool remote_flush, local_flush, zap_page;

        /*
         * If we don't have indirect shadow pages, it means no page is
         * write-protected, so we can exit simply.
         */
        if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
                return;

        zap_page = remote_flush = local_flush = false;
        offset = offset_in_page(gpa);

        pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);

        /* Snapshot to detect an invlpg racing with this update (see below). */
        invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);

        /*
         * Assume that the pte write on a page table of the same type
         * as the current vcpu paging mode since we update the sptes only
         * when they have the same mode.
         */
        if ((is_pae(vcpu) && bytes == 4) || !new) {
                /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
                if (is_pae(vcpu)) {
                        gpa &= ~(gpa_t)7;
                        bytes = 8;
                }
                r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
                if (r)
                        gentry = 0;
                new = (const u8 *)&gentry;
        }

        switch (bytes) {
        case 4:
                gentry = *(const u32 *)new;
                break;
        case 8:
                gentry = *(const u64 *)new;
                break;
        default:
                gentry = 0;
                break;
        }

        spin_lock(&vcpu->kvm->mmu_lock);
        /* An intervening invlpg invalidated the gpte we read; don't install it. */
        if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
                gentry = 0;
        kvm_mmu_free_some_pages(vcpu);
        ++vcpu->kvm->stat.mmu_pte_write;
        trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
        if (guest_initiated) {
                kvm_mmu_access_page(vcpu, gfn);
                /* Flood detection: repeated unaccessed writes to the same gfn. */
                if (gfn == vcpu->arch.last_pt_write_gfn
                    && !last_updated_pte_accessed(vcpu)) {
                        ++vcpu->arch.last_pt_write_count;
                        if (vcpu->arch.last_pt_write_count >= 3)
                                flooded = 1;
                } else {
                        vcpu->arch.last_pt_write_gfn = gfn;
                        vcpu->arch.last_pt_write_count = 1;
                        vcpu->arch.last_pte_updated = NULL;
                }
        }

        /* Only update sptes whose role matches the vcpu in these bits. */
        mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
        for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
                pte_size = sp->role.cr4_pae ? 8 : 4;
                misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
                misaligned |= bytes < 4;
                if (misaligned || flooded) {
                        /*
                         * Misaligned accesses are too much trouble to fix
                         * up; also, they usually indicate a page is not used
                         * as a page table.
                         *
                         * If we're seeing too many writes to a page,
                         * it may no longer be a page table, or we may be
                         * forking, in which case it is better to unmap the
                         * page.
                         */
                        pgprintk("misaligned: gpa %llx bytes %d role %x\n",
                                 gpa, bytes, sp->role.word);
                        zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
                                                     &invalid_list);
                        ++vcpu->kvm->stat.mmu_flooded;
                        continue;
                }
                page_offset = offset;
                level = sp->role.level;
                npte = 1;
                if (!sp->role.cr4_pae) {
                        page_offset <<= 1;      /* 32->64 */
                        /*
                         * A 32-bit pde maps 4MB while the shadow pdes map
                         * only 2MB.  So we need to double the offset again
                         * and zap two pdes instead of one.
                         */
                        if (level == PT32_ROOT_LEVEL) {
                                page_offset &= ~7; /* kill rounding error */
                                page_offset <<= 1;
                                npte = 2;
                        }
                        quadrant = page_offset >> PAGE_SHIFT;
                        page_offset &= ~PAGE_MASK;
                        if (quadrant != sp->role.quadrant)
                                continue;
                }
                local_flush = true;
                spte = &sp->spt[page_offset / sizeof(*spte)];
                while (npte--) {
                        entry = *spte;
                        mmu_page_zap_pte(vcpu->kvm, sp, spte);
                        if (gentry &&
                              !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
                              & mask.word))
                                mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
                        if (!remote_flush && need_remote_flush(entry, *spte))
                                remote_flush = true;
                        ++spte;
                }
        }
        mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
        kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
        trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
        spin_unlock(&vcpu->kvm->mmu_lock);
}
3257
/*
 * Remove write protection from the shadow page backing @gva's page
 * table, e.g. so an emulated write can retry.  Nothing to do for a
 * direct-map mmu, which write-protects no pages.
 */
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
        gpa_t gpa;
        int r;

        if (vcpu->arch.mmu.direct_map)
                return 0;

        gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);

        spin_lock(&vcpu->kvm->mmu_lock);
        r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
        spin_unlock(&vcpu->kvm->mmu_lock);
        return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
3274
/*
 * Reclaim shadow pages (oldest first, from the tail of the active list)
 * until at least KVM_REFILL_PAGES are available.  Caller holds mmu_lock.
 */
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
        LIST_HEAD(invalid_list);

        while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
               !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
                struct kvm_mmu_page *sp;

                sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
                                  struct kvm_mmu_page, link);
                kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
                ++vcpu->kvm->stat.mmu_recycled;
        }
        kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}
3290
/*
 * Common page fault entry point.  First let the active mmu try to fix
 * the fault; if it reports the access cannot be mapped (r > 0), emulate
 * the faulting instruction instead.
 * Returns 1 when the guest can continue running, 0 when userspace must
 * handle the exit, < 0 on error.
 */
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
                       void *insn, int insn_len)
{
        int r;
        enum emulation_result er;

        r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
        if (r < 0)
                goto out;

        /* r == 0: the fault was fixed by mapping the page; resume the guest. */
        if (!r) {
                r = 1;
                goto out;
        }

        r = mmu_topup_memory_caches(vcpu);
        if (r)
                goto out;

        er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);

        switch (er) {
        case EMULATE_DONE:
                return 1;
        case EMULATE_DO_MMIO:
                ++vcpu->stat.mmio_exits;
                /* fall through */
        case EMULATE_FAIL:
                return 0;
        default:
                BUG();
        }
out:
        return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
3327
/* Handle a guest invlpg: invalidate the shadow entry, then flush. */
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
        vcpu->arch.mmu.invlpg(vcpu, gva);
        kvm_mmu_flush_tlb(vcpu);
        ++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
3335
/* Called by vendor modules (VMX/SVM) when EPT/NPT is available. */
void kvm_enable_tdp(void)
{
        tdp_enabled = true;
}
EXPORT_SYMBOL_GPL(kvm_enable_tdp);
3341
/* Called by vendor modules to force shadow paging. */
void kvm_disable_tdp(void)
{
        tdp_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_tdp);
3347
3348 static void free_mmu_pages(struct kvm_vcpu *vcpu)
3349 {
3350         free_page((unsigned long)vcpu->arch.mmu.pae_root);
3351         if (vcpu->arch.mmu.lm_root != NULL)
3352                 free_page((unsigned long)vcpu->arch.mmu.lm_root);
3353 }
3354
/*
 * Allocate the PAE root table for this vcpu and mark all four entries
 * invalid.  Returns 0 on success, -ENOMEM on allocation failure.
 */
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
        struct page *page;
        int i;

        ASSERT(vcpu);

        /*
         * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
         * Therefore we need to allocate shadow page tables in the first
         * 4GB of memory, which happens to fit the DMA32 zone.
         */
        page = alloc_page(GFP_KERNEL | __GFP_DMA32);
        if (!page)
                return -ENOMEM;

        vcpu->arch.mmu.pae_root = page_address(page);
        for (i = 0; i < 4; ++i)
                vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;

        return 0;
}
3377
/* Per-vcpu mmu allocation, done at vcpu creation time. */
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));

        return alloc_mmu_pages(vcpu);
}
3385
/* Per-vcpu mmu context initialization; no roots may exist yet. */
int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));

        return init_kvm_mmu(vcpu);
}
3393
/*
 * Write-protect every leaf spte belonging to memory slot @slot, e.g.
 * when dirty logging is enabled.  Writable large sptes are dropped
 * entirely so subsequent faults re-map them at 4k granularity.
 */
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
        struct kvm_mmu_page *sp;

        list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
                int i;
                u64 *pt;

                /* Skip shadow pages that map no gfn from this slot. */
                if (!test_bit(slot, sp->slot_bitmap))
                        continue;

                pt = sp->spt;
                for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
                        if (!is_shadow_present_pte(pt[i]) ||
                              !is_last_spte(pt[i], sp->role.level))
                                continue;

                        if (is_large_pte(pt[i])) {
                                drop_spte(kvm, &pt[i]);
                                --kvm->stat.lpages;
                                continue;
                        }

                        /* avoid RMW */
                        if (is_writable_pte(pt[i]))
                                update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
                }
        }
        kvm_flush_remote_tlbs(kvm);
}
3424
/*
 * Zap every shadow page in the VM.  Restart the walk whenever a zap
 * unlinks additional pages from the active list.
 */
void kvm_mmu_zap_all(struct kvm *kvm)
{
        struct kvm_mmu_page *sp, *node;
        LIST_HEAD(invalid_list);

        spin_lock(&kvm->mmu_lock);
restart:
        list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
                if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
                        goto restart;

        kvm_mmu_commit_zap_page(kvm, &invalid_list);
        spin_unlock(&kvm->mmu_lock);
}
3439
3440 static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3441                                                struct list_head *invalid_list)
3442 {
3443         struct kvm_mmu_page *page;
3444
3445         page = container_of(kvm->arch.active_mmu_pages.prev,
3446                             struct kvm_mmu_page, link);
3447         return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
3448 }
3449
3450 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
3451 {
3452         struct kvm *kvm;
3453         struct kvm *kvm_freed = NULL;
3454         int nr_to_scan = sc->nr_to_scan;
3455
3456         if (nr_to_scan == 0)
3457                 goto out;
3458
3459         raw_spin_lock(&kvm_lock);
3460
3461         list_for_each_entry(kvm, &vm_list, vm_list) {
3462                 int idx, freed_pages;
3463                 LIST_HEAD(invalid_list);
3464
3465                 idx = srcu_read_lock(&kvm->srcu);
3466                 spin_lock(&kvm->mmu_lock);
3467                 if (!kvm_freed && nr_to_scan > 0 &&
3468                     kvm->arch.n_used_mmu_pages > 0) {
3469                         freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3470                                                           &invalid_list);
3471                         kvm_freed = kvm;
3472                 }
3473                 nr_to_scan--;
3474
3475                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
3476                 spin_unlock(&kvm->mmu_lock);
3477                 srcu_read_unlock(&kvm->srcu, idx);
3478         }
3479         if (kvm_freed)
3480                 list_move_tail(&kvm_freed->vm_list, &vm_list);
3481
3482         raw_spin_unlock(&kvm_lock);
3483
3484 out:
3485         return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
3486 }
3487
/*
 * seeks is 10x DEFAULT_SEEKS: tell the MM that shadow pages are costly
 * to recreate, so it should prefer reclaiming other caches first.
 */
static struct shrinker mmu_shrinker = {
	.shrink = mmu_shrink,
	.seeks = DEFAULT_SEEKS * 10,
};
3492
3493 static void mmu_destroy_caches(void)
3494 {
3495         if (pte_list_desc_cache)
3496                 kmem_cache_destroy(pte_list_desc_cache);
3497         if (mmu_page_header_cache)
3498                 kmem_cache_destroy(mmu_page_header_cache);
3499 }
3500
/*
 * Module-load initialization: create the slab caches for rmap
 * descriptors and shadow-page headers, set up the global used-pages
 * counter, and register the memory shrinker.  On any failure all
 * partially-created caches are torn down and -ENOMEM is returned.
 */
int kvm_mmu_module_init(void)
{
	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
					    sizeof(struct pte_list_desc),
					    0, 0, NULL);
	if (!pte_list_desc_cache)
		goto nomem;

	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
						  sizeof(struct kvm_mmu_page),
						  0, 0, NULL);
	if (!mmu_page_header_cache)
		goto nomem;

	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
		goto nomem;

	register_shrinker(&mmu_shrinker);

	return 0;

nomem:
	/* mmu_destroy_caches() copes with caches that were never created. */
	mmu_destroy_caches();
	return -ENOMEM;
}
3526
3527 /*
3528  * Caculate mmu pages needed for kvm.
3529  */
3530 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3531 {
3532         int i;
3533         unsigned int nr_mmu_pages;
3534         unsigned int  nr_pages = 0;
3535         struct kvm_memslots *slots;
3536
3537         slots = kvm_memslots(kvm);
3538
3539         for (i = 0; i < slots->nmemslots; i++)
3540                 nr_pages += slots->memslots[i].npages;
3541
3542         nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
3543         nr_mmu_pages = max(nr_mmu_pages,
3544                         (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
3545
3546         return nr_mmu_pages;
3547 }
3548
3549 static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3550                                 unsigned len)
3551 {
3552         if (len > buffer->len)
3553                 return NULL;
3554         return buffer->ptr;
3555 }
3556
3557 static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3558                                 unsigned len)
3559 {
3560         void *ret;
3561
3562         ret = pv_mmu_peek_buffer(buffer, len);
3563         if (!ret)
3564                 return ret;
3565         buffer->ptr += len;
3566         buffer->len -= len;
3567         buffer->processed += len;
3568         return ret;
3569 }
3570
3571 static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3572                              gpa_t addr, gpa_t value)
3573 {
3574         int bytes = 8;
3575         int r;
3576
3577         if (!is_long_mode(vcpu) && !is_pae(vcpu))
3578                 bytes = 4;
3579
3580         r = mmu_topup_memory_caches(vcpu);
3581         if (r)
3582                 return r;
3583
3584         if (!emulator_write_phys(vcpu, addr, &value, bytes))
3585                 return -EFAULT;
3586
3587         return 1;
3588 }
3589
/*
 * PV "flush tlb" op: reload CR3 with its current value to flush the
 * guest TLB.  The (void) cast documents that kvm_set_cr3()'s return is
 * intentionally ignored here.  Always returns 1 to the op dispatcher.
 */
static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
	(void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
	return 1;
}
3595
/*
 * PV "release page table" op: unshadow the gfn at guest-physical @addr
 * under mmu_lock.  Always returns 1 to the op dispatcher.
 */
static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
{
	spin_lock(&vcpu->kvm->mmu_lock);
	mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
	spin_unlock(&vcpu->kvm->mmu_lock);
	return 1;
}
3603
/*
 * Decode and execute one PV mmu op from @buffer.  The header is only
 * peeked; the full op record is consumed on a recognized opcode.
 * Returns 0 when processing should stop (truncated buffer or unknown
 * op), otherwise the handler's result (1 on success, negative on error).
 */
static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
                             struct kvm_pv_mmu_op_buffer *buffer)
{
	struct kvm_mmu_op_header *header;

	header = pv_mmu_peek_buffer(buffer, sizeof *header);
	if (!header)
		return 0;
	switch (header->op) {
	case KVM_MMU_OP_WRITE_PTE: {
		struct kvm_mmu_op_write_pte *wpte;

		wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
		if (!wpte)
			return 0;
		return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
					wpte->pte_val);
	}
	case KVM_MMU_OP_FLUSH_TLB: {
		struct kvm_mmu_op_flush_tlb *ftlb;

		ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
		if (!ftlb)
			return 0;
		return kvm_pv_mmu_flush_tlb(vcpu);
	}
	case KVM_MMU_OP_RELEASE_PT: {
		struct kvm_mmu_op_release_pt *rpt;

		rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
		if (!rpt)
			return 0;
		return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
	}
	default: return 0;
	}
}
3641
/*
 * Top-level PV mmu hypercall: copy up to sizeof(buffer->buf) bytes of op
 * records from guest memory at @addr and execute them one by one.
 * *ret is always set to the number of bytes actually processed.  Returns
 * 1 on success (including a clean early stop), or a negative error from
 * kvm_read_guest()/an op handler.
 */
int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
		  gpa_t addr, unsigned long *ret)
{
	int r;
	struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;

	buffer->ptr = buffer->buf;
	buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
	buffer->processed = 0;

	r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
	if (r)
		goto out;

	while (buffer->len) {
		r = kvm_pv_mmu_op_one(vcpu, buffer);
		if (r < 0)
			goto out;
		if (r == 0)
			/* Truncated or unrecognized op: stop cleanly. */
			break;
	}

	r = 1;
out:
	*ret = buffer->processed;
	return r;
}
3669
/*
 * Walk the shadow page-table hierarchy for @addr and record the spte at
 * each level into sptes[level-1].  The walk stops after recording the
 * first non-present spte.  Returns the number of sptes captured.
 */
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
{
	struct kvm_shadow_walk_iterator iterator;
	int nr_sptes = 0;

	spin_lock(&vcpu->kvm->mmu_lock);
	for_each_shadow_entry(vcpu, addr, iterator) {
		sptes[iterator.level-1] = *iterator.sptep;
		nr_sptes++;
		if (!is_shadow_present_pte(*iterator.sptep))
			break;
	}
	spin_unlock(&vcpu->kvm->mmu_lock);

	return nr_sptes;
}
EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3687
/*
 * Per-vcpu mmu teardown, the inverse of kvm_mmu_create()/kvm_mmu_setup().
 * The mmu context is destroyed before its backing pages and memory
 * caches are released; the ordering presumably matters -- do not reorder
 * without checking destroy_kvm_mmu().
 */
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);

	destroy_kvm_mmu(vcpu);
	free_mmu_pages(vcpu);
	mmu_free_memory_caches(vcpu);
}
3696
/*
 * mmu_audit.c is compiled directly into this translation unit when
 * auditing is configured; otherwise provide a no-op mmu_audit_disable()
 * so kvm_mmu_module_exit() can call it unconditionally.
 */
#ifdef CONFIG_KVM_MMU_AUDIT
#include "mmu_audit.c"
#else
static void mmu_audit_disable(void) { }
#endif
3702
3703 void kvm_mmu_module_exit(void)
3704 {
3705         mmu_destroy_caches();
3706         percpu_counter_destroy(&kvm_total_used_mmu_pages);
3707         unregister_shrinker(&mmu_shrinker);
3708         mmu_audit_disable();
3709 }