KVM: remove the usage of the mmap_sem for the protection of the memory slots.
Izik Eidus [Sun, 10 Feb 2008 16:04:15 +0000 (18:04 +0200)]
This patch replaces the mmap_sem lock for the memory slots with a new
kvm private lock. It is needed because until now there were cases where
kvm accessed user memory while holding the mmap semaphore.

Signed-off-by: Izik Eidus <izike@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

arch/x86/kvm/mmu.c
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
include/linux/kvm_host.h
virt/kvm/kvm_main.c

index 8efdcdb..2603710 100644 (file)
@@ -876,11 +876,18 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
 
 struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 {
+       struct page *page;
+
        gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
 
        if (gpa == UNMAPPED_GVA)
                return NULL;
-       return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+
+       down_read(&current->mm->mmap_sem);
+       page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+       up_read(&current->mm->mmap_sem);
+
+       return page;
 }
 
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
@@ -1020,15 +1027,18 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 
        struct page *page;
 
+       down_read(&vcpu->kvm->slots_lock);
+
        down_read(&current->mm->mmap_sem);
        page = gfn_to_page(vcpu->kvm, gfn);
+       up_read(&current->mm->mmap_sem);
 
        spin_lock(&vcpu->kvm->mmu_lock);
        kvm_mmu_free_some_pages(vcpu);
        r = __nonpaging_map(vcpu, v, write, gfn, page);
        spin_unlock(&vcpu->kvm->mmu_lock);
 
-       up_read(&current->mm->mmap_sem);
+       up_read(&vcpu->kvm->slots_lock);
 
        return r;
 }
@@ -1362,6 +1372,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        gfn_t gfn;
        int r;
        u64 gpte = 0;
+       struct page *page;
 
        if (bytes != 4 && bytes != 8)
                return;
@@ -1389,6 +1400,11 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        if (!is_present_pte(gpte))
                return;
        gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+
+       down_read(&current->mm->mmap_sem);
+       page = gfn_to_page(vcpu->kvm, gfn);
+       up_read(&current->mm->mmap_sem);
+
        vcpu->arch.update_pte.gfn = gfn;
-       vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn);
+       vcpu->arch.update_pte.page = page;
 }
@@ -1496,9 +1512,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
        gpa_t gpa;
        int r;
 
-       down_read(&current->mm->mmap_sem);
+       down_read(&vcpu->kvm->slots_lock);
        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
-       up_read(&current->mm->mmap_sem);
+       up_read(&vcpu->kvm->slots_lock);
 
        spin_lock(&vcpu->kvm->mmu_lock);
        r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
index 03ba860..2009c6e 100644 (file)
@@ -91,7 +91,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
        pt_element_t *table;
        struct page *page;
 
+       down_read(&current->mm->mmap_sem);
        page = gfn_to_page(kvm, table_gfn);
+       up_read(&current->mm->mmap_sem);
+
        table = kmap_atomic(page, KM_USER0);
 
        ret = CMPXCHG(&table[index], orig_pte, new_pte);
@@ -378,7 +381,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        if (r)
                return r;
 
-       down_read(&current->mm->mmap_sem);
+       down_read(&vcpu->kvm->slots_lock);
        /*
         * Look up the shadow pte for the faulting address.
         */
@@ -392,11 +395,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
                pgprintk("%s: guest page fault\n", __FUNCTION__);
                inject_page_fault(vcpu, addr, walker.error_code);
                vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
-               up_read(&current->mm->mmap_sem);
+               up_read(&vcpu->kvm->slots_lock);
                return 0;
        }
 
+       down_read(&current->mm->mmap_sem);
        page = gfn_to_page(vcpu->kvm, walker.gfn);
+       up_read(&current->mm->mmap_sem);
 
        spin_lock(&vcpu->kvm->mmu_lock);
        kvm_mmu_free_some_pages(vcpu);
@@ -413,14 +418,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
         */
        if (shadow_pte && is_io_pte(*shadow_pte)) {
                spin_unlock(&vcpu->kvm->mmu_lock);
-               up_read(&current->mm->mmap_sem);
+               up_read(&vcpu->kvm->slots_lock);
                return 1;
        }
 
        ++vcpu->stat.pf_fixed;
        kvm_mmu_audit(vcpu, "post page fault (fixed)");
        spin_unlock(&vcpu->kvm->mmu_lock);
-       up_read(&current->mm->mmap_sem);
+       up_read(&vcpu->kvm->slots_lock);
 
        return write_pt;
 }
index ad36447..86f5bf1 100644 (file)
@@ -1477,7 +1477,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
        struct kvm_userspace_memory_region kvm_userspace_mem;
        int r = 0;
 
-       down_write(&current->mm->mmap_sem);
+       down_write(&kvm->slots_lock);
        if (kvm->arch.apic_access_page)
                goto out;
        kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -1487,9 +1487,12 @@ static int alloc_apic_access_page(struct kvm *kvm)
        r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
        if (r)
                goto out;
+
+       down_read(&current->mm->mmap_sem);
        kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
+       up_read(&current->mm->mmap_sem);
 out:
-       up_write(&current->mm->mmap_sem);
+       up_write(&kvm->slots_lock);
        return r;
 }
 
index 338764f..6b01552 100644 (file)
@@ -184,7 +184,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
        int ret;
        u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 
-       down_read(&current->mm->mmap_sem);
+       down_read(&vcpu->kvm->slots_lock);
        ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
                                  offset * sizeof(u64), sizeof(pdpte));
        if (ret < 0) {
@@ -201,7 +201,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 
        memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 out:
-       up_read(&current->mm->mmap_sem);
+       up_read(&vcpu->kvm->slots_lock);
 
        return ret;
 }
@@ -215,13 +215,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
        if (is_long_mode(vcpu) || !is_pae(vcpu))
                return false;
 
-       down_read(&current->mm->mmap_sem);
+       down_read(&vcpu->kvm->slots_lock);
        r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
        if (r < 0)
                goto out;
        changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 out:
-       up_read(&current->mm->mmap_sem);
+       up_read(&vcpu->kvm->slots_lock);
 
        return changed;
 }
@@ -359,7 +359,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                 */
        }
 
-       down_read(&current->mm->mmap_sem);
+       down_read(&vcpu->kvm->slots_lock);
        /*
         * Does the new cr3 value map to physical memory? (Note, we
         * catch an invalid cr3 even in real-mode, because it would
@@ -375,7 +375,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                vcpu->arch.cr3 = cr3;
                vcpu->arch.mmu.new_cr3(vcpu);
        }
-       up_read(&current->mm->mmap_sem);
+       up_read(&vcpu->kvm->slots_lock);
 }
 EXPORT_SYMBOL_GPL(set_cr3);
 
@@ -1232,12 +1232,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
        if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
                return -EINVAL;
 
-       down_write(&current->mm->mmap_sem);
+       down_write(&kvm->slots_lock);
 
        kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
        kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
 
-       up_write(&current->mm->mmap_sem);
+       up_write(&kvm->slots_lock);
        return 0;
 }
 
@@ -1286,7 +1286,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
            < alias->target_phys_addr)
                goto out;
 
-       down_write(&current->mm->mmap_sem);
+       down_write(&kvm->slots_lock);
 
        p = &kvm->arch.aliases[alias->slot];
        p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1300,7 +1300,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
 
        kvm_mmu_zap_all(kvm);
 
-       up_write(&current->mm->mmap_sem);
+       up_write(&kvm->slots_lock);
 
        return 0;
 
@@ -1376,7 +1376,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
        struct kvm_memory_slot *memslot;
        int is_dirty = 0;
 
-       down_write(&current->mm->mmap_sem);
+       down_write(&kvm->slots_lock);
 
        r = kvm_get_dirty_log(kvm, log, &is_dirty);
        if (r)
@@ -1392,7 +1392,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
        }
        r = 0;
 out:
-       up_write(&current->mm->mmap_sem);
+       up_write(&kvm->slots_lock);
        return r;
 }
 
@@ -1570,7 +1570,7 @@ int emulator_read_std(unsigned long addr,
        void *data = val;
        int r = X86EMUL_CONTINUE;
 
-       down_read(&current->mm->mmap_sem);
+       down_read(&vcpu->kvm->slots_lock);
        while (bytes) {
                gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
                unsigned offset = addr & (PAGE_SIZE-1);
@@ -1592,7 +1592,7 @@ int emulator_read_std(unsigned long addr,
                addr += tocopy;
        }
 out:
-       up_read(&current->mm->mmap_sem);
+       up_read(&vcpu->kvm->slots_lock);
        return r;
 }
 EXPORT_SYMBOL_GPL(emulator_read_std);
@@ -1611,9 +1611,9 @@ static int emulator_read_emulated(unsigned long addr,
                return X86EMUL_CONTINUE;
        }
 
-       down_read(&current->mm->mmap_sem);
+       down_read(&vcpu->kvm->slots_lock);
        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
-       up_read(&current->mm->mmap_sem);
+       up_read(&vcpu->kvm->slots_lock);
 
        /* For APIC access vmexit */
        if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -1651,14 +1651,14 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 {
        int ret;
 
-       down_read(&current->mm->mmap_sem);
+       down_read(&vcpu->kvm->slots_lock);
        ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
        if (ret < 0) {
-               up_read(&current->mm->mmap_sem);
+               up_read(&vcpu->kvm->slots_lock);
                return 0;
        }
        kvm_mmu_pte_write(vcpu, gpa, val, bytes);
-       up_read(&current->mm->mmap_sem);
+       up_read(&vcpu->kvm->slots_lock);
        return 1;
 }
 
@@ -1670,9 +1670,9 @@ static int emulator_write_emulated_onepage(unsigned long addr,
        struct kvm_io_device *mmio_dev;
        gpa_t                 gpa;
 
-       down_read(&current->mm->mmap_sem);
+       down_read(&vcpu->kvm->slots_lock);
        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
-       up_read(&current->mm->mmap_sem);
+       up_read(&vcpu->kvm->slots_lock);
 
        if (gpa == UNMAPPED_GVA) {
                kvm_inject_page_fault(vcpu, addr, 2);
@@ -1749,7 +1749,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
                char *kaddr;
                u64 val;
 
-               down_read(&current->mm->mmap_sem);
+               down_read(&vcpu->kvm->slots_lock);
                gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
 
                if (gpa == UNMAPPED_GVA ||
@@ -1760,13 +1760,17 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
                        goto emul_write;
 
                val = *(u64 *)new;
+
+               down_read(&current->mm->mmap_sem);
                page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+               up_read(&current->mm->mmap_sem);
+
                kaddr = kmap_atomic(page, KM_USER0);
                set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
                kunmap_atomic(kaddr, KM_USER0);
                kvm_release_page_dirty(page);
        emul_write:
-               up_read(&current->mm->mmap_sem);
+               up_read(&vcpu->kvm->slots_lock);
        }
 #endif
 
@@ -2159,10 +2163,10 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
                kvm_x86_ops->skip_emulated_instruction(vcpu);
 
        for (i = 0; i < nr_pages; ++i) {
-               down_read(&current->mm->mmap_sem);
+               down_read(&vcpu->kvm->slots_lock);
                page = gva_to_page(vcpu, address + i * PAGE_SIZE);
                vcpu->arch.pio.guest_pages[i] = page;
-               up_read(&current->mm->mmap_sem);
+               up_read(&vcpu->kvm->slots_lock);
                if (!page) {
                        kvm_inject_gp(vcpu, 0);
                        free_pio_guest_pages(vcpu);
@@ -2485,8 +2489,9 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
 
        down_read(&current->mm->mmap_sem);
        page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
-       vcpu->arch.apic->vapic_page = page;
        up_read(&current->mm->mmap_sem);
+
+       vcpu->arch.apic->vapic_page = page;
 }
 
 static void vapic_exit(struct kvm_vcpu *vcpu)
@@ -2959,9 +2964,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
        gpa_t gpa;
 
        vcpu_load(vcpu);
-       down_read(&current->mm->mmap_sem);
+       down_read(&vcpu->kvm->slots_lock);
        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
-       up_read(&current->mm->mmap_sem);
+       up_read(&vcpu->kvm->slots_lock);
        tr->physical_address = gpa;
        tr->valid = gpa != UNMAPPED_GVA;
        tr->writeable = 1;
@@ -3234,11 +3239,13 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
         */
        if (!user_alloc) {
                if (npages && !old.rmap) {
+                       down_write(&current->mm->mmap_sem);
                        memslot->userspace_addr = do_mmap(NULL, 0,
                                                     npages * PAGE_SIZE,
                                                     PROT_READ | PROT_WRITE,
                                                     MAP_SHARED | MAP_ANONYMOUS,
                                                     0);
+                       up_write(&current->mm->mmap_sem);
 
                        if (IS_ERR((void *)memslot->userspace_addr))
                                return PTR_ERR((void *)memslot->userspace_addr);
@@ -3246,8 +3253,10 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
                        if (!old.user_alloc && old.rmap) {
                                int ret;
 
+                               down_write(&current->mm->mmap_sem);
                                ret = do_munmap(current->mm, old.userspace_addr,
                                                old.npages * PAGE_SIZE);
+                               up_write(&current->mm->mmap_sem);
                                if (ret < 0)
                                        printk(KERN_WARNING
                                       "kvm_vm_ioctl_set_memory_region: "
index ea4764b..928b0d5 100644 (file)
@@ -107,6 +107,7 @@ struct kvm_memory_slot {
 struct kvm {
        struct mutex lock; /* protects the vcpus array and APIC accesses */
        spinlock_t mmu_lock;
+       struct rw_semaphore slots_lock;
        struct mm_struct *mm; /* userspace tied to this vm */
        int nmemslots;
        struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
index 32fbf80..b2e1289 100644 (file)
@@ -169,6 +169,7 @@ static struct kvm *kvm_create_vm(void)
        kvm_io_bus_init(&kvm->pio_bus);
        mutex_init(&kvm->lock);
        kvm_io_bus_init(&kvm->mmio_bus);
+       init_rwsem(&kvm->slots_lock);
        spin_lock(&kvm_lock);
        list_add(&kvm->vm_list, &vm_list);
        spin_unlock(&kvm_lock);
@@ -339,9 +340,9 @@ int kvm_set_memory_region(struct kvm *kvm,
 {
        int r;
 
-       down_write(&current->mm->mmap_sem);
+       down_write(&kvm->slots_lock);
        r = __kvm_set_memory_region(kvm, mem, user_alloc);
-       up_write(&current->mm->mmap_sem);
+       up_write(&kvm->slots_lock);
        return r;
 }
 EXPORT_SYMBOL_GPL(kvm_set_memory_region);