mm: xip fix fault vs sparse page invalidate race
Nick Piggin [Wed, 20 Aug 2008 21:09:20 +0000 (14:09 -0700)]
XIP has a race between sparse pages being inserted into page tables, and
sparse pages being zapped when its time to put a non-sparse page in.

What can happen is that a process can be left with a dangling sparse page
in a MAP_SHARED mapping, while the rest of the world sees the non-sparse
version.  Ie.  data corruption.

Guard these operations with a seqlock, making fault-in-sparse-pages the
slowpath, and try-to-unmap-sparse-pages the fastpath.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Jared Hulbert <jaredeh@gmail.com>
Acked-by: Carsten Otte <cotte@freenet.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

mm/filemap_xip.c

index 8b710ca..5b9ec47 100644 (file)
@@ -15,6 +15,8 @@
 #include <linux/rmap.h>
 #include <linux/mmu_notifier.h>
 #include <linux/sched.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
  * We do use our own empty page to avoid interference with other users
  * of ZERO_PAGE(), such as /dev/zero
  */
+static DEFINE_MUTEX(xip_sparse_mutex);
+static seqcount_t xip_sparse_seq = SEQCNT_ZERO;
 static struct page *__xip_sparse_page;
 
+/* called under xip_sparse_mutex */
 static struct page *xip_sparse_page(void)
 {
        if (!__xip_sparse_page) {
                struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
 
-               if (page) {
-                       static DEFINE_SPINLOCK(xip_alloc_lock);
-                       spin_lock(&xip_alloc_lock);
-                       if (!__xip_sparse_page)
-                               __xip_sparse_page = page;
-                       else
-                               __free_page(page);
-                       spin_unlock(&xip_alloc_lock);
-               }
+               if (page)
+                       __xip_sparse_page = page;
        }
        return __xip_sparse_page;
 }
@@ -174,11 +172,16 @@ __xip_unmap (struct address_space * mapping,
        pte_t pteval;
        spinlock_t *ptl;
        struct page *page;
+       unsigned count;
+       int locked = 0;
+
+       count = read_seqcount_begin(&xip_sparse_seq);
 
        page = __xip_sparse_page;
        if (!page)
                return;
 
+retry:
        spin_lock(&mapping->i_mmap_lock);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                mm = vma->vm_mm;
@@ -198,6 +201,14 @@ __xip_unmap (struct address_space * mapping,
                }
        }
        spin_unlock(&mapping->i_mmap_lock);
+
+       if (locked) {
+               mutex_unlock(&xip_sparse_mutex);
+       } else if (read_seqcount_retry(&xip_sparse_seq, count)) {
+               mutex_lock(&xip_sparse_mutex);
+               locked = 1;
+               goto retry;
+       }
 }
 
 /*
@@ -218,7 +229,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        int error;
 
        /* XXX: are VM_FAULT_ codes OK? */
-
+again:
        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        if (vmf->pgoff >= size)
                return VM_FAULT_SIGBUS;
@@ -245,6 +256,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                __xip_unmap(mapping, vmf->pgoff);
 
 found:
+               printk("%s insert %lx@%lx\n", current->comm, (unsigned long)vmf->virtual_address, xip_pfn);
                err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
                                                        xip_pfn);
                if (err == -ENOMEM)
@@ -252,14 +264,34 @@ found:
                BUG_ON(err);
                return VM_FAULT_NOPAGE;
        } else {
+               int err, ret = VM_FAULT_OOM;
+
+               mutex_lock(&xip_sparse_mutex);
+               write_seqcount_begin(&xip_sparse_seq);
+               error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
+                                                       &xip_mem, &xip_pfn);
+               if (unlikely(!error)) {
+                       write_seqcount_end(&xip_sparse_seq);
+                       mutex_unlock(&xip_sparse_mutex);
+                       goto again;
+               }
+               if (error != -ENODATA)
+                       goto out;
                /* not shared and writable, use xip_sparse_page() */
                page = xip_sparse_page();
                if (!page)
-                       return VM_FAULT_OOM;
+                       goto out;
+               err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
+                                                       page);
+               if (err == -ENOMEM)
+                       goto out;
 
-               page_cache_get(page);
-               vmf->page = page;
-               return 0;
+               ret = VM_FAULT_NOPAGE;
+out:
+               write_seqcount_end(&xip_sparse_seq);
+               mutex_unlock(&xip_sparse_mutex);
+
+               return ret;
        }
 }