mm: introduce mm_populate() for populating new vmas
Michel Lespinasse [Sat, 23 Feb 2013 00:32:37 +0000 (16:32 -0800)]
When creating new mappings using the MAP_POPULATE / MAP_LOCKED flags (or
with MCL_FUTURE in effect), we want to populate the pages within the
newly created vmas.  This may take a while as we may have to read pages
from disk, so ideally we want to do this outside of the write-locked
mmap_sem region.

This change introduces mm_populate(), which is used to defer populating
such mappings until after the mmap_sem write lock has been released.
This is implemented as a generalization of the former do_mlock_pages(),
which accomplished the same task but was used during mlock() /
mlockall().

Signed-off-by: Michel Lespinasse <walken@google.com>
Reported-by: Andy Lutomirski <luto@amacapital.net>
Acked-by: Rik van Riel <riel@redhat.com>
Tested-by: Andy Lutomirski <luto@amacapital.net>
Cc: Greg Ungerer <gregungerer@westnet.com.au>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

fs/aio.c
include/linux/mm.h
ipc/shm.c
mm/mlock.c
mm/mmap.c
mm/nommu.c
mm/util.c

index 71f613c..82eec7c 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -103,6 +103,7 @@ static int aio_setup_ring(struct kioctx *ctx)
        unsigned nr_events = ctx->max_reqs;
        unsigned long size;
        int nr_pages;
+       bool populate;
 
        /* Compensate for the ring buffer's head/tail overlap entry */
        nr_events += 2; /* 1 is required, 2 for good luck */
@@ -129,7 +130,8 @@ static int aio_setup_ring(struct kioctx *ctx)
        down_write(&ctx->mm->mmap_sem);
        info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, 
                                        PROT_READ|PROT_WRITE,
-                                       MAP_ANONYMOUS|MAP_PRIVATE, 0);
+                                       MAP_ANONYMOUS|MAP_PRIVATE, 0,
+                                       &populate);
        if (IS_ERR((void *)info->mmap_base)) {
                up_write(&ctx->mm->mmap_sem);
                info->mmap_size = 0;
@@ -147,6 +149,8 @@ static int aio_setup_ring(struct kioctx *ctx)
                aio_free_ring(ctx);
                return -EAGAIN;
        }
+       if (populate)
+               mm_populate(info->mmap_base, info->mmap_size);
 
        ctx->user_id = info->mmap_base;
 
index 9d9dcc3..da0a0fe 100644 (file)
@@ -1474,11 +1474,23 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
        unsigned long len, unsigned long flags,
        vm_flags_t vm_flags, unsigned long pgoff);
-extern unsigned long do_mmap_pgoff(struct file *, unsigned long,
-        unsigned long, unsigned long,
-        unsigned long, unsigned long);
+extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+       unsigned long len, unsigned long prot, unsigned long flags,
+       unsigned long pgoff, bool *populate);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t);
 
+#ifdef CONFIG_MMU
+extern int __mm_populate(unsigned long addr, unsigned long len,
+                        int ignore_errors);
+static inline void mm_populate(unsigned long addr, unsigned long len)
+{
+       /* Ignore errors */
+       (void) __mm_populate(addr, len, 1);
+}
+#else
+static inline void mm_populate(unsigned long addr, unsigned long len) {}
+#endif
+
 /* These take the mm semaphore themselves */
 extern unsigned long vm_brk(unsigned long, unsigned long);
 extern int vm_munmap(unsigned long, size_t);
index 4fa6d8f..9f047ba 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -967,11 +967,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
        unsigned long flags;
        unsigned long prot;
        int acc_mode;
-       unsigned long user_addr;
        struct ipc_namespace *ns;
        struct shm_file_data *sfd;
        struct path path;
        fmode_t f_mode;
+       bool populate = false;
 
        err = -EINVAL;
        if (shmid < 0)
@@ -1070,13 +1070,15 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
                        goto invalid;
        }
                
-       user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0);
-       *raddr = user_addr;
+       addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
+       *raddr = addr;
        err = 0;
-       if (IS_ERR_VALUE(user_addr))
-               err = (long)user_addr;
+       if (IS_ERR_VALUE(addr))
+               err = (long)addr;
 invalid:
        up_write(&current->mm->mmap_sem);
+       if (populate)
+               mm_populate(addr, size);
 
 out_fput:
        fput(file);
index c9bd528..a296a49 100644 (file)
@@ -416,7 +416,14 @@ static int do_mlock(unsigned long start, size_t len, int on)
        return error;
 }
 
-static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
+/*
+ * __mm_populate - populate and/or mlock pages within a range of address space.
+ *
+ * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
+ * flags. VMAs must be already marked with the desired vm_flags, and
+ * mmap_sem must not be held.
+ */
+int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
 {
        struct mm_struct *mm = current->mm;
        unsigned long end, nstart, nend;
@@ -498,7 +505,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
                error = do_mlock(start, len, 1);
        up_write(&current->mm->mmap_sem);
        if (!error)
-               error = do_mlock_pages(start, len, 0);
+               error = __mm_populate(start, len, 0);
        return error;
 }
 
@@ -564,10 +571,8 @@ SYSCALL_DEFINE1(mlockall, int, flags)
            capable(CAP_IPC_LOCK))
                ret = do_mlockall(flags);
        up_write(&current->mm->mmap_sem);
-       if (!ret && (flags & MCL_CURRENT)) {
-               /* Ignore errors */
-               do_mlock_pages(0, TASK_SIZE, 1);
-       }
+       if (!ret && (flags & MCL_CURRENT))
+               mm_populate(0, TASK_SIZE);
 out:
        return ret;
 }
index 09da0b2..9b12e30 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1154,12 +1154,15 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
 
 unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
                        unsigned long len, unsigned long prot,
-                       unsigned long flags, unsigned long pgoff)
+                       unsigned long flags, unsigned long pgoff,
+                       bool *populate)
 {
        struct mm_struct * mm = current->mm;
        struct inode *inode;
        vm_flags_t vm_flags;
 
+       *populate = false;
+
        /*
         * Does the application expect PROT_READ to imply PROT_EXEC?
         *
@@ -1280,7 +1283,12 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
                }
        }
 
-       return mmap_region(file, addr, len, flags, vm_flags, pgoff);
+       addr = mmap_region(file, addr, len, flags, vm_flags, pgoff);
+       if (!IS_ERR_VALUE(addr) &&
+           ((vm_flags & VM_LOCKED) ||
+            (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
+               *populate = true;
+       return addr;
 }
 
 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
@@ -1531,10 +1539,12 @@ out:
 
        vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
        if (vm_flags & VM_LOCKED) {
-               if (!mlock_vma_pages_range(vma, addr, addr + len))
+               if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
+                                       vma == get_gate_vma(current->mm)))
                        mm->locked_vm += (len >> PAGE_SHIFT);
-       } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
-               make_pages_present(addr, addr + len);
+               else
+                       vma->vm_flags &= ~VM_LOCKED;
+       }
 
        if (file)
                uprobe_mmap(vma);
index b20db4e..7296a5a 100644 (file)
@@ -1250,7 +1250,8 @@ unsigned long do_mmap_pgoff(struct file *file,
                            unsigned long len,
                            unsigned long prot,
                            unsigned long flags,
-                           unsigned long pgoff)
+                           unsigned long pgoff,
+                           bool *populate)
 {
        struct vm_area_struct *vma;
        struct vm_region *region;
@@ -1260,6 +1261,8 @@ unsigned long do_mmap_pgoff(struct file *file,
 
        kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
 
+       *populate = false;
+
        /* decide whether we should attempt the mapping, and if so what sort of
         * mapping */
        ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
index c55e26b..13467e0 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -355,12 +355,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 {
        unsigned long ret;
        struct mm_struct *mm = current->mm;
+       bool populate;
 
        ret = security_mmap_file(file, prot, flag);
        if (!ret) {
                down_write(&mm->mmap_sem);
-               ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
+               ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
+                                   &populate);
                up_write(&mm->mmap_sem);
+               if (!IS_ERR_VALUE(ret) && populate)
+                       mm_populate(ret, len);
        }
        return ret;
 }