mm: avoid anon_vma_chain allocation under anon_vma lock
Linus Torvalds [Sat, 18 Jun 2011 02:05:36 +0000 (19:05 -0700)]
Hugh Dickins points out that lockdep (correctly) spots a potential
deadlock on the anon_vma lock, because we now do a GFP_KERNEL allocation
of anon_vma_chain while doing anon_vma_clone().  The problem is that
page reclaim will want to take the anon_vma lock of any anonymous pages
that it will try to reclaim.

So re-organize the code in anon_vma_clone() slightly: first do just a
GFP_NOWAIT allocation, which will usually work fine.  But if that fails,
let's just drop the lock and re-do the allocation, now with GFP_KERNEL.

End result: not only do we avoid the locking problem, this also ends up
getting better concurrency in case the allocation does need to block.
Tim Chen reports that with all these anon_vma locking tweaks, we're now
almost back up to the spinlock performance.

Reported-and-tested-by: Hugh Dickins <hughd@google.com>
Tested-by: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

mm/rmap.c

index 68756a7..27dfd3b 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -112,9 +112,9 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
        kmem_cache_free(anon_vma_cachep, anon_vma);
 }
 
-static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
+static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
 {
-       return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
+       return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
 }
 
 static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
@@ -159,7 +159,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
                struct mm_struct *mm = vma->vm_mm;
                struct anon_vma *allocated;
 
-               avc = anon_vma_chain_alloc();
+               avc = anon_vma_chain_alloc(GFP_KERNEL);
                if (!avc)
                        goto out_enomem;
 
@@ -253,9 +253,14 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
        list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
                struct anon_vma *anon_vma;
 
-               avc = anon_vma_chain_alloc();
-               if (!avc)
-                       goto enomem_failure;
+               avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
+               if (unlikely(!avc)) {
+                       unlock_anon_vma_root(root);
+                       root = NULL;
+                       avc = anon_vma_chain_alloc(GFP_KERNEL);
+                       if (!avc)
+                               goto enomem_failure;
+               }
                anon_vma = pavc->anon_vma;
                root = lock_anon_vma_root(root, anon_vma);
                anon_vma_chain_link(dst, avc, anon_vma);
@@ -264,7 +269,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
        return 0;
 
  enomem_failure:
-       unlock_anon_vma_root(root);
        unlink_anon_vmas(dst);
        return -ENOMEM;
 }
@@ -294,7 +298,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
        anon_vma = anon_vma_alloc();
        if (!anon_vma)
                goto out_error;
-       avc = anon_vma_chain_alloc();
+       avc = anon_vma_chain_alloc(GFP_KERNEL);
        if (!avc)
                goto out_error_free_anon_vma;