mm: vmscan: fix force-scanning small targets without swap
[linux-2.6.git] / mm / mmap.c
index 15b1fae..a65efd4 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -122,9 +122,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
                return 0;
 
        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
-               unsigned long n;
+               free = global_page_state(NR_FREE_PAGES);
+               free += global_page_state(NR_FILE_PAGES);
+
+               /*
+                * shmem pages shouldn't be counted as free in this
+                * case, they can't be purged, only swapped out, and
+                * that won't affect the overall amount of available
+                * memory in the system.
+                */
+               free -= global_page_state(NR_SHMEM);
 
-               free = global_page_state(NR_FILE_PAGES);
                free += nr_swap_pages;
 
                /*
@@ -136,34 +144,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
                free += global_page_state(NR_SLAB_RECLAIMABLE);
 
                /*
-                * Leave the last 3% for root
-                */
-               if (!cap_sys_admin)
-                       free -= free / 32;
-
-               if (free > pages)
-                       return 0;
-
-               /*
-                * nr_free_pages() is very expensive on large systems,
-                * only call if we're about to fail.
-                */
-               n = nr_free_pages();
-
-               /*
                 * Leave reserved pages. The pages are not for anonymous pages.
                 */
-               if (n <= totalreserve_pages)
+               if (free <= totalreserve_pages)
                        goto error;
                else
-                       n -= totalreserve_pages;
+                       free -= totalreserve_pages;
 
                /*
                 * Leave the last 3% for root
                 */
                if (!cap_sys_admin)
-                       n -= n / 32;
-               free += n;
+                       free -= free / 32;
 
                if (free > pages)
                        return 0;
@@ -194,7 +186,7 @@ error:
 }
 
 /*
- * Requires inode->i_mapping->i_mmap_lock
+ * Requires inode->i_mapping->i_mmap_mutex
  */
 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
                struct file *file, struct address_space *mapping)
@@ -222,9 +214,9 @@ void unlink_file_vma(struct vm_area_struct *vma)
 
        if (file) {
                struct address_space *mapping = file->f_mapping;
-               spin_lock(&mapping->i_mmap_lock);
+               mutex_lock(&mapping->i_mmap_mutex);
                __remove_shared_vm_struct(vma, file, mapping);
-               spin_unlock(&mapping->i_mmap_lock);
+               mutex_unlock(&mapping->i_mmap_mutex);
        }
 }
 
@@ -398,29 +390,6 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
        return vma;
 }
 
-static inline void
-__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
-               struct vm_area_struct *prev, struct rb_node *rb_parent)
-{
-       struct vm_area_struct *next;
-
-       vma->vm_prev = prev;
-       if (prev) {
-               next = prev->vm_next;
-               prev->vm_next = vma;
-       } else {
-               mm->mmap = vma;
-               if (rb_parent)
-                       next = rb_entry(rb_parent,
-                                       struct vm_area_struct, vm_rb);
-               else
-                       next = NULL;
-       }
-       vma->vm_next = next;
-       if (next)
-               next->vm_prev = vma;
-}
-
 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
                struct rb_node **rb_link, struct rb_node *rb_parent)
 {
@@ -468,16 +437,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
        if (vma->vm_file)
                mapping = vma->vm_file->f_mapping;
 
-       if (mapping) {
-               spin_lock(&mapping->i_mmap_lock);
-               vma->vm_truncate_count = mapping->truncate_count;
-       }
+       if (mapping)
+               mutex_lock(&mapping->i_mmap_mutex);
 
        __vma_link(mm, vma, prev, rb_link, rb_parent);
        __vma_link_file(vma);
 
        if (mapping)
-               spin_unlock(&mapping->i_mmap_lock);
+               mutex_unlock(&mapping->i_mmap_mutex);
 
        mm->map_count++;
        validate_mm(mm);
@@ -580,17 +547,8 @@ again:                     remove_next = 1 + (end > next->vm_end);
                mapping = file->f_mapping;
                if (!(vma->vm_flags & VM_NONLINEAR))
                        root = &mapping->i_mmap;
-               spin_lock(&mapping->i_mmap_lock);
-               if (importer &&
-                   vma->vm_truncate_count != next->vm_truncate_count) {
-                       /*
-                        * unmap_mapping_range might be in progress:
-                        * ensure that the expanding vma is rescanned.
-                        */
-                       importer->vm_truncate_count = 0;
-               }
+               mutex_lock(&mapping->i_mmap_mutex);
                if (insert) {
-                       insert->vm_truncate_count = vma->vm_truncate_count;
                        /*
                         * Put into prio_tree now, so instantiated pages
                         * are visible to arm/parisc __flush_dcache_page
@@ -656,7 +614,7 @@ again:                      remove_next = 1 + (end > next->vm_end);
        if (anon_vma)
                anon_vma_unlock(anon_vma);
        if (mapping)
-               spin_unlock(&mapping->i_mmap_lock);
+               mutex_unlock(&mapping->i_mmap_mutex);
 
        if (remove_next) {
                if (file) {
@@ -703,9 +661,17 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
 }
 
 static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
-                                       struct anon_vma *anon_vma2)
+                                       struct anon_vma *anon_vma2,
+                                       struct vm_area_struct *vma)
 {
-       return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2);
+       /*
+        * The list_is_singular() test is to avoid merging VMA cloned from
+        * parents. This can improve scalability caused by anon_vma lock.
+        */
+       if ((!anon_vma1 || !anon_vma2) && (!vma ||
+               list_is_singular(&vma->anon_vma_chain)))
+               return 1;
+       return anon_vma1 == anon_vma2;
 }
 
 /*
@@ -724,7 +690,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
        struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
 {
        if (is_mergeable_vma(vma, file, vm_flags) &&
-           is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
+           is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
                if (vma->vm_pgoff == vm_pgoff)
                        return 1;
        }
@@ -743,7 +709,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
        struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
 {
        if (is_mergeable_vma(vma, file, vm_flags) &&
-           is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
+           is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
                pgoff_t vm_pglen;
                vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
                if (vma->vm_pgoff + vm_pglen == vm_pgoff)
@@ -821,7 +787,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                                can_vma_merge_before(next, vm_flags,
                                        anon_vma, file, pgoff+pglen) &&
                                is_mergeable_anon_vma(prev->anon_vma,
-                                                     next->anon_vma)) {
+                                                     next->anon_vma, NULL)) {
                                                        /* cases 1, 6 */
                        err = vma_adjust(prev, prev->vm_start,
                                next->vm_end, prev->vm_pgoff, NULL);
@@ -932,14 +898,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
        if (anon_vma)
                return anon_vma;
 try_prev:
-       /*
-        * It is potentially slow to have to call find_vma_prev here.
-        * But it's only on the first write fault on the vma, not
-        * every time, and we could devise a way to avoid it later
-        * (e.g. stash info in next's anon_vma_node when assigning
-        * an anon_vma, or when trying vma_merge).  Another time.
-        */
-       BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
+       near = vma->vm_prev;
        if (!near)
                goto none;
 
@@ -986,7 +945,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 {
        struct mm_struct * mm = current->mm;
        struct inode *inode;
-       unsigned int vm_flags;
+       vm_flags_t vm_flags;
        int error;
        unsigned long reqprot = prot;
 
@@ -1191,7 +1150,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
  */
 int vma_wants_writenotify(struct vm_area_struct *vma)
 {
-       unsigned int vm_flags = vma->vm_flags;
+       vm_flags_t vm_flags = vma->vm_flags;
 
        /* If it was private or non-writable, the write bit is already clear */
        if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
@@ -1219,7 +1178,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
  * We account for memory if it's a private writeable mapping,
  * not hugepages and VM_NORESERVE wasn't set.
  */
-static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
+static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 {
        /*
         * hugetlb has its own accounting separate from the core VM
@@ -1233,7 +1192,7 @@ static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
 
 unsigned long mmap_region(struct file *file, unsigned long addr,
                          unsigned long len, unsigned long flags,
-                         unsigned int vm_flags, unsigned long pgoff)
+                         vm_flags_t vm_flags, unsigned long pgoff)
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
@@ -1789,7 +1748,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 /*
  * vma is the first one with address < vma->vm_start.  Have to extend vma.
  */
-static int expand_downwards(struct vm_area_struct *vma,
+int expand_downwards(struct vm_area_struct *vma,
                                   unsigned long address)
 {
        int error;
@@ -1836,11 +1795,6 @@ static int expand_downwards(struct vm_area_struct *vma,
        return error;
 }
 
-int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address)
-{
-       return expand_downwards(vma, address);
-}
-
 #ifdef CONFIG_STACK_GROWSUP
 int expand_stack(struct vm_area_struct *vma, unsigned long address)
 {
@@ -1923,17 +1877,17 @@ static void unmap_region(struct mm_struct *mm,
                unsigned long start, unsigned long end)
 {
        struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
-       struct mmu_gather *tlb;
+       struct mmu_gather tlb;
        unsigned long nr_accounted = 0;
 
        lru_add_drain();
-       tlb = tlb_gather_mmu(mm, 0);
+       tlb_gather_mmu(&tlb, mm, 0);
        update_hiwater_rss(mm);
        unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
        vm_unacct_memory(nr_accounted);
-       free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
-                                next? next->vm_start: 0);
-       tlb_finish_mmu(tlb, start, end);
+       free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+                                next ? next->vm_start : 0);
+       tlb_finish_mmu(&tlb, start, end);
 }
 
 /*
@@ -2075,9 +2029,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
                return -EINVAL;
 
        /* Find the first overlapping VMA */
-       vma = find_vma_prev(mm, start, &prev);
+       vma = find_vma(mm, start);
        if (!vma)
                return 0;
+       prev = vma->vm_prev;
        /* we have  start < vma->vm_end  */
 
        /* if it doesn't overlap, we have nothing.. */
@@ -2275,7 +2230,7 @@ EXPORT_SYMBOL(do_brk);
 /* Release all mmaps. */
 void exit_mmap(struct mm_struct *mm)
 {
-       struct mmu_gather *tlb;
+       struct mmu_gather tlb;
        struct vm_area_struct *vma;
        unsigned long nr_accounted = 0;
        unsigned long end;
@@ -2300,14 +2255,14 @@ void exit_mmap(struct mm_struct *mm)
 
        lru_add_drain();
        flush_cache_mm(mm);
-       tlb = tlb_gather_mmu(mm, 1);
+       tlb_gather_mmu(&tlb, mm, 1);
        /* update_hiwater_rss(mm) here? but nobody should be looking */
        /* Use -1 here to ensure all VMAs in the mm are unmapped */
        end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
        vm_unacct_memory(nr_accounted);
 
-       free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
-       tlb_finish_mmu(tlb, 0, end);
+       free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+       tlb_finish_mmu(&tlb, 0, end);
 
        /*
         * Walk the list again, actually closing and freeing it,
@@ -2321,7 +2276,7 @@ void exit_mmap(struct mm_struct *mm)
 
 /* Insert vm structure into process list sorted by address
  * and into the inode's i_mmap tree.  If vm_file is non-NULL
- * then i_mmap_lock is taken here.
+ * then i_mmap_mutex is taken here.
  */
 int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 {
@@ -2533,15 +2488,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
                 * The LSB of head.next can't change from under us
                 * because we hold the mm_all_locks_mutex.
                 */
-               spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem);
+               mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
                /*
                 * We can safely modify head.next after taking the
-                * anon_vma->root->lock. If some other vma in this mm shares
+                * anon_vma->root->mutex. If some other vma in this mm shares
                 * the same anon_vma we won't take it again.
                 *
                 * No need of atomic instructions here, head.next
                 * can't change from under us thanks to the
-                * anon_vma->root->lock.
+                * anon_vma->root->mutex.
                 */
                if (__test_and_set_bit(0, (unsigned long *)
                                       &anon_vma->root->head.next))
@@ -2563,7 +2518,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
                 */
                if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
                        BUG();
-               spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
+               mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
        }
 }
 
@@ -2590,7 +2545,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
  * vma in this mm is backed by the same anon_vma or address_space.
  *
  * We can take all the locks in random order because the VM code
- * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
+ * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never
  * takes more than one of them in a row. Secondly we're protected
  * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
  *
@@ -2646,7 +2601,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
                 *
                 * No need of atomic instructions here, head.next
                 * can't change from under us until we release the
-                * anon_vma->root->lock.
+                * anon_vma->root->mutex.
                 */
                if (!__test_and_clear_bit(0, (unsigned long *)
                                          &anon_vma->root->head.next))
@@ -2662,7 +2617,7 @@ static void vm_unlock_mapping(struct address_space *mapping)
                 * AS_MM_ALL_LOCKS can't change to 0 from under us
                 * because we hold the mm_all_locks_mutex.
                 */
-               spin_unlock(&mapping->i_mmap_lock);
+               mutex_unlock(&mapping->i_mmap_mutex);
                if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
                                        &mapping->flags))
                        BUG();