mm: vmscan: fix force-scanning small targets without swap
[linux-2.6.git] / mm / mmap.c
index 50a4aa0..a65efd4 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/perf_event.h>
 #include <linux/audit.h>
+#include <linux/khugepaged.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -83,10 +84,14 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags)
 }
 EXPORT_SYMBOL(vm_get_page_prot);
 
-int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */
-int sysctl_overcommit_ratio = 50;      /* default is 50% */
+int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;  /* heuristic overcommit */
+int sysctl_overcommit_ratio __read_mostly = 50;        /* default is 50% */
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-struct percpu_counter vm_committed_as;
+/*
+ * Make sure vm_committed_as is in its own cacheline and does not share it
+ * with other variables. It can be updated frequently by several CPUs.
+ */
+struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
 
 /*
  * Check that a process has enough memory to allocate a new virtual
@@ -117,9 +122,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
                return 0;
 
        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
-               unsigned long n;
+               free = global_page_state(NR_FREE_PAGES);
+               free += global_page_state(NR_FILE_PAGES);
+
+               /*
+                * shmem pages shouldn't be counted as free in this
+                * case: they can't be purged, only swapped out, and
+                * that won't affect the overall amount of available
+                * memory in the system.
+                */
+               free -= global_page_state(NR_SHMEM);
 
-               free = global_page_state(NR_FILE_PAGES);
                free += nr_swap_pages;
 
                /*
@@ -131,34 +144,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
                free += global_page_state(NR_SLAB_RECLAIMABLE);
 
                /*
-                * Leave the last 3% for root
-                */
-               if (!cap_sys_admin)
-                       free -= free / 32;
-
-               if (free > pages)
-                       return 0;
-
-               /*
-                * nr_free_pages() is very expensive on large systems,
-                * only call if we're about to fail.
-                */
-               n = nr_free_pages();
-
-               /*
                 * Leave reserved pages. The pages are not for anonymous pages.
                 */
-               if (n <= totalreserve_pages)
+               if (free <= totalreserve_pages)
                        goto error;
                else
-                       n -= totalreserve_pages;
+                       free -= totalreserve_pages;
 
                /*
                 * Leave the last 3% for root
                 */
                if (!cap_sys_admin)
-                       n -= n / 32;
-               free += n;
+                       free -= free / 32;
 
                if (free > pages)
                        return 0;
@@ -189,7 +186,7 @@ error:
 }
 
 /*
- * Requires inode->i_mapping->i_mmap_lock
+ * Requires inode->i_mapping->i_mmap_mutex
  */
 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
                struct file *file, struct address_space *mapping)
@@ -217,9 +214,9 @@ void unlink_file_vma(struct vm_area_struct *vma)
 
        if (file) {
                struct address_space *mapping = file->f_mapping;
-               spin_lock(&mapping->i_mmap_lock);
+               mutex_lock(&mapping->i_mmap_mutex);
                __remove_shared_vm_struct(vma, file, mapping);
-               spin_unlock(&mapping->i_mmap_lock);
+               mutex_unlock(&mapping->i_mmap_mutex);
        }
 }
 
@@ -253,7 +250,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
        down_write(&mm->mmap_sem);
 
 #ifdef CONFIG_COMPAT_BRK
-       min_brk = mm->end_code;
+       /*
+        * CONFIG_COMPAT_BRK can still be overridden by setting
+        * randomize_va_space to 2, which will still cause mm->start_brk
+        * to be arbitrarily shifted
+        */
+       if (current->brk_randomized)
+               min_brk = mm->start_brk;
+       else
+               min_brk = mm->end_data;
 #else
        min_brk = mm->start_brk;
 #endif
@@ -385,29 +390,6 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
        return vma;
 }
 
-static inline void
-__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
-               struct vm_area_struct *prev, struct rb_node *rb_parent)
-{
-       struct vm_area_struct *next;
-
-       vma->vm_prev = prev;
-       if (prev) {
-               next = prev->vm_next;
-               prev->vm_next = vma;
-       } else {
-               mm->mmap = vma;
-               if (rb_parent)
-                       next = rb_entry(rb_parent,
-                                       struct vm_area_struct, vm_rb);
-               else
-                       next = NULL;
-       }
-       vma->vm_next = next;
-       if (next)
-               next->vm_prev = vma;
-}
-
 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
                struct rb_node **rb_link, struct rb_node *rb_parent)
 {
@@ -455,16 +437,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
        if (vma->vm_file)
                mapping = vma->vm_file->f_mapping;
 
-       if (mapping) {
-               spin_lock(&mapping->i_mmap_lock);
-               vma->vm_truncate_count = mapping->truncate_count;
-       }
+       if (mapping)
+               mutex_lock(&mapping->i_mmap_mutex);
 
        __vma_link(mm, vma, prev, rb_link, rb_parent);
        __vma_link_file(vma);
 
        if (mapping)
-               spin_unlock(&mapping->i_mmap_lock);
+               mutex_unlock(&mapping->i_mmap_mutex);
 
        mm->map_count++;
        validate_mm(mm);
@@ -567,17 +547,8 @@ again:                     remove_next = 1 + (end > next->vm_end);
                mapping = file->f_mapping;
                if (!(vma->vm_flags & VM_NONLINEAR))
                        root = &mapping->i_mmap;
-               spin_lock(&mapping->i_mmap_lock);
-               if (importer &&
-                   vma->vm_truncate_count != next->vm_truncate_count) {
-                       /*
-                        * unmap_mapping_range might be in progress:
-                        * ensure that the expanding vma is rescanned.
-                        */
-                       importer->vm_truncate_count = 0;
-               }
+               mutex_lock(&mapping->i_mmap_mutex);
                if (insert) {
-                       insert->vm_truncate_count = vma->vm_truncate_count;
                        /*
                         * Put into prio_tree now, so instantiated pages
                         * are visible to arm/parisc __flush_dcache_page
@@ -588,13 +559,15 @@ again:                    remove_next = 1 + (end > next->vm_end);
                }
        }
 
+       vma_adjust_trans_huge(vma, start, end, adjust_next);
+
        /*
         * When changing only vma->vm_end, we don't really need anon_vma
         * lock. This is a fairly rare case by itself, but the anon_vma
         * lock may be shared between many sibling processes.  Skipping
         * the lock for brk adjustments makes a difference sometimes.
         */
-       if (vma->anon_vma && (insert || importer || start != vma->vm_start)) {
+       if (vma->anon_vma && (importer || start != vma->vm_start)) {
                anon_vma = vma->anon_vma;
                anon_vma_lock(anon_vma);
        }
@@ -641,7 +614,7 @@ again:                      remove_next = 1 + (end > next->vm_end);
        if (anon_vma)
                anon_vma_unlock(anon_vma);
        if (mapping)
-               spin_unlock(&mapping->i_mmap_lock);
+               mutex_unlock(&mapping->i_mmap_mutex);
 
        if (remove_next) {
                if (file) {
@@ -688,9 +661,17 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
 }
 
 static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
-                                       struct anon_vma *anon_vma2)
+                                       struct anon_vma *anon_vma2,
+                                       struct vm_area_struct *vma)
 {
-       return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2);
+       /*
+        * The list_is_singular() test avoids merging VMAs cloned from
+        * parents. This eases scalability problems caused by the anon_vma lock.
+        */
+       if ((!anon_vma1 || !anon_vma2) && (!vma ||
+               list_is_singular(&vma->anon_vma_chain)))
+               return 1;
+       return anon_vma1 == anon_vma2;
 }
 
 /*
@@ -709,7 +690,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
        struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
 {
        if (is_mergeable_vma(vma, file, vm_flags) &&
-           is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
+           is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
                if (vma->vm_pgoff == vm_pgoff)
                        return 1;
        }
@@ -728,7 +709,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
        struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
 {
        if (is_mergeable_vma(vma, file, vm_flags) &&
-           is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
+           is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
                pgoff_t vm_pglen;
                vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
                if (vma->vm_pgoff + vm_pglen == vm_pgoff)
@@ -806,7 +787,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                                can_vma_merge_before(next, vm_flags,
                                        anon_vma, file, pgoff+pglen) &&
                                is_mergeable_anon_vma(prev->anon_vma,
-                                                     next->anon_vma)) {
+                                                     next->anon_vma, NULL)) {
                                                        /* cases 1, 6 */
                        err = vma_adjust(prev, prev->vm_start,
                                next->vm_end, prev->vm_pgoff, NULL);
@@ -815,6 +796,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                                end, prev->vm_pgoff, NULL);
                if (err)
                        return NULL;
+               khugepaged_enter_vma_merge(prev);
                return prev;
        }
 
@@ -833,6 +815,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                                next->vm_pgoff - pglen, NULL);
                if (err)
                        return NULL;
+               khugepaged_enter_vma_merge(area);
                return area;
        }
 
@@ -915,14 +898,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
        if (anon_vma)
                return anon_vma;
 try_prev:
-       /*
-        * It is potentially slow to have to call find_vma_prev here.
-        * But it's only on the first write fault on the vma, not
-        * every time, and we could devise a way to avoid it later
-        * (e.g. stash info in next's anon_vma_node when assigning
-        * an anon_vma, or when trying vma_merge).  Another time.
-        */
-       BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
+       near = vma->vm_prev;
        if (!near)
                goto none;
 
@@ -969,7 +945,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 {
        struct mm_struct * mm = current->mm;
        struct inode *inode;
-       unsigned int vm_flags;
+       vm_flags_t vm_flags;
        int error;
        unsigned long reqprot = prot;
 
@@ -1174,7 +1150,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
  */
 int vma_wants_writenotify(struct vm_area_struct *vma)
 {
-       unsigned int vm_flags = vma->vm_flags;
+       vm_flags_t vm_flags = vma->vm_flags;
 
        /* If it was private or non-writable, the write bit is already clear */
        if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
@@ -1202,7 +1178,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
  * We account for memory if it's a private writeable mapping,
  * not hugepages and VM_NORESERVE wasn't set.
  */
-static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
+static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 {
        /*
         * hugetlb has its own accounting separate from the core VM
@@ -1216,7 +1192,7 @@ static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
 
 unsigned long mmap_region(struct file *file, unsigned long addr,
                          unsigned long len, unsigned long flags,
-                         unsigned int vm_flags, unsigned long pgoff)
+                         vm_flags_t vm_flags, unsigned long pgoff)
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
@@ -1754,13 +1730,17 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
                size = address - vma->vm_start;
                grow = (address - vma->vm_end) >> PAGE_SHIFT;
 
-               error = acct_stack_growth(vma, size, grow);
-               if (!error) {
-                       vma->vm_end = address;
-                       perf_event_mmap(vma);
+               error = -ENOMEM;
+               if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
+                       error = acct_stack_growth(vma, size, grow);
+                       if (!error) {
+                               vma->vm_end = address;
+                               perf_event_mmap(vma);
+                       }
                }
        }
        vma_unlock_anon_vma(vma);
+       khugepaged_enter_vma_merge(vma);
        return error;
 }
 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1768,7 +1748,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 /*
  * vma is the first one with address < vma->vm_start.  Have to extend vma.
  */
-static int expand_downwards(struct vm_area_struct *vma,
+int expand_downwards(struct vm_area_struct *vma,
                                   unsigned long address)
 {
        int error;
@@ -1800,22 +1780,21 @@ static int expand_downwards(struct vm_area_struct *vma,
                size = vma->vm_end - address;
                grow = (vma->vm_start - address) >> PAGE_SHIFT;
 
-               error = acct_stack_growth(vma, size, grow);
-               if (!error) {
-                       vma->vm_start = address;
-                       vma->vm_pgoff -= grow;
-                       perf_event_mmap(vma);
+               error = -ENOMEM;
+               if (grow <= vma->vm_pgoff) {
+                       error = acct_stack_growth(vma, size, grow);
+                       if (!error) {
+                               vma->vm_start = address;
+                               vma->vm_pgoff -= grow;
+                               perf_event_mmap(vma);
+                       }
                }
        }
        vma_unlock_anon_vma(vma);
+       khugepaged_enter_vma_merge(vma);
        return error;
 }
 
-int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address)
-{
-       return expand_downwards(vma, address);
-}
-
 #ifdef CONFIG_STACK_GROWSUP
 int expand_stack(struct vm_area_struct *vma, unsigned long address)
 {
@@ -1898,17 +1877,17 @@ static void unmap_region(struct mm_struct *mm,
                unsigned long start, unsigned long end)
 {
        struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
-       struct mmu_gather *tlb;
+       struct mmu_gather tlb;
        unsigned long nr_accounted = 0;
 
        lru_add_drain();
-       tlb = tlb_gather_mmu(mm, 0);
+       tlb_gather_mmu(&tlb, mm, 0);
        update_hiwater_rss(mm);
        unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
        vm_unacct_memory(nr_accounted);
-       free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
-                                next? next->vm_start: 0);
-       tlb_finish_mmu(tlb, start, end);
+       free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+                                next ? next->vm_start : 0);
+       tlb_finish_mmu(&tlb, start, end);
 }
 
 /*
@@ -2050,9 +2029,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
                return -EINVAL;
 
        /* Find the first overlapping VMA */
-       vma = find_vma_prev(mm, start, &prev);
+       vma = find_vma(mm, start);
        if (!vma)
                return 0;
+       prev = vma->vm_prev;
        /* we have  start < vma->vm_end  */
 
        /* if it doesn't overlap, we have nothing.. */
@@ -2250,7 +2230,7 @@ EXPORT_SYMBOL(do_brk);
 /* Release all mmaps. */
 void exit_mmap(struct mm_struct *mm)
 {
-       struct mmu_gather *tlb;
+       struct mmu_gather tlb;
        struct vm_area_struct *vma;
        unsigned long nr_accounted = 0;
        unsigned long end;
@@ -2275,14 +2255,14 @@ void exit_mmap(struct mm_struct *mm)
 
        lru_add_drain();
        flush_cache_mm(mm);
-       tlb = tlb_gather_mmu(mm, 1);
+       tlb_gather_mmu(&tlb, mm, 1);
        /* update_hiwater_rss(mm) here? but nobody should be looking */
        /* Use -1 here to ensure all VMAs in the mm are unmapped */
        end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
        vm_unacct_memory(nr_accounted);
 
-       free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
-       tlb_finish_mmu(tlb, 0, end);
+       free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+       tlb_finish_mmu(&tlb, 0, end);
 
        /*
         * Walk the list again, actually closing and freeing it,
@@ -2296,7 +2276,7 @@ void exit_mmap(struct mm_struct *mm)
 
 /* Insert vm structure into process list sorted by address
  * and into the inode's i_mmap tree.  If vm_file is non-NULL
- * then i_mmap_lock is taken here.
+ * then i_mmap_mutex is taken here.
  */
 int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 {
@@ -2508,15 +2488,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
                 * The LSB of head.next can't change from under us
                 * because we hold the mm_all_locks_mutex.
                 */
-               spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem);
+               mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
                /*
                 * We can safely modify head.next after taking the
-                * anon_vma->root->lock. If some other vma in this mm shares
+                * anon_vma->root->mutex. If some other vma in this mm shares
                 * the same anon_vma we won't take it again.
                 *
                 * No need of atomic instructions here, head.next
                 * can't change from under us thanks to the
-                * anon_vma->root->lock.
+                * anon_vma->root->mutex.
                 */
                if (__test_and_set_bit(0, (unsigned long *)
                                       &anon_vma->root->head.next))
@@ -2538,7 +2518,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
                 */
                if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
                        BUG();
-               spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
+               mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
        }
 }
 
@@ -2565,7 +2545,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
  * vma in this mm is backed by the same anon_vma or address_space.
  *
  * We can take all the locks in random order because the VM code
- * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
+ * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never
  * takes more than one of them in a row. Secondly we're protected
  * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
  *
@@ -2621,7 +2601,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
                 *
                 * No need of atomic instructions here, head.next
                 * can't change from under us until we release the
-                * anon_vma->root->lock.
+                * anon_vma->root->mutex.
                 */
                if (!__test_and_clear_bit(0, (unsigned long *)
                                          &anon_vma->root->head.next))
@@ -2637,7 +2617,7 @@ static void vm_unlock_mapping(struct address_space *mapping)
                 * AS_MM_ALL_LOCKS can't change to 0 from under us
                 * because we hold the mm_all_locks_mutex.
                 */
-               spin_unlock(&mapping->i_mmap_lock);
+               mutex_unlock(&mapping->i_mmap_mutex);
                if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
                                        &mapping->flags))
                        BUG();