tmpfs: convert shmem_truncate_range to radix-swap
[linux-2.6.git] mm/huge_memory.c
index f4f6041..e2d1587 100644
@@ -16,6 +16,7 @@
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
+#include <linux/mman.h>
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
@@ -243,24 +244,28 @@ static ssize_t single_flag_show(struct kobject *kobj,
                                struct kobj_attribute *attr, char *buf,
                                enum transparent_hugepage_flag flag)
 {
-       if (test_bit(flag, &transparent_hugepage_flags))
-               return sprintf(buf, "[yes] no\n");
-       else
-               return sprintf(buf, "yes [no]\n");
+       return sprintf(buf, "%d\n",
+                      !!test_bit(flag, &transparent_hugepage_flags));
 }
+
 static ssize_t single_flag_store(struct kobject *kobj,
                                 struct kobj_attribute *attr,
                                 const char *buf, size_t count,
                                 enum transparent_hugepage_flag flag)
 {
-       if (!memcmp("yes", buf,
-                   min(sizeof("yes")-1, count))) {
+       unsigned long value;
+       int ret;
+
+       ret = kstrtoul(buf, 10, &value);
+       if (ret < 0)
+               return ret;
+       if (value > 1)
+               return -EINVAL;
+
+       if (value)
                set_bit(flag, &transparent_hugepage_flags);
-       } else if (!memcmp("no", buf,
-                          min(sizeof("no")-1, count))) {
+       else
                clear_bit(flag, &transparent_hugepage_flags);
-       } else
-               return -EINVAL;
 
        return count;
 }
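
This hunk switches the single-flag sysfs attributes from the "[yes] no" text format to a plain boolean parsed with kstrtoul(): only "0" and "1" are accepted, anything else now returns -EINVAL. Below is a minimal userspace sketch of the new interface; the khugepaged defrag file is assumed here as one of the single_flag users, and the path and program are illustrative only, not part of the patch.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Assumed path: khugepaged's defrag flag is served by single_flag_store(). */
	int fd = open("/sys/kernel/mm/transparent_hugepage/khugepaged/defrag",
		      O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* "yes"/"no" are rejected with EINVAL after this change; write "0" or "1". */
	if (write(fd, "1", 1) != 1)
		perror("write");
	close(fd);
	return 0;
}
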
@@ -642,23 +647,24 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
        return ret;
 }
 
-static inline gfp_t alloc_hugepage_gfpmask(int defrag)
+static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
 {
-       return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
+       return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
 }
 
 static inline struct page *alloc_hugepage_vma(int defrag,
                                              struct vm_area_struct *vma,
-                                             unsigned long haddr)
+                                             unsigned long haddr, int nd,
+                                             gfp_t extra_gfp)
 {
-       return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
-                              HPAGE_PMD_ORDER, vma, haddr);
+       return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
+                              HPAGE_PMD_ORDER, vma, haddr, nd);
 }
 
 #ifndef CONFIG_NUMA
 static inline struct page *alloc_hugepage(int defrag)
 {
-       return alloc_pages(alloc_hugepage_gfpmask(defrag),
+       return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
                           HPAGE_PMD_ORDER);
 }
 #endif
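
The allocation helpers now take a target node and extra GFP bits and pass them through to alloc_pages_vma(). As a reading aid only (not code from the patch), here is an expanded, equivalent form of the new mask helper: with defrag disabled __GFP_WAIT is stripped so the allocation fails fast instead of reclaiming or compacting, and khugepaged ORs in __GFP_OTHER_NODE so NUMA statistics attribute the allocation to the node it was made for.

static inline gfp_t example_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
{
	gfp_t gfp = GFP_TRANSHUGE;

	if (!defrag)
		gfp &= ~__GFP_WAIT;	/* don't sleep: fall back to 4k pages fast */

	/* khugepaged passes __GFP_OTHER_NODE when collapsing on a remote node */
	return gfp | extra_gfp;
}
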
@@ -677,9 +683,12 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                if (unlikely(khugepaged_enter(vma)))
                        return VM_FAULT_OOM;
                page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-                                         vma, haddr);
-               if (unlikely(!page))
+                                         vma, haddr, numa_node_id(), 0);
+               if (unlikely(!page)) {
+                       count_vm_event(THP_FAULT_FALLBACK);
                        goto out;
+               }
+               count_vm_event(THP_FAULT_ALLOC);
                if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
                        put_page(page);
                        goto out;
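
THP_FAULT_ALLOC and THP_FAULT_FALLBACK (together with the THP_COLLAPSE_* and THP_SPLIT events further down) are new vm_events; assuming the usual vmstat naming, they surface in /proc/vmstat as lower-case thp_* counters. A small, illustrative userspace sketch for watching them:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return 1;
	/* e.g. thp_fault_alloc, thp_fault_fallback, thp_collapse_alloc, ... */
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "thp_", 4))
			fputs(line, stdout);
	fclose(f);
	return 0;
}
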
@@ -798,8 +807,9 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
        }
 
        for (i = 0; i < HPAGE_PMD_NR; i++) {
-               pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
-                                         vma, address);
+               pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
+                                              __GFP_OTHER_NODE,
+                                              vma, address, page_to_nid(page));
                if (unlikely(!pages[i] ||
                             mem_cgroup_newpage_charge(pages[i], mm,
                                                       GFP_KERNEL))) {
@@ -901,16 +911,18 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        if (transparent_hugepage_enabled(vma) &&
            !transparent_hugepage_debug_cow())
                new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-                                             vma, haddr);
+                                             vma, haddr, numa_node_id(), 0);
        else
                new_page = NULL;
 
        if (unlikely(!new_page)) {
+               count_vm_event(THP_FAULT_FALLBACK);
                ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
                                                   pmd, orig_pmd, page, haddr);
                put_page(page);
                goto out;
        }
+       count_vm_event(THP_FAULT_ALLOC);
 
        if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
                put_page(new_page);
@@ -1127,7 +1139,7 @@ static int __split_huge_page_splitting(struct page *page,
                 * We can't temporarily set the pmd to null in order
                 * to split it, the pmd must remain marked huge at all
                 * times or the VM won't take the pmd_trans_huge paths
-                * and it won't wait on the anon_vma->root->lock to
+                * and it won't wait on the anon_vma->root->mutex to
                 * serialize against split_huge_page*.
                 */
                pmdp_splitting_flush_notify(vma, address, pmd);
@@ -1161,7 +1173,12 @@ static void __split_huge_page_refcount(struct page *page)
                /* after clearing PageTail the gup refcount can be released */
                smp_mb();
 
-               page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+               /*
+                * Retain the hwpoison flag of a poisoned tail page: without
+                * this, memory-failure can end up killing the wrong process
+                * (observed with KVM guests).
+                */
+               page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
                page_tail->flags |= (page->flags &
                                     ((1L << PG_referenced) |
                                      (1L << PG_swapbacked) |
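
Since ~ binds tighter than |, the new mask reads as (~PAGE_FLAGS_CHECK_AT_PREP) | __PG_HWPOISON: every flag outside the CHECK_AT_PREP set is preserved as before, and PG_hwpoison, which is inside that set, now survives the split as well. An equivalent, more explicit spelling, for illustration only (the patch keeps the one-liner):

static inline void tail_flags_keep_hwpoison(struct page *page_tail)
{
	/* __PG_HWPOISON is 0 without CONFIG_MEMORY_FAILURE; this is a no-op then */
	unsigned long poison = page_tail->flags & __PG_HWPOISON;

	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;	/* old behaviour */
	page_tail->flags |= poison;			/* retain PG_hwpoison */
}
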
@@ -1202,6 +1219,8 @@ static void __split_huge_page_refcount(struct page *page)
                BUG_ON(!PageDirty(page_tail));
                BUG_ON(!PageSwapBacked(page_tail));
 
+               mem_cgroup_split_huge_fixup(page, page_tail);
+
                lru_add_page_tail(zone, page, page_tail);
        }
 
@@ -1314,7 +1333,7 @@ static int __split_huge_page_map(struct page *page,
        return ret;
 }
 
-/* must be called with anon_vma->root->lock hold */
+/* must be called with anon_vma->root->mutex held */
 static void __split_huge_page(struct page *page,
                              struct anon_vma *anon_vma)
 {
@@ -1380,6 +1399,7 @@ int split_huge_page(struct page *page)
 
        BUG_ON(!PageSwapBacked(page));
        __split_huge_page(page, anon_vma);
+       count_vm_event(THP_SPLIT);
 
        BUG_ON(PageCompound(page));
 out_unlock:
@@ -1388,18 +1408,44 @@ out:
        return ret;
 }
 
-int hugepage_madvise(unsigned long *vm_flags)
-{
-       /*
-        * Be somewhat over-protective like KSM for now!
-        */
-       if (*vm_flags & (VM_HUGEPAGE | VM_SHARED  | VM_MAYSHARE   |
-                        VM_PFNMAP   | VM_IO      | VM_DONTEXPAND |
-                        VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
-                        VM_MIXEDMAP | VM_SAO))
-               return -EINVAL;
+#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \
+                  VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
 
-       *vm_flags |= VM_HUGEPAGE;
+int hugepage_madvise(struct vm_area_struct *vma,
+                    unsigned long *vm_flags, int advice)
+{
+       switch (advice) {
+       case MADV_HUGEPAGE:
+               /*
+                * Be somewhat over-protective like KSM for now!
+                */
+               if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
+                       return -EINVAL;
+               *vm_flags &= ~VM_NOHUGEPAGE;
+               *vm_flags |= VM_HUGEPAGE;
+               /*
+                * If the vma becomes suitable for khugepaged to scan,
+                * register it here without waiting for a page fault
+                * that may not happen any time soon.
+                */
+               if (unlikely(khugepaged_enter_vma_merge(vma)))
+                       return -ENOMEM;
+               break;
+       case MADV_NOHUGEPAGE:
+               /*
+                * Be somewhat over-protective like KSM for now!
+                */
+               if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
+                       return -EINVAL;
+               *vm_flags &= ~VM_HUGEPAGE;
+               *vm_flags |= VM_NOHUGEPAGE;
+               /*
+                * Setting VM_NOHUGEPAGE prevents khugepaged from scanning
+                * this vma, even if the mm stays registered in khugepaged
+                * because it was registered before VM_NOHUGEPAGE was set.
+                */
+               break;
+       }
 
        return 0;
 }
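
From userspace the two advice values are reached through madvise(2): MADV_HUGEPAGE opts a private anonymous mapping in to huge page faults and khugepaged collapsing, MADV_NOHUGEPAGE opts it back out, and both fail with EINVAL on the VM_NO_THP mapping types rejected above. A minimal sketch, assuming a libc that already exposes the two constants:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4UL << 20;	/* 4MB: contains at least one aligned 2MB range */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* opt in: huge page faults allowed, khugepaged may collapse this range */
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");
	/* ... touch the memory ... */
	/* opt out again: khugepaged will skip this vma from now on */
	if (madvise(p, len, MADV_NOHUGEPAGE))
		perror("madvise(MADV_NOHUGEPAGE)");
	munmap(p, len);
	return 0;
}
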
@@ -1523,10 +1569,14 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
                 * page fault if needed.
                 */
                return 0;
-       if (vma->vm_file || vma->vm_ops)
+       if (vma->vm_ops)
                /* khugepaged not yet working on file or special mappings */
                return 0;
-       VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+       /*
+        * If is_pfn_mapping() is true, is_linear_pfn_mapping() must be
+        * true too, verify it here.
+        */
+       VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
        hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        hend = vma->vm_end & HPAGE_PMD_MASK;
        if (hstart < hend)
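
The hstart/hend arithmetic above picks out the largest HPAGE_PMD_SIZE-aligned range that lies entirely inside the vma: the start is rounded up to the next huge page boundary, the end rounded down. A worked sketch, for illustration only, assuming 2MB huge pages:

static int example_vma_spans_a_hugepage(unsigned long vm_start,
					unsigned long vm_end)
{
	unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	unsigned long hend = vm_end & HPAGE_PMD_MASK;

	/*
	 * e.g. vm_start = 0x1ff000, vm_end = 0x601000:
	 *   hstart = 0x200000 (rounded up), hend = 0x600000 (rounded down),
	 * so only [0x200000, 0x600000) would be eligible for collapse.
	 */
	return hstart < hend;
}
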
@@ -1546,14 +1596,13 @@ void __khugepaged_exit(struct mm_struct *mm)
                list_del(&mm_slot->mm_node);
                free = 1;
        }
+       spin_unlock(&khugepaged_mm_lock);
 
        if (free) {
-               spin_unlock(&khugepaged_mm_lock);
                clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
                free_mm_slot(mm_slot);
                mmdrop(mm);
        } else if (mm_slot) {
-               spin_unlock(&khugepaged_mm_lock);
                /*
                 * This is required to serialize against
                 * khugepaged_test_exit() (which is guaranteed to run
@@ -1564,8 +1613,7 @@ void __khugepaged_exit(struct mm_struct *mm)
                 */
                down_write(&mm->mmap_sem);
                up_write(&mm->mmap_sem);
-       } else
-               spin_unlock(&khugepaged_mm_lock);
+       }
 }
 
 static void release_pte_page(struct page *page)
@@ -1706,7 +1754,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 static void collapse_huge_page(struct mm_struct *mm,
                               unsigned long address,
                               struct page **hpage,
-                              struct vm_area_struct *vma)
+                              struct vm_area_struct *vma,
+                              int node)
 {
        pgd_t *pgd;
        pud_t *pud;
@@ -1720,6 +1769,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 #ifndef CONFIG_NUMA
+       up_read(&mm->mmap_sem);
        VM_BUG_ON(!*hpage);
        new_page = *hpage;
 #else
@@ -1734,22 +1784,29 @@ static void collapse_huge_page(struct mm_struct *mm,
         * mmap_sem in read mode is good idea also to allow greater
         * scalability.
         */
-       new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+       new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+                                     node, __GFP_OTHER_NODE);
+
+       /*
+        * After allocating the hugepage, release the mmap_sem read lock in
+        * preparation for taking it in write mode.
+        */
+       up_read(&mm->mmap_sem);
        if (unlikely(!new_page)) {
-               up_read(&mm->mmap_sem);
+               count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                *hpage = ERR_PTR(-ENOMEM);
                return;
        }
 #endif
+
+       count_vm_event(THP_COLLAPSE_ALLOC);
        if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
-               up_read(&mm->mmap_sem);
+#ifdef CONFIG_NUMA
                put_page(new_page);
+#endif
                return;
        }
 
-       /* after allocating the hugepage upgrade to mmap_sem write mode */
-       up_read(&mm->mmap_sem);
-
        /*
         * Prevent all access to pagetables with the exception of
         * gup_fast later hanlded by the ptep_clear_flush and the VM
@@ -1765,13 +1822,19 @@ static void collapse_huge_page(struct mm_struct *mm,
        if (address < hstart || address + HPAGE_PMD_SIZE > hend)
                goto out;
 
-       if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+       if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+           (vma->vm_flags & VM_NOHUGEPAGE))
                goto out;
 
-       /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
-       if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+       if (!vma->anon_vma || vma->vm_ops)
+               goto out;
+       if (is_vma_temporary_stack(vma))
                goto out;
-       VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+       /*
+        * If is_pfn_mapping() is true, is_linear_pfn_mapping() must be
+        * true too, verify it here.
+        */
+       VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
 
        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
@@ -1804,15 +1867,14 @@ static void collapse_huge_page(struct mm_struct *mm,
        spin_lock(ptl);
        isolated = __collapse_huge_page_isolate(vma, address, pte);
        spin_unlock(ptl);
-       pte_unmap(pte);
 
        if (unlikely(!isolated)) {
+               pte_unmap(pte);
                spin_lock(&mm->page_table_lock);
                BUG_ON(!pmd_none(*pmd));
                set_pmd_at(mm, address, pmd, _pmd);
                spin_unlock(&mm->page_table_lock);
                anon_vma_unlock(vma->anon_vma);
-               mem_cgroup_uncharge_page(new_page);
                goto out;
        }
 
@@ -1823,6 +1885,7 @@ static void collapse_huge_page(struct mm_struct *mm,
        anon_vma_unlock(vma->anon_vma);
 
        __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
+       pte_unmap(pte);
        __SetPageUptodate(new_page);
        pgtable = pmd_pgtable(_pmd);
        VM_BUG_ON(page_count(pgtable) != 1);
@@ -1857,6 +1920,7 @@ out_up_write:
        return;
 
 out:
+       mem_cgroup_uncharge_page(new_page);
 #ifdef CONFIG_NUMA
        put_page(new_page);
 #endif
@@ -1876,6 +1940,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
        struct page *page;
        unsigned long _address;
        spinlock_t *ptl;
+       int node = -1;
 
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -1906,6 +1971,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                page = vm_normal_page(vma, _address, pteval);
                if (unlikely(!page))
                        goto out_unmap;
+               /*
+                * Choose the node of the first page. This could
+                * be more sophisticated and look at more pages,
+                * but isn't for now.
+                */
+               if (node == -1)
+                       node = page_to_nid(page);
                VM_BUG_ON(PageCompound(page));
                if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
                        goto out_unmap;
@@ -1922,7 +1994,7 @@ out_unmap:
        pte_unmap_unlock(pte, ptl);
        if (ret)
                /* collapse_huge_page will return with the mmap_sem released */
-               collapse_huge_page(mm, address, hpage, vma);
+               collapse_huge_page(mm, address, hpage, vma, node);
 out:
        return ret;
 }
@@ -1988,34 +2060,33 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
                        break;
                }
 
-               if (!(vma->vm_flags & VM_HUGEPAGE) &&
-                   !khugepaged_always()) {
+               if ((!(vma->vm_flags & VM_HUGEPAGE) &&
+                    !khugepaged_always()) ||
+                   (vma->vm_flags & VM_NOHUGEPAGE)) {
+               skip:
                        progress++;
                        continue;
                }
-
-               /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
-               if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
-                       khugepaged_scan.address = vma->vm_end;
-                       progress++;
-                       continue;
-               }
-               VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+               if (!vma->anon_vma || vma->vm_ops)
+                       goto skip;
+               if (is_vma_temporary_stack(vma))
+                       goto skip;
+               /*
+                * If is_pfn_mapping() is true, is_linear_pfn_mapping()
+                * must be true too, verify it here.
+                */
+               VM_BUG_ON(is_linear_pfn_mapping(vma) ||
+                         vma->vm_flags & VM_NO_THP);
 
                hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
                hend = vma->vm_end & HPAGE_PMD_MASK;
-               if (hstart >= hend) {
-                       progress++;
-                       continue;
-               }
+               if (hstart >= hend)
+                       goto skip;
+               if (khugepaged_scan.address > hend)
+                       goto skip;
                if (khugepaged_scan.address < hstart)
                        khugepaged_scan.address = hstart;
-               if (khugepaged_scan.address > hend) {
-                       khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
-                       progress++;
-                       continue;
-               }
-               BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+               VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
 
                while (khugepaged_scan.address < hend) {
                        int ret;
@@ -2044,7 +2115,7 @@ breakouterloop:
 breakouterloop_mmap_sem:
 
        spin_lock(&khugepaged_mm_lock);
-       BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+       VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
        /*
         * Release the current mm_slot if this mm is about to die, or
         * if we scanned all vmas of this mm.
@@ -2096,8 +2167,11 @@ static void khugepaged_do_scan(struct page **hpage)
 #ifndef CONFIG_NUMA
                if (!*hpage) {
                        *hpage = alloc_hugepage(khugepaged_defrag());
-                       if (unlikely(!*hpage))
+                       if (unlikely(!*hpage)) {
+                               count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                                break;
+                       }
+                       count_vm_event(THP_COLLAPSE_ALLOC);
                }
 #else
                if (IS_ERR(*hpage))
@@ -2137,8 +2211,11 @@ static struct page *khugepaged_alloc_hugepage(void)
 
        do {
                hpage = alloc_hugepage(khugepaged_defrag());
-               if (!hpage)
+               if (!hpage) {
+                       count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                        khugepaged_alloc_sleep();
+               } else
+                       count_vm_event(THP_COLLAPSE_ALLOC);
        } while (unlikely(!hpage) &&
                 likely(khugepaged_enabled()));
        return hpage;
@@ -2199,9 +2276,9 @@ static int khugepaged(void *none)
 
        for (;;) {
                mutex_unlock(&khugepaged_mutex);
-               BUG_ON(khugepaged_thread != current);
+               VM_BUG_ON(khugepaged_thread != current);
                khugepaged_loop();
-               BUG_ON(khugepaged_thread != current);
+               VM_BUG_ON(khugepaged_thread != current);
 
                mutex_lock(&khugepaged_mutex);
                if (!khugepaged_enabled())