tmpfs: convert shmem_truncate_range to radix-swap
[linux-2.6.git] / mm / huge_memory.c
index c7c2cd9..e2d1587 100644 (file)
@@ -244,24 +244,28 @@ static ssize_t single_flag_show(struct kobject *kobj,
                                struct kobj_attribute *attr, char *buf,
                                enum transparent_hugepage_flag flag)
 {
-       if (test_bit(flag, &transparent_hugepage_flags))
-               return sprintf(buf, "[yes] no\n");
-       else
-               return sprintf(buf, "yes [no]\n");
+       return sprintf(buf, "%d\n",
+                      !!test_bit(flag, &transparent_hugepage_flags));
 }
+
 static ssize_t single_flag_store(struct kobject *kobj,
                                 struct kobj_attribute *attr,
                                 const char *buf, size_t count,
                                 enum transparent_hugepage_flag flag)
 {
-       if (!memcmp("yes", buf,
-                   min(sizeof("yes")-1, count))) {
+       unsigned long value;
+       int ret;
+
+       ret = kstrtoul(buf, 10, &value);
+       if (ret < 0)
+               return ret;
+       if (value > 1)
+               return -EINVAL;
+
+       if (value)
                set_bit(flag, &transparent_hugepage_flags);
-       } else if (!memcmp("no", buf,
-                          min(sizeof("no")-1, count))) {
+       else
                clear_bit(flag, &transparent_hugepage_flags);
-       } else
-               return -EINVAL;
 
        return count;
 }
@@ -643,23 +647,24 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
        return ret;
 }
 
-static inline gfp_t alloc_hugepage_gfpmask(int defrag)
+static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
 {
-       return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
+       return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
 }
 
 static inline struct page *alloc_hugepage_vma(int defrag,
                                              struct vm_area_struct *vma,
-                                             unsigned long haddr)
+                                             unsigned long haddr, int nd,
+                                             gfp_t extra_gfp)
 {
-       return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
-                              HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
+       return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
+                              HPAGE_PMD_ORDER, vma, haddr, nd);
 }
 
 #ifndef CONFIG_NUMA
 static inline struct page *alloc_hugepage(int defrag)
 {
-       return alloc_pages(alloc_hugepage_gfpmask(defrag),
+       return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
                           HPAGE_PMD_ORDER);
 }
 #endif
@@ -678,9 +683,12 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                if (unlikely(khugepaged_enter(vma)))
                        return VM_FAULT_OOM;
                page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-                                         vma, haddr);
-               if (unlikely(!page))
+                                         vma, haddr, numa_node_id(), 0);
+               if (unlikely(!page)) {
+                       count_vm_event(THP_FAULT_FALLBACK);
                        goto out;
+               }
+               count_vm_event(THP_FAULT_ALLOC);
                if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
                        put_page(page);
                        goto out;
@@ -799,8 +807,9 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
        }
 
        for (i = 0; i < HPAGE_PMD_NR; i++) {
-               pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
-                                         vma, address);
+               pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
+                                              __GFP_OTHER_NODE,
+                                              vma, address, page_to_nid(page));
                if (unlikely(!pages[i] ||
                             mem_cgroup_newpage_charge(pages[i], mm,
                                                       GFP_KERNEL))) {
@@ -902,16 +911,18 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        if (transparent_hugepage_enabled(vma) &&
            !transparent_hugepage_debug_cow())
                new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-                                             vma, haddr);
+                                             vma, haddr, numa_node_id(), 0);
        else
                new_page = NULL;
 
        if (unlikely(!new_page)) {
+               count_vm_event(THP_FAULT_FALLBACK);
                ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
                                                   pmd, orig_pmd, page, haddr);
                put_page(page);
                goto out;
        }
+       count_vm_event(THP_FAULT_ALLOC);
 
        if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
                put_page(new_page);
@@ -1128,7 +1139,7 @@ static int __split_huge_page_splitting(struct page *page,
                 * We can't temporarily set the pmd to null in order
                 * to split it, the pmd must remain marked huge at all
                 * times or the VM won't take the pmd_trans_huge paths
-                * and it won't wait on the anon_vma->root->lock to
+                * and it won't wait on the anon_vma->root->mutex to
                 * serialize against split_huge_page*.
                 */
                pmdp_splitting_flush_notify(vma, address, pmd);
@@ -1322,7 +1333,7 @@ static int __split_huge_page_map(struct page *page,
        return ret;
 }
 
-/* must be called with anon_vma->root->lock hold */
+/* must be called with anon_vma->root->mutex held */
 static void __split_huge_page(struct page *page,
                              struct anon_vma *anon_vma)
 {
@@ -1388,6 +1399,7 @@ int split_huge_page(struct page *page)
 
        BUG_ON(!PageSwapBacked(page));
        __split_huge_page(page, anon_vma);
+       count_vm_event(THP_SPLIT);
 
        BUG_ON(PageCompound(page));
 out_unlock:
@@ -1396,6 +1408,9 @@ out:
        return ret;
 }
 
+#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \
+                  VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
+
 int hugepage_madvise(struct vm_area_struct *vma,
                     unsigned long *vm_flags, int advice)
 {
@@ -1404,11 +1419,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
                /*
                 * Be somewhat over-protective like KSM for now!
                 */
-               if (*vm_flags & (VM_HUGEPAGE |
-                                VM_SHARED   | VM_MAYSHARE   |
-                                VM_PFNMAP   | VM_IO      | VM_DONTEXPAND |
-                                VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
-                                VM_MIXEDMAP | VM_SAO))
+               if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
                        return -EINVAL;
                *vm_flags &= ~VM_NOHUGEPAGE;
                *vm_flags |= VM_HUGEPAGE;
@@ -1424,11 +1435,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
                /*
                 * Be somewhat over-protective like KSM for now!
                 */
-               if (*vm_flags & (VM_NOHUGEPAGE |
-                                VM_SHARED   | VM_MAYSHARE   |
-                                VM_PFNMAP   | VM_IO      | VM_DONTEXPAND |
-                                VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
-                                VM_MIXEDMAP | VM_SAO))
+               if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
                        return -EINVAL;
                *vm_flags &= ~VM_HUGEPAGE;
                *vm_flags |= VM_NOHUGEPAGE;
@@ -1562,10 +1569,14 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
                 * page fault if needed.
                 */
                return 0;
-       if (vma->vm_file || vma->vm_ops)
+       if (vma->vm_ops)
                /* khugepaged not yet working on file or special mappings */
                return 0;
-       VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+       /*
+        * If is_pfn_mapping() is true is_linear_pfn_mapping() must be
+        * true too, verify it here.
+        */
+       VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
        hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        hend = vma->vm_end & HPAGE_PMD_MASK;
        if (hstart < hend)
@@ -1585,14 +1596,13 @@ void __khugepaged_exit(struct mm_struct *mm)
                list_del(&mm_slot->mm_node);
                free = 1;
        }
+       spin_unlock(&khugepaged_mm_lock);
 
        if (free) {
-               spin_unlock(&khugepaged_mm_lock);
                clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
                free_mm_slot(mm_slot);
                mmdrop(mm);
        } else if (mm_slot) {
-               spin_unlock(&khugepaged_mm_lock);
                /*
                 * This is required to serialize against
                 * khugepaged_test_exit() (which is guaranteed to run
@@ -1603,8 +1613,7 @@ void __khugepaged_exit(struct mm_struct *mm)
                 */
                down_write(&mm->mmap_sem);
                up_write(&mm->mmap_sem);
-       } else
-               spin_unlock(&khugepaged_mm_lock);
+       }
 }
 
 static void release_pte_page(struct page *page)
@@ -1745,7 +1754,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 static void collapse_huge_page(struct mm_struct *mm,
                               unsigned long address,
                               struct page **hpage,
-                              struct vm_area_struct *vma)
+                              struct vm_area_struct *vma,
+                              int node)
 {
        pgd_t *pgd;
        pud_t *pud;
@@ -1759,6 +1769,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 #ifndef CONFIG_NUMA
+       up_read(&mm->mmap_sem);
        VM_BUG_ON(!*hpage);
        new_page = *hpage;
 #else
@@ -1773,22 +1784,29 @@ static void collapse_huge_page(struct mm_struct *mm,
         * mmap_sem in read mode is good idea also to allow greater
         * scalability.
         */
-       new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+       new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+                                     node, __GFP_OTHER_NODE);
+
+       /*
+        * After allocating the hugepage, release the mmap_sem read lock in
+        * preparation for taking it in write mode.
+        */
+       up_read(&mm->mmap_sem);
        if (unlikely(!new_page)) {
-               up_read(&mm->mmap_sem);
+               count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                *hpage = ERR_PTR(-ENOMEM);
                return;
        }
 #endif
+
+       count_vm_event(THP_COLLAPSE_ALLOC);
        if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
-               up_read(&mm->mmap_sem);
+#ifdef CONFIG_NUMA
                put_page(new_page);
+#endif
                return;
        }
 
-       /* after allocating the hugepage upgrade to mmap_sem write mode */
-       up_read(&mm->mmap_sem);
-
        /*
         * Prevent all access to pagetables with the exception of
         * gup_fast later hanlded by the ptep_clear_flush and the VM
@@ -1808,12 +1826,15 @@ static void collapse_huge_page(struct mm_struct *mm,
            (vma->vm_flags & VM_NOHUGEPAGE))
                goto out;
 
-       /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
-       if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+       if (!vma->anon_vma || vma->vm_ops)
                goto out;
        if (is_vma_temporary_stack(vma))
                goto out;
-       VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+       /*
+        * If is_pfn_mapping() is true is_linear_pfn_mapping() must be
+        * true too, verify it here.
+        */
+       VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
 
        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
@@ -1919,6 +1940,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
        struct page *page;
        unsigned long _address;
        spinlock_t *ptl;
+       int node = -1;
 
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -1949,6 +1971,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                page = vm_normal_page(vma, _address, pteval);
                if (unlikely(!page))
                        goto out_unmap;
+               /*
+                * Choose the node of the first page. This could
+                * be more sophisticated and look at more pages,
+                * but isn't for now.
+                */
+               if (node == -1)
+                       node = page_to_nid(page);
                VM_BUG_ON(PageCompound(page));
                if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
                        goto out_unmap;
@@ -1965,7 +1994,7 @@ out_unmap:
        pte_unmap_unlock(pte, ptl);
        if (ret)
                /* collapse_huge_page will return with the mmap_sem released */
-               collapse_huge_page(mm, address, hpage, vma);
+               collapse_huge_page(mm, address, hpage, vma, node);
 out:
        return ret;
 }
@@ -2038,13 +2067,16 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
                        progress++;
                        continue;
                }
-               /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
-               if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+               if (!vma->anon_vma || vma->vm_ops)
                        goto skip;
                if (is_vma_temporary_stack(vma))
                        goto skip;
-
-               VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+               /*
+                * If is_pfn_mapping() is true is_linear_pfn_mapping()
+                * must be true too, verify it here.
+                */
+               VM_BUG_ON(is_linear_pfn_mapping(vma) ||
+                         vma->vm_flags & VM_NO_THP);
 
                hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
                hend = vma->vm_end & HPAGE_PMD_MASK;
@@ -2135,8 +2167,11 @@ static void khugepaged_do_scan(struct page **hpage)
 #ifndef CONFIG_NUMA
                if (!*hpage) {
                        *hpage = alloc_hugepage(khugepaged_defrag());
-                       if (unlikely(!*hpage))
+                       if (unlikely(!*hpage)) {
+                               count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                                break;
+                       }
+                       count_vm_event(THP_COLLAPSE_ALLOC);
                }
 #else
                if (IS_ERR(*hpage))
@@ -2176,8 +2211,11 @@ static struct page *khugepaged_alloc_hugepage(void)
 
        do {
                hpage = alloc_hugepage(khugepaged_defrag());
-               if (!hpage)
+               if (!hpage) {
+                       count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                        khugepaged_alloc_sleep();
+               } else
+                       count_vm_event(THP_COLLAPSE_ALLOC);
        } while (unlikely(!hpage) &&
                 likely(khugepaged_enabled()));
        return hpage;