thp: madvise(MADV_NOHUGEPAGE)
Andrea Arcangeli [Thu, 13 Jan 2011 23:47:17 +0000 (15:47 -0800)]
Add madvise MADV_NOHUGEPAGE to mark regions that are not important to be
hugepage backed.  Return -EINVAL if the vma is not of an anonymous type,
or the feature isn't built into the kernel.  Never silently return
success.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

include/linux/huge_mm.h
include/linux/khugepaged.h
include/linux/mm.h
mm/huge_memory.c
mm/madvise.c

index 9b48c24..a8b7e42 100644 (file)
@@ -52,10 +52,12 @@ extern pmd_t *page_check_address_pmd(struct page *page,
 #define HPAGE_PMD_SIZE HPAGE_SIZE
 
 #define transparent_hugepage_enabled(__vma)                            \
-       (transparent_hugepage_flags & (1<<TRANSPARENT_HUGEPAGE_FLAG) || \
-        (transparent_hugepage_flags &                                  \
-         (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG) &&                    \
-         (__vma)->vm_flags & VM_HUGEPAGE))
+       ((transparent_hugepage_flags &                                  \
+         (1<<TRANSPARENT_HUGEPAGE_FLAG) ||                             \
+         (transparent_hugepage_flags &                                 \
+          (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG) &&                   \
+          ((__vma)->vm_flags & VM_HUGEPAGE))) &&                       \
+        !((__vma)->vm_flags & VM_NOHUGEPAGE))
 #define transparent_hugepage_defrag(__vma)                             \
        ((transparent_hugepage_flags &                                  \
          (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)) ||                     \
@@ -103,7 +105,7 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
 #if HPAGE_PMD_ORDER > MAX_ORDER
 #error "hugepages can't be allocated by the buddy allocator"
 #endif
-extern int hugepage_madvise(unsigned long *vm_flags);
+extern int hugepage_madvise(unsigned long *vm_flags, int advice);
 extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
                                    unsigned long start,
                                    unsigned long end,
@@ -141,7 +143,7 @@ static inline int split_huge_page(struct page *page)
        do { } while (0)
 #define wait_split_huge_page(__anon_vma, __pmd)        \
        do { } while (0)
-static inline int hugepage_madvise(unsigned long *vm_flags)
+static inline int hugepage_madvise(unsigned long *vm_flags, int advice)
 {
        BUG();
        return 0;
index 552f318..6b394f0 100644 (file)
@@ -38,9 +38,10 @@ static inline void khugepaged_exit(struct mm_struct *mm)
 static inline int khugepaged_enter(struct vm_area_struct *vma)
 {
        if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
-               if (khugepaged_always() ||
-                   (khugepaged_req_madv() &&
-                    vma->vm_flags & VM_HUGEPAGE))
+               if ((khugepaged_always() ||
+                    (khugepaged_req_madv() &&
+                     vma->vm_flags & VM_HUGEPAGE)) &&
+                   !(vma->vm_flags & VM_NOHUGEPAGE))
                        if (__khugepaged_enter(vma->vm_mm))
                                return -ENOMEM;
        return 0;
index ce97a2b..956a355 100644 (file)
@@ -83,6 +83,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_GROWSUP     0x00000200
 #else
 #define VM_GROWSUP     0x00000000
+#define VM_NOHUGEPAGE  0x00000200      /* MADV_NOHUGEPAGE marked this vma */
 #endif
 #define VM_PFNMAP      0x00000400      /* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE   0x00000800      /* ETXTBSY on write attempts.. */
index f4f6041..fce667c 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
+#include <linux/mman.h>
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
@@ -1388,18 +1389,36 @@ out:
        return ret;
 }
 
-int hugepage_madvise(unsigned long *vm_flags)
+int hugepage_madvise(unsigned long *vm_flags, int advice)
 {
-       /*
-        * Be somewhat over-protective like KSM for now!
-        */
-       if (*vm_flags & (VM_HUGEPAGE | VM_SHARED  | VM_MAYSHARE   |
-                        VM_PFNMAP   | VM_IO      | VM_DONTEXPAND |
-                        VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
-                        VM_MIXEDMAP | VM_SAO))
-               return -EINVAL;
-
-       *vm_flags |= VM_HUGEPAGE;
+       switch (advice) {
+       case MADV_HUGEPAGE:
+               /*
+                * Be somewhat over-protective like KSM for now!
+                */
+               if (*vm_flags & (VM_HUGEPAGE |
+                                VM_SHARED   | VM_MAYSHARE   |
+                                VM_PFNMAP   | VM_IO      | VM_DONTEXPAND |
+                                VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
+                                VM_MIXEDMAP | VM_SAO))
+                       return -EINVAL;
+               *vm_flags &= ~VM_NOHUGEPAGE;
+               *vm_flags |= VM_HUGEPAGE;
+               break;
+       case MADV_NOHUGEPAGE:
+               /*
+                * Be somewhat over-protective like KSM for now!
+                */
+               if (*vm_flags & (VM_NOHUGEPAGE |
+                                VM_SHARED   | VM_MAYSHARE   |
+                                VM_PFNMAP   | VM_IO      | VM_DONTEXPAND |
+                                VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
+                                VM_MIXEDMAP | VM_SAO))
+                       return -EINVAL;
+               *vm_flags &= ~VM_HUGEPAGE;
+               *vm_flags |= VM_NOHUGEPAGE;
+               break;
+       }
 
        return 0;
 }
index ecde40a..bbac126 100644 (file)
@@ -72,7 +72,8 @@ static long madvise_behavior(struct vm_area_struct * vma,
                        goto out;
                break;
        case MADV_HUGEPAGE:
-               error = hugepage_madvise(&new_flags);
+       case MADV_NOHUGEPAGE:
+               error = hugepage_madvise(&new_flags, behavior);
                if (error)
                        goto out;
                break;
@@ -290,6 +291,7 @@ madvise_behavior_valid(int behavior)
 #endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        case MADV_HUGEPAGE:
+       case MADV_NOHUGEPAGE:
 #endif
                return 1;