#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
+#include <linux/freezer.h>
+#include <linux/mman.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
struct kobj_attribute *attr, char *buf,
enum transparent_hugepage_flag flag)
{
- if (test_bit(flag, &transparent_hugepage_flags))
- return sprintf(buf, "[yes] no\n");
- else
- return sprintf(buf, "yes [no]\n");
+ return sprintf(buf, "%d\n",
+ !!test_bit(flag, &transparent_hugepage_flags));
}
+
static ssize_t single_flag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count,
enum transparent_hugepage_flag flag)
{
- if (!memcmp("yes", buf,
- min(sizeof("yes")-1, count))) {
+ unsigned long value;
+ int ret;
+
+ ret = kstrtoul(buf, 10, &value);
+ if (ret < 0)
+ return ret;
+ if (value > 1)
+ return -EINVAL;
+
+ if (value)
set_bit(flag, &transparent_hugepage_flags);
- } else if (!memcmp("no", buf,
- min(sizeof("no")-1, count))) {
+ else
clear_bit(flag, &transparent_hugepage_flags);
- } else
- return -EINVAL;
return count;
}
int err;
#ifdef CONFIG_SYSFS
static struct kobject *hugepage_kobj;
+#endif
+
+ err = -EINVAL;
+ if (!has_transparent_hugepage()) {
+ transparent_hugepage_flags = 0;
+ goto out;
+ }
+#ifdef CONFIG_SYSFS
err = -ENOMEM;
hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
if (unlikely(!hugepage_kobj)) {
goto out;
}
+ /*
+ * By default disable transparent hugepages on smaller systems,
+ * where the extra memory used could hurt more than TLB overhead
+ * is likely to save. The admin can still enable it through /sys.
+ */
+ if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
+ transparent_hugepage_flags = 0;
+
start_khugepaged();
set_recommended_min_free_kbytes();
return ret;
}
-static inline gfp_t alloc_hugepage_gfpmask(int defrag)
+static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
{
- return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
+ return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
}
static inline struct page *alloc_hugepage_vma(int defrag,
struct vm_area_struct *vma,
- unsigned long haddr)
+ unsigned long haddr, int nd,
+ gfp_t extra_gfp)
{
- return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
- HPAGE_PMD_ORDER, vma, haddr);
+ return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
+ HPAGE_PMD_ORDER, vma, haddr, nd);
}
#ifndef CONFIG_NUMA
static inline struct page *alloc_hugepage(int defrag)
{
- return alloc_pages(alloc_hugepage_gfpmask(defrag),
+ return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
HPAGE_PMD_ORDER);
}
#endif
if (unlikely(khugepaged_enter(vma)))
return VM_FAULT_OOM;
page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
- vma, haddr);
- if (unlikely(!page))
+ vma, haddr, numa_node_id(), 0);
+ if (unlikely(!page)) {
+ count_vm_event(THP_FAULT_FALLBACK);
goto out;
+ }
+ count_vm_event(THP_FAULT_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
put_page(page);
goto out;
}
for (i = 0; i < HPAGE_PMD_NR; i++) {
- pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
- vma, address);
+ pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
+ __GFP_OTHER_NODE,
+ vma, address, page_to_nid(page));
if (unlikely(!pages[i] ||
mem_cgroup_newpage_charge(pages[i], mm,
GFP_KERNEL))) {
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
- vma, haddr);
+ vma, haddr, numa_node_id(), 0);
else
new_page = NULL;
if (unlikely(!new_page)) {
+ count_vm_event(THP_FAULT_FALLBACK);
ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
pmd, orig_pmd, page, haddr);
put_page(page);
goto out;
}
+ count_vm_event(THP_FAULT_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
put_page(new_page);
* We can't temporarily set the pmd to null in order
* to split it, the pmd must remain marked huge at all
* times or the VM won't take the pmd_trans_huge paths
- * and it won't wait on the anon_vma->root->lock to
+ * and it won't wait on the anon_vma->root->mutex to
* serialize against split_huge_page*.
*/
pmdp_splitting_flush_notify(vma, address, pmd);
int i;
unsigned long head_index = page->index;
struct zone *zone = page_zone(page);
+ int zonestat;
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irq(&zone->lru_lock);
/* after clearing PageTail the gup refcount can be released */
smp_mb();
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ /*
+ * retain hwpoison flag of the poisoned tail page:
+ * fix for the unsuitable process killed on Guest Machine(KVM)
+ * by the memory-failure.
+ */
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
page_tail->flags |= (page->flags &
((1L << PG_referenced) |
(1L << PG_swapbacked) |
BUG_ON(!PageDirty(page_tail));
BUG_ON(!PageSwapBacked(page_tail));
+ mem_cgroup_split_huge_fixup(page, page_tail);
+
lru_add_page_tail(zone, page, page_tail);
}
__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
+ /*
+ * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
+ * so adjust those appropriately if this page is on the LRU.
+ */
+ if (PageLRU(page)) {
+ zonestat = NR_LRU_BASE + page_lru(page);
+ __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
+ }
+
ClearPageCompound(page);
compound_unlock(page);
spin_unlock_irq(&zone->lru_lock);
return ret;
}
-/* must be called with anon_vma->root->lock hold */
+/* must be called with anon_vma->root->mutex hold */
static void __split_huge_page(struct page *page,
struct anon_vma *anon_vma)
{
BUG_ON(!PageSwapBacked(page));
__split_huge_page(page, anon_vma);
+ count_vm_event(THP_SPLIT);
BUG_ON(PageCompound(page));
out_unlock:
return ret;
}
-int hugepage_madvise(unsigned long *vm_flags)
-{
- /*
- * Be somewhat over-protective like KSM for now!
- */
- if (*vm_flags & (VM_HUGEPAGE | VM_SHARED | VM_MAYSHARE |
- VM_PFNMAP | VM_IO | VM_DONTEXPAND |
- VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
- VM_MIXEDMAP | VM_SAO))
- return -EINVAL;
+#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \
+ VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
- *vm_flags |= VM_HUGEPAGE;
+int hugepage_madvise(struct vm_area_struct *vma,
+ unsigned long *vm_flags, int advice)
+{
+ switch (advice) {
+ case MADV_HUGEPAGE:
+ /*
+ * Be somewhat over-protective like KSM for now!
+ */
+ if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
+ return -EINVAL;
+ *vm_flags &= ~VM_NOHUGEPAGE;
+ *vm_flags |= VM_HUGEPAGE;
+ /*
+ * If the vma become good for khugepaged to scan,
+ * register it here without waiting a page fault that
+ * may not happen any time soon.
+ */
+ if (unlikely(khugepaged_enter_vma_merge(vma)))
+ return -ENOMEM;
+ break;
+ case MADV_NOHUGEPAGE:
+ /*
+ * Be somewhat over-protective like KSM for now!
+ */
+ if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
+ return -EINVAL;
+ *vm_flags &= ~VM_HUGEPAGE;
+ *vm_flags |= VM_NOHUGEPAGE;
+ /*
+ * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
+ * this vma even if we leave the mm registered in khugepaged if
+ * it got registered before VM_NOHUGEPAGE was set.
+ */
+ break;
+ }
return 0;
}
* page fault if needed.
*/
return 0;
- if (vma->vm_file || vma->vm_ops)
+ if (vma->vm_ops)
/* khugepaged not yet working on file or special mappings */
return 0;
- VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+ /*
+ * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
+ * true too, verify it here.
+ */
+ VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
list_del(&mm_slot->mm_node);
free = 1;
}
+ spin_unlock(&khugepaged_mm_lock);
if (free) {
- spin_unlock(&khugepaged_mm_lock);
clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
free_mm_slot(mm_slot);
mmdrop(mm);
} else if (mm_slot) {
- spin_unlock(&khugepaged_mm_lock);
/*
* This is required to serialize against
* khugepaged_test_exit() (which is guaranteed to run
*/
down_write(&mm->mmap_sem);
up_write(&mm->mmap_sem);
- } else
- spin_unlock(&khugepaged_mm_lock);
+ }
}
static void release_pte_page(struct page *page)
VM_BUG_ON(PageLRU(page));
/* If there is no mapped pte young don't collapse the page */
- if (pte_young(pteval))
+ if (pte_young(pteval) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
referenced = 1;
}
if (unlikely(!referenced))
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
struct page **hpage,
- struct vm_area_struct *vma)
+ struct vm_area_struct *vma,
+ int node)
{
pgd_t *pgd;
pud_t *pud;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
#ifndef CONFIG_NUMA
+ up_read(&mm->mmap_sem);
VM_BUG_ON(!*hpage);
new_page = *hpage;
#else
* mmap_sem in read mode is good idea also to allow greater
* scalability.
*/
- new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+ new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+ node, __GFP_OTHER_NODE);
+
+ /*
+ * After allocating the hugepage, release the mmap_sem read lock in
+ * preparation for taking it in write mode.
+ */
+ up_read(&mm->mmap_sem);
if (unlikely(!new_page)) {
- up_read(&mm->mmap_sem);
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
*hpage = ERR_PTR(-ENOMEM);
return;
}
#endif
+
+ count_vm_event(THP_COLLAPSE_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
- up_read(&mm->mmap_sem);
+#ifdef CONFIG_NUMA
put_page(new_page);
+#endif
return;
}
- /* after allocating the hugepage upgrade to mmap_sem write mode */
- up_read(&mm->mmap_sem);
-
/*
* Prevent all access to pagetables with the exception of
* gup_fast later hanlded by the ptep_clear_flush and the VM
if (address < hstart || address + HPAGE_PMD_SIZE > hend)
goto out;
- if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+ if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+ (vma->vm_flags & VM_NOHUGEPAGE))
goto out;
- /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
- if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+ if (!vma->anon_vma || vma->vm_ops)
goto out;
- VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+ if (is_vma_temporary_stack(vma))
+ goto out;
+ /*
+ * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
+ * true too, verify it here.
+ */
+ VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
spin_lock(ptl);
isolated = __collapse_huge_page_isolate(vma, address, pte);
spin_unlock(ptl);
- pte_unmap(pte);
if (unlikely(!isolated)) {
+ pte_unmap(pte);
spin_lock(&mm->page_table_lock);
BUG_ON(!pmd_none(*pmd));
set_pmd_at(mm, address, pmd, _pmd);
spin_unlock(&mm->page_table_lock);
anon_vma_unlock(vma->anon_vma);
- mem_cgroup_uncharge_page(new_page);
goto out;
}
anon_vma_unlock(vma->anon_vma);
__collapse_huge_page_copy(pte, new_page, vma, address, ptl);
+ pte_unmap(pte);
__SetPageUptodate(new_page);
pgtable = pmd_pgtable(_pmd);
VM_BUG_ON(page_count(pgtable) != 1);
return;
out:
+ mem_cgroup_uncharge_page(new_page);
#ifdef CONFIG_NUMA
put_page(new_page);
#endif
struct page *page;
unsigned long _address;
spinlock_t *ptl;
+ int node = -1;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
page = vm_normal_page(vma, _address, pteval);
if (unlikely(!page))
goto out_unmap;
+ /*
+ * Chose the node of the first page. This could
+ * be more sophisticated and look at more pages,
+ * but isn't for now.
+ */
+ if (node == -1)
+ node = page_to_nid(page);
VM_BUG_ON(PageCompound(page));
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
goto out_unmap;
/* cannot use mapcount: can't collapse if there's a gup pin */
if (page_count(page) != 1)
goto out_unmap;
- if (pte_young(pteval))
+ if (pte_young(pteval) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
referenced = 1;
}
if (referenced)
pte_unmap_unlock(pte, ptl);
if (ret)
/* collapse_huge_page will return with the mmap_sem released */
- collapse_huge_page(mm, address, hpage, vma);
+ collapse_huge_page(mm, address, hpage, vma, node);
out:
return ret;
}
break;
}
- if (!(vma->vm_flags & VM_HUGEPAGE) &&
- !khugepaged_always()) {
+ if ((!(vma->vm_flags & VM_HUGEPAGE) &&
+ !khugepaged_always()) ||
+ (vma->vm_flags & VM_NOHUGEPAGE)) {
+ skip:
progress++;
continue;
}
-
- /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
- if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
- khugepaged_scan.address = vma->vm_end;
- progress++;
- continue;
- }
- VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+ if (!vma->anon_vma || vma->vm_ops)
+ goto skip;
+ if (is_vma_temporary_stack(vma))
+ goto skip;
+ /*
+ * If is_pfn_mapping() is true is_learn_pfn_mapping()
+ * must be true too, verify it here.
+ */
+ VM_BUG_ON(is_linear_pfn_mapping(vma) ||
+ vma->vm_flags & VM_NO_THP);
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
- if (hstart >= hend) {
- progress++;
- continue;
- }
+ if (hstart >= hend)
+ goto skip;
+ if (khugepaged_scan.address > hend)
+ goto skip;
if (khugepaged_scan.address < hstart)
khugepaged_scan.address = hstart;
- if (khugepaged_scan.address > hend) {
- khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
- progress++;
- continue;
- }
- BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
while (khugepaged_scan.address < hend) {
int ret;
breakouterloop_mmap_sem:
spin_lock(&khugepaged_mm_lock);
- BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+ VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
/*
* Release the current mm_slot if this mm is about to die, or
* if we scanned all vmas of this mm.
#ifndef CONFIG_NUMA
if (!*hpage) {
*hpage = alloc_hugepage(khugepaged_defrag());
- if (unlikely(!*hpage))
+ if (unlikely(!*hpage)) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
break;
+ }
+ count_vm_event(THP_COLLAPSE_ALLOC);
}
#else
if (IS_ERR(*hpage))
break;
#endif
+ if (unlikely(kthread_should_stop() || freezing(current)))
+ break;
+
spin_lock(&khugepaged_mm_lock);
if (!khugepaged_scan.mm_slot)
pass_through_head++;
do {
hpage = alloc_hugepage(khugepaged_defrag());
- if (!hpage)
+ if (!hpage) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
khugepaged_alloc_sleep();
+ } else
+ count_vm_event(THP_COLLAPSE_ALLOC);
} while (unlikely(!hpage) &&
likely(khugepaged_enabled()));
return hpage;
if (hpage)
put_page(hpage);
#endif
+ try_to_freeze();
+ if (unlikely(kthread_should_stop()))
+ break;
if (khugepaged_has_work()) {
DEFINE_WAIT(wait);
if (!khugepaged_scan_sleep_millisecs)
khugepaged_scan_sleep_millisecs));
remove_wait_queue(&khugepaged_wait, &wait);
} else if (khugepaged_enabled())
- wait_event_interruptible(khugepaged_wait,
- khugepaged_wait_event());
+ wait_event_freezable(khugepaged_wait,
+ khugepaged_wait_event());
}
}
{
struct mm_slot *mm_slot;
+ set_freezable();
set_user_nice(current, 19);
/* serialize with start_khugepaged() */
for (;;) {
mutex_unlock(&khugepaged_mutex);
- BUG_ON(khugepaged_thread != current);
+ VM_BUG_ON(khugepaged_thread != current);
khugepaged_loop();
- BUG_ON(khugepaged_thread != current);
+ VM_BUG_ON(khugepaged_thread != current);
mutex_lock(&khugepaged_mutex);
if (!khugepaged_enabled())
break;
+ if (unlikely(kthread_should_stop()))
+ break;
}
spin_lock(&khugepaged_mm_lock);