[PATCH] Hugetlb: Copy on Write support
David Gibson [Fri, 6 Jan 2006 08:10:44 +0000 (00:10 -0800)]
Implement copy-on-write support for hugetlb mappings so MAP_PRIVATE can be
supported.  This helps us to safely use hugetlb pages in many more
applications.  The patch makes the following changes.  If needed, I also have
it broken out according to the following paragraphs.

1. Add a pair of functions to set/clear write access on huge ptes.  The
   writable check in make_huge_pte is moved out to the caller for use by COW
   later.

2. Hugetlb copy-on-write requires special case handling in the following
   situations:

   - copy_hugetlb_page_range() - Copied pages must be write protected so
     a COW fault will be triggered (if necessary) if those pages are written
     to.

   - find_or_alloc_huge_page() - Only MAP_SHARED pages are added to the
     page cache.  MAP_PRIVATE pages still need to be locked however.

3. Provide hugetlb_cow(), called from hugetlb_fault() and
   hugetlb_no_page(), which handles the COW fault by making the actual copy.

4. Remove the check in hugetlbfs_file_mmap() so that MAP_PRIVATE mmaps
   will be allowed.  Make MAP_HUGETLB exempt from the deprecated VM_RESERVED
   mapping check.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Adam Litke <agl@us.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

fs/hugetlbfs/inode.c
mm/hugetlb.c

index 8c1cef3..8c41315 100644 (file)
@@ -100,9 +100,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
        loff_t len, vma_len;
        int ret;
 
-       if ((vma->vm_flags & (VM_MAYSHARE | VM_WRITE)) == VM_WRITE)
-               return -EINVAL;
-
        if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1))
                return -EINVAL;
 
index cf82251..da8a211 100644 (file)
@@ -261,11 +261,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
        .nopage = hugetlb_nopage,
 };
 
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+                               int writable)
 {
        pte_t entry;
 
-       if (vma->vm_flags & VM_WRITE) {
+       if (writable) {
                entry =
                    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
        } else {
@@ -277,12 +278,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
        return entry;
 }
 
+static void set_huge_ptep_writable(struct vm_area_struct *vma,
+                                  unsigned long address, pte_t *ptep)
+{
+       pte_t entry;
+
+       entry = pte_mkwrite(pte_mkdirty(*ptep));
+       ptep_set_access_flags(vma, address, ptep, entry, 1);
+       update_mmu_cache(vma, address, entry);
+       lazy_mmu_prot_update(entry);
+}
+
+
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                            struct vm_area_struct *vma)
 {
        pte_t *src_pte, *dst_pte, entry;
        struct page *ptepage;
        unsigned long addr;
+       int cow;
+
+       cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
        for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
                src_pte = huge_pte_offset(src, addr);
@@ -294,6 +310,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                spin_lock(&dst->page_table_lock);
                spin_lock(&src->page_table_lock);
                if (!pte_none(*src_pte)) {
+                       if (cow)
+                               ptep_set_wrprotect(src, addr, src_pte);
                        entry = *src_pte;
                        ptepage = pte_page(entry);
                        get_page(ptepage);
@@ -346,7 +364,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 }
 
 static struct page *find_or_alloc_huge_page(struct address_space *mapping,
-                                               unsigned long idx)
+                               unsigned long idx, int shared)
 {
        struct page *page;
        int err;
@@ -364,26 +382,80 @@ retry:
                goto out;
        }
 
-       err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
-       if (err) {
-               put_page(page);
-               hugetlb_put_quota(mapping);
-               if (err == -EEXIST)
-                       goto retry;
-               page = NULL;
+       if (shared) {
+               err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+               if (err) {
+                       put_page(page);
+                       hugetlb_put_quota(mapping);
+                       if (err == -EEXIST)
+                               goto retry;
+                       page = NULL;
+               }
+       } else {
+               /* Caller expects a locked page */
+               lock_page(page);
        }
 out:
        return page;
 }
 
+static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+                       unsigned long address, pte_t *ptep, pte_t pte)
+{
+       struct page *old_page, *new_page;
+       int i, avoidcopy;
+
+       old_page = pte_page(pte);
+
+       /* If no-one else is actually using this page, avoid the copy
+        * and just make the page writable */
+       avoidcopy = (page_count(old_page) == 1);
+       if (avoidcopy) {
+               set_huge_ptep_writable(vma, address, ptep);
+               return VM_FAULT_MINOR;
+       }
+
+       page_cache_get(old_page);
+       new_page = alloc_huge_page();
+
+       if (!new_page) {
+               page_cache_release(old_page);
+
+               /* Logically this is OOM, not a SIGBUS, but an OOM
+                * could cause the kernel to go killing other
+                * processes which won't help the hugepage situation
+                * at all (?) */
+               return VM_FAULT_SIGBUS;
+       }
+
+       spin_unlock(&mm->page_table_lock);
+       for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
+               copy_user_highpage(new_page + i, old_page + i,
+                                  address + i*PAGE_SIZE);
+       spin_lock(&mm->page_table_lock);
+
+       ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+       if (likely(pte_same(*ptep, pte))) {
+               /* Break COW */
+               set_huge_pte_at(mm, address, ptep,
+                               make_huge_pte(vma, new_page, 1));
+               /* Make the old page be freed below */
+               new_page = old_page;
+       }
+       page_cache_release(new_page);
+       page_cache_release(old_page);
+       return VM_FAULT_MINOR;
+}
+
 int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
-                       unsigned long address, pte_t *ptep)
+                       unsigned long address, pte_t *ptep, int write_access)
 {
        int ret = VM_FAULT_SIGBUS;
        unsigned long idx;
        unsigned long size;
        struct page *page;
        struct address_space *mapping;
+       pte_t new_pte;
 
        mapping = vma->vm_file->f_mapping;
        idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -393,10 +465,13 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
         * Use page lock to guard against racing truncation
         * before we get page_table_lock.
         */
-       page = find_or_alloc_huge_page(mapping, idx);
+       page = find_or_alloc_huge_page(mapping, idx,
+                       vma->vm_flags & VM_SHARED);
        if (!page)
                goto out;
 
+       BUG_ON(!PageLocked(page));
+
        spin_lock(&mm->page_table_lock);
        size = i_size_read(mapping->host) >> HPAGE_SHIFT;
        if (idx >= size)
@@ -407,7 +482,15 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
                goto backout;
 
        add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
-       set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page));
+       new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+                               && (vma->vm_flags & VM_SHARED)));
+       set_huge_pte_at(mm, address, ptep, new_pte);
+
+       if (write_access && !(vma->vm_flags & VM_SHARED)) {
+               /* Optimization, do the COW without a second fault */
+               ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+       }
+
        spin_unlock(&mm->page_table_lock);
        unlock_page(page);
 out:
@@ -426,6 +509,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 {
        pte_t *ptep;
        pte_t entry;
+       int ret;
 
        ptep = huge_pte_alloc(mm, address);
        if (!ptep)
@@ -433,13 +517,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
        entry = *ptep;
        if (pte_none(entry))
-               return hugetlb_no_page(mm, vma, address, ptep);
+               return hugetlb_no_page(mm, vma, address, ptep, write_access);
 
-       /*
-        * We could get here if another thread instantiated the pte
-        * before the test above.
-        */
-       return VM_FAULT_MINOR;
+       ret = VM_FAULT_MINOR;
+
+       spin_lock(&mm->page_table_lock);
+       /* Check for a racing update before calling hugetlb_cow */
+       if (likely(pte_same(entry, *ptep)))
+               if (write_access && !pte_write(entry))
+                       ret = hugetlb_cow(mm, vma, address, ptep, entry);
+       spin_unlock(&mm->page_table_lock);
+
+       return ret;
 }
 
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,