mm: implement access_remote_vm
diff --git a/mm/memory.c b/mm/memory.c
index c50a195..468f507 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -804,6 +804,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
                next = pmd_addr_end(addr, end);
                if (pmd_trans_huge(*src_pmd)) {
                        int err;
+                       VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
                        err = copy_huge_pmd(dst_mm, src_mm,
                                            dst_pmd, src_pmd, addr, vma);
                        if (err == -ENOMEM)
@@ -1015,9 +1016,10 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_trans_huge(*pmd)) {
-                       if (next-addr != HPAGE_PMD_SIZE)
+                       if (next-addr != HPAGE_PMD_SIZE) {
+                               VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
                                split_huge_page_pmd(vma->vm_mm, pmd);
-                       else if (zap_huge_pmd(tlb, vma, pmd)) {
+                       } else if (zap_huge_pmd(tlb, vma, pmd)) {
                                (*zap_work)--;
                                continue;
                        }
@@ -1288,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
        pud = pud_offset(pgd, address);
        if (pud_none(*pud))
                goto no_page_table;
-       if (pud_huge(*pud)) {
+       if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
                BUG_ON(flags & FOLL_GET);
                page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
                goto out;
@@ -1305,6 +1307,10 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                goto out;
        }
        if (pmd_trans_huge(*pmd)) {
+               if (flags & FOLL_SPLIT) {
+                       split_huge_page_pmd(mm, pmd);
+                       goto split_fallthrough;
+               }
                spin_lock(&mm->page_table_lock);
                if (likely(pmd_trans_huge(*pmd))) {
                        if (unlikely(pmd_trans_splitting(*pmd))) {
@@ -1320,6 +1326,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                        spin_unlock(&mm->page_table_lock);
                /* fall through */
        }
+split_fallthrough:
        if (unlikely(pmd_bad(*pmd)))
                goto no_page_table;
 
@@ -1403,6 +1410,55 @@ no_page_table:
        return page;
 }
 
+/**
+ * __get_user_pages() - pin user pages in memory
+ * @tsk:       task_struct of target task
+ * @mm:                mm_struct of target mm
+ * @start:     starting user address
+ * @nr_pages:  number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages:     array that receives pointers to the pages pinned.
+ *             Should be at least nr_pages long. Or NULL, if caller
+ *             only intends to ensure the pages are faulted in.
+ * @vmas:      array of pointers to vmas corresponding to each page.
+ *             Or NULL if the caller does not require them.
+ * @nonblocking: whether waiting for disk IO or mmap_sem contention
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with. vmas will only
+ * remain valid while mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * __get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * __get_user_pages returns, and there may even be a completely different
+ * page there in some cases (e.g. if mmapped pagecache has been invalidated
+ * and subsequently re-faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
+ * the page is written to, set_page_dirty (or set_page_dirty_lock, as
+ * appropriate) must be called after the page is finished with, and
+ * before put_page is called.
+ *
+ * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
+ * or mmap_sem contention, and if waiting is needed to pin all pages,
+ * *@nonblocking will be set to 0.
+ *
+ * In most cases, get_user_pages or get_user_pages_fast should be used
+ * instead of __get_user_pages. __get_user_pages should be used only if
+ * you need some special @gup_flags.
+ */
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                     unsigned long start, int nr_pages, unsigned int gup_flags,
                     struct page **pages, struct vm_area_struct **vmas,
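
The kerneldoc above fixes the usage contract for callers: hold mmap_sem across the call, release every returned page with put_page(), and, when FOLL_WRITE was requested and the page was actually written, mark it dirty before that final put_page(). A minimal sketch of that discipline, using the get_user_pages() wrapper that the comment recommends for ordinary callers (the helper name example_touch_user_byte and its error handling are illustrative, not part of this patch):

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/string.h>

/*
 * Illustrative sketch: pin one writable page of the current task, touch it
 * through a kernel mapping, then follow the rules documented above:
 * set_page_dirty_lock() before the final put_page().
 */
static int example_touch_user_byte(unsigned long uaddr)
{
        struct page *page;
        void *kaddr;
        int ret;

        down_read(&current->mm->mmap_sem);
        /* write=1, force=0, one page, no vma array needed */
        ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
                             1, 1, 0, &page, NULL);
        up_read(&current->mm->mmap_sem);
        if (ret != 1)
                return ret < 0 ? ret : -EFAULT;

        kaddr = kmap(page);
        memset(kaddr + (uaddr & ~PAGE_MASK), 0, 1);     /* write one byte */
        kunmap(page);

        set_page_dirty_lock(page);      /* dirty before releasing the page */
        put_page(page);
        return 0;
}

In this tree get_user_pages() passes a NULL @nonblocking down to __get_user_pages(), i.e. the plain blocking behaviour described above.
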
@@ -1430,9 +1486,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                struct vm_area_struct *vma;
 
                vma = find_extend_vma(mm, start);
-               if (!vma && in_gate_area(tsk, start)) {
+               if (!vma && in_gate_area(mm, start)) {
                        unsigned long pg = start & PAGE_MASK;
-                       struct vm_area_struct *gate_vma = get_gate_vma(tsk);
+                       struct vm_area_struct *gate_vma = get_gate_vma(mm);
                        pgd_t *pgd;
                        pud_t *pud;
                        pmd_t *pmd;
@@ -1451,6 +1507,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                        pmd = pmd_offset(pud, pg);
                        if (pmd_none(*pmd))
                                return i ? : -EFAULT;
+                       VM_BUG_ON(pmd_trans_huge(*pmd));
                        pte = pte_offset_map(pmd, pg);
                        if (pte_none(*pte)) {
                                pte_unmap(pte);
@@ -1519,16 +1576,26 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                if (ret & VM_FAULT_ERROR) {
                                        if (ret & VM_FAULT_OOM)
                                                return i ? i : -ENOMEM;
-                                       if (ret &
-                                           (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
-                                            VM_FAULT_SIGBUS))
+                                       if (ret & (VM_FAULT_HWPOISON |
+                                                  VM_FAULT_HWPOISON_LARGE)) {
+                                               if (i)
+                                                       return i;
+                                               else if (gup_flags & FOLL_HWPOISON)
+                                                       return -EHWPOISON;
+                                               else
+                                                       return -EFAULT;
+                                       }
+                                       if (ret & VM_FAULT_SIGBUS)
                                                return i ? i : -EFAULT;
                                        BUG();
                                }
-                               if (ret & VM_FAULT_MAJOR)
-                                       tsk->maj_flt++;
-                               else
-                                       tsk->min_flt++;
+
+                               if (tsk) {
+                                       if (ret & VM_FAULT_MAJOR)
+                                               tsk->maj_flt++;
+                                       else
+                                               tsk->min_flt++;
+                               }
 
                                if (ret & VM_FAULT_RETRY) {
                                        *nonblocking = 0;
@@ -1570,10 +1637,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
        } while (nr_pages);
        return i;
 }
+EXPORT_SYMBOL(__get_user_pages);
 
 /**
  * get_user_pages() - pin user pages in memory
- * @tsk:       task_struct of target task
+ * @tsk:       the task_struct to use for page fault accounting, or
+ *             NULL if faults are not to be recorded.
  * @mm:                mm_struct of target mm
  * @start:     starting user address
  * @nr_pages:  number of pages from start to pin
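
With @tsk now documented as optional, a caller that operates on a foreign mm and does not want the fault charged to any task's maj_flt/min_flt counters can pass NULL, matching the new "if (tsk)" guard added above. A hedged sketch (the helper name is illustrative; the caller is assumed to already hold a reference on @mm):

#include <linux/mm.h>
#include <linux/sched.h>

/*
 * Illustrative only: pin one page of a foreign mm for reading, without
 * charging the fault to any task. Taking and dropping the mm reference is
 * the caller's responsibility.
 */
static int example_pin_foreign_page(struct mm_struct *mm, unsigned long addr,
                                    struct page **pagep)
{
        int ret;

        down_read(&mm->mmap_sem);
        ret = get_user_pages(NULL, mm, addr & PAGE_MASK, 1,
                             0 /* write */, 0 /* force */, pagep, NULL);
        up_read(&mm->mmap_sem);

        if (ret != 1)
                return ret < 0 ? ret : -EFAULT;
        return 0;
}
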
@@ -1675,8 +1744,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
        pud_t * pud = pud_alloc(mm, pgd, addr);
        if (pud) {
                pmd_t * pmd = pmd_alloc(mm, pud, addr);
-               if (pmd)
+               if (pmd) {
+                       VM_BUG_ON(pmd_trans_huge(*pmd));
                        return pte_alloc_map_lock(mm, pmd, addr, ptl);
+               }
        }
        return NULL;
 }
@@ -1895,6 +1966,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
                return -ENOMEM;
+       VM_BUG_ON(pmd_trans_huge(*pmd));
        do {
                next = pmd_addr_end(addr, end);
                if (remap_pte_range(mm, pmd, addr, next,
@@ -2104,10 +2176,10 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);
  * handle_pte_fault chooses page fault handler according to an entry
  * which was read non-atomically.  Before making any commitment, on
  * those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_file_page
+ * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
  * must check under lock before unmapping the pte and proceeding
  * (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page and do_no_page can safely check later on).
+ * and do_anonymous_page can safely check later on).
  */
 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
                                pte_t *page_table, pte_t orig_pte)
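
The comment adjusted here explains why fault handlers that sampled a pte without the page table lock must re-check it under the lock before committing to anything. A condensed sketch of that re-check pattern, essentially what pte_unmap_same() itself does, shown only for illustration:

#include <linux/mm.h>

/*
 * Illustrative sketch: re-read the pte under the page table lock and only
 * proceed if it still matches the value sampled earlier without the lock.
 * Where pte reads are atomic (no PAE-style split words) the lock round
 * trip is skipped, mirroring pte_unmap_same().
 */
static inline int example_pte_still_same(struct mm_struct *mm, pmd_t *pmd,
                                         pte_t *page_table, pte_t orig_pte)
{
        int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
        if (sizeof(pte_t) > sizeof(unsigned long)) {
                spinlock_t *ptl = pte_lockptr(mm, pmd);
                spin_lock(ptl);
                same = pte_same(*page_table, orig_pte);
                spin_unlock(ptl);
        }
#endif
        pte_unmap(page_table);
        return same;
}
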
@@ -2208,7 +2280,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                                         &ptl);
                        if (!pte_same(*page_table, orig_pte)) {
                                unlock_page(old_page);
-                               page_cache_release(old_page);
                                goto unlock;
                        }
                        page_cache_release(old_page);
@@ -2278,7 +2349,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                                         &ptl);
                        if (!pte_same(*page_table, orig_pte)) {
                                unlock_page(old_page);
-                               page_cache_release(old_page);
                                goto unlock;
                        }
 
@@ -2305,7 +2375,7 @@ reuse:
                 * bit after it clear all dirty ptes, but before a racing
                 * do_wp_page installs a dirty pte.
                 *
-                * do_no_page is protected similarly.
+                * __do_fault is protected similarly.
                 */
                if (!page_mkwrite) {
                        wait_on_page_locked(dirty_page);
@@ -2356,16 +2426,6 @@ gotten:
        }
        __SetPageUptodate(new_page);
 
-       /*
-        * Don't let another task, with possibly unlocked vma,
-        * keep the mlocked page.
-        */
-       if ((vma->vm_flags & VM_LOCKED) && old_page) {
-               lock_page(old_page);    /* for LRU manipulation */
-               clear_page_mlock(old_page);
-               unlock_page(old_page);
-       }
-
        if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
                goto oom_free_new;
 
@@ -2433,10 +2493,20 @@ gotten:
 
        if (new_page)
                page_cache_release(new_page);
-       if (old_page)
-               page_cache_release(old_page);
 unlock:
        pte_unmap_unlock(page_table, ptl);
+       if (old_page) {
+               /*
+                * Don't let another task, with possibly unlocked vma,
+                * keep the mlocked page.
+                */
+               if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+                       lock_page(old_page);    /* LRU manipulation */
+                       munlock_vma_page(old_page);
+                       unlock_page(old_page);
+               }
+               page_cache_release(old_page);
+       }
        return ret;
 oom_free_new:
        page_cache_release(new_page);
@@ -2639,6 +2709,7 @@ void unmap_mapping_range(struct address_space *mapping,
                details.last_index = ULONG_MAX;
        details.i_mmap_lock = &mapping->i_mmap_lock;
 
+       mutex_lock(&mapping->unmap_mutex);
        spin_lock(&mapping->i_mmap_lock);
 
        /* Protect against endless unmapping loops */
@@ -2655,6 +2726,7 @@ void unmap_mapping_range(struct address_space *mapping,
        if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
                unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
        spin_unlock(&mapping->i_mmap_lock);
+       mutex_unlock(&mapping->unmap_mutex);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
@@ -3042,12 +3114,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                                goto out;
                        }
                        charged = 1;
-                       /*
-                        * Don't let another task, with possibly unlocked vma,
-                        * keep the mlocked page.
-                        */
-                       if (vma->vm_flags & VM_LOCKED)
-                               clear_page_mlock(vmf.page);
                        copy_user_highpage(page, vmf.page, address, vma);
                        __SetPageUptodate(page);
                } else {
@@ -3434,7 +3500,7 @@ static int __init gate_vma_init(void)
 __initcall(gate_vma_init);
 #endif
 
-struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
 {
 #ifdef AT_SYSINFO_EHDR
        return &gate_vma;
@@ -3443,7 +3509,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
 #endif
 }
 
-int in_gate_area_no_task(unsigned long addr)
+int in_gate_area_no_mm(unsigned long addr)
 {
 #ifdef AT_SYSINFO_EHDR
        if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
@@ -3471,6 +3537,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,
                goto out;
 
        pmd = pmd_offset(pud, address);
+       VM_BUG_ON(pmd_trans_huge(*pmd));
        if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
                goto out;
 
@@ -3583,20 +3650,15 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 #endif
 
 /*
- * Access another process' address space.
- * Source/target buffer must be kernel space,
- * Do not walk the page table directly, use get_user_pages
+ * Access another process' address space as given in mm.  If non-NULL, use the
+ * given task for page fault accounting.
  */
-int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+               unsigned long addr, void *buf, int len, int write)
 {
-       struct mm_struct *mm;
        struct vm_area_struct *vma;
        void *old_buf = buf;
 
-       mm = get_task_mm(tsk);
-       if (!mm)
-               return 0;
-
        down_read(&mm->mmap_sem);
        /* ignore errors, just check how much was successfully transferred */
        while (len) {
@@ -3645,11 +3707,47 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
                addr += bytes;
        }
        up_read(&mm->mmap_sem);
-       mmput(mm);
 
        return buf - old_buf;
 }
 
+/**
+ * access_remote_vm - access another process' address space
+ * @mm:                the mm_struct of the target address space
+ * @addr:      start address to access
+ * @buf:       source or destination buffer
+ * @len:       number of bytes to transfer
+ * @write:     whether the access is a write
+ *
+ * The caller must hold a reference on @mm.
+ */
+int access_remote_vm(struct mm_struct *mm, unsigned long addr,
+               void *buf, int len, int write)
+{
+       return __access_remote_vm(NULL, mm, addr, buf, len, write);
+}
+
+/*
+ * Access another process' address space.
+ * Source/target buffer must be kernel space,
+ * Do not walk the page table directly, use get_user_pages
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr,
+               void *buf, int len, int write)
+{
+       struct mm_struct *mm;
+       int ret;
+
+       mm = get_task_mm(tsk);
+       if (!mm)
+               return 0;
+
+       ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
+       mmput(mm);
+
+       return ret;
+}
+
 /*
  * Print the name of a VMA.
  */
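
For reference, a hedged sketch of how the new access_remote_vm() is meant to be used by a caller that starts from a task (the helper name and error code are illustrative; the kerneldoc only requires that the caller hold a reference on @mm):

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/errno.h>

/*
 * Illustrative only: read @len bytes at @addr in another task's address
 * space via access_remote_vm(). get_task_mm()/mmput() provide the mm
 * reference that the kerneldoc requires.
 */
static int example_peek_remote(struct task_struct *task, unsigned long addr,
                               void *buf, int len)
{
        struct mm_struct *mm = get_task_mm(task);
        int copied;

        if (!mm)
                return -ESRCH;
        copied = access_remote_vm(mm, addr, buf, len, 0 /* read */);
        mmput(mm);
        return copied;
}

Because __access_remote_vm() is reached with a NULL @tsk on this path, no maj_flt/min_flt accounting is done, consistent with the NULL-tsk handling added to __get_user_pages() earlier in this patch.
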