Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/signal
[linux-3.10.git] / kernel / fork.c
index 7590bd6..3c31e87 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/cgroup.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
+#include <linux/seccomp.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
@@ -47,6 +48,7 @@
 #include <linux/audit.h>
 #include <linux/memcontrol.h>
 #include <linux/ftrace.h>
+#include <linux/proc_fs.h>
 #include <linux/profile.h>
 #include <linux/rmap.h>
 #include <linux/ksm.h>
@@ -67,6 +69,7 @@
 #include <linux/oom.h>
 #include <linux/khugepaged.h>
 #include <linux/signalfd.h>
+#include <linux/uprobes.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -111,7 +114,11 @@ int nr_processes(void)
        return total;
 }
 
-#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+void __weak arch_release_task_struct(struct task_struct *tsk)
+{
+}
+
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 static struct kmem_cache *task_struct_cachep;
 
 static inline struct task_struct *alloc_task_struct_node(int node)
@@ -119,18 +126,17 @@ static inline struct task_struct *alloc_task_struct_node(int node)
        return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
 }
 
-void __weak arch_release_task_struct(struct task_struct *tsk) { }
-
 static inline void free_task_struct(struct task_struct *tsk)
 {
-       arch_release_task_struct(tsk);
        kmem_cache_free(task_struct_cachep, tsk);
 }
 #endif
 
-#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
+void __weak arch_release_thread_info(struct thread_info *ti)
+{
+}
 
-void __weak arch_release_thread_info(struct thread_info *ti) { }
+#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
 
 /*
  * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
@@ -148,7 +154,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 
 static inline void free_thread_info(struct thread_info *ti)
 {
-       arch_release_thread_info(ti);
        free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
 }
 # else
@@ -162,7 +167,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 
 static void free_thread_info(struct thread_info *ti)
 {
-       arch_release_thread_info(ti);
        kmem_cache_free(thread_info_cache, ti);
 }
 
@@ -203,9 +207,12 @@ static void account_kernel_stack(struct thread_info *ti, int account)
 void free_task(struct task_struct *tsk)
 {
        account_kernel_stack(tsk->stack, -1);
+       arch_release_thread_info(tsk->stack);
        free_thread_info(tsk->stack);
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
+       put_seccomp_filter(tsk);
+       arch_release_task_struct(tsk);
        free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -243,7 +250,7 @@ void __init __weak arch_task_cache_init(void) { }
 
 void __init fork_init(unsigned long mempages)
 {
-#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN     L1_CACHE_BYTES
 #endif
@@ -290,21 +297,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
        int node = tsk_fork_get_node(orig);
        int err;
 
-       prepare_to_copy(orig);
-
        tsk = alloc_task_struct_node(node);
        if (!tsk)
                return NULL;
 
        ti = alloc_thread_info_node(tsk, node);
-       if (!ti) {
-               free_task_struct(tsk);
-               return NULL;
-       }
+       if (!ti)
+               goto free_tsk;
 
        err = arch_dup_task_struct(tsk, orig);
        if (err)
-               goto out;
+               goto free_ti;
 
        tsk->stack = ti;
 
@@ -327,13 +330,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
        tsk->btrace_seq = 0;
 #endif
        tsk->splice_pipe = NULL;
+       tsk->task_frag.page = NULL;
 
        account_kernel_stack(ti, 1);
 
        return tsk;
 
-out:
+free_ti:
        free_thread_info(ti);
+free_tsk:
        free_task_struct(tsk);
        return NULL;
 }
@@ -347,8 +352,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
        unsigned long charge;
        struct mempolicy *pol;
 
+       uprobe_start_dup_mmap();
        down_write(&oldmm->mmap_sem);
        flush_cache_dup_mm(oldmm);
+       uprobe_dup_mmap(oldmm, mm);
        /*
         * Not linked in yet - no deadlock potential:
         */
@@ -377,15 +384,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                struct file *file;
 
                if (mpnt->vm_flags & VM_DONTCOPY) {
-                       long pages = vma_pages(mpnt);
-                       mm->total_vm -= pages;
                        vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
-                                                               -pages);
+                                                       -vma_pages(mpnt));
                        continue;
                }
                charge = 0;
                if (mpnt->vm_flags & VM_ACCOUNT) {
-                       unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
+                       unsigned long len = vma_pages(mpnt);
+
                        if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
                                goto fail_nomem;
                        charge = len;
@@ -418,7 +424,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                                mapping->i_mmap_writable++;
                        flush_dcache_mmap_lock(mapping);
                        /* insert tmp into the share list, just after mpnt */
-                       vma_prio_tree_add(tmp, mpnt);
+                       if (unlikely(tmp->vm_flags & VM_NONLINEAR))
+                               vma_nonlinear_insert(tmp,
+                                               &mapping->i_mmap_nonlinear);
+                       else
+                               vma_interval_tree_insert_after(tmp, mpnt,
+                                                       &mapping->i_mmap);
                        flush_dcache_mmap_unlock(mapping);
                        mutex_unlock(&mapping->i_mmap_mutex);
                }
@@ -459,6 +470,7 @@ out:
        up_write(&mm->mmap_sem);
        flush_tlb_mm(oldmm);
        up_write(&oldmm->mmap_sem);
+       uprobe_end_dup_mmap();
        return retval;
 fail_nomem_anon_vma_fork:
        mpol_put(pol);
@@ -599,6 +611,7 @@ void mmput(struct mm_struct *mm)
        might_sleep();
 
        if (atomic_dec_and_test(&mm->mm_users)) {
+               uprobe_clear_state(mm);
                exit_aio(mm);
                ksm_exit(mm);
                khugepaged_exit(mm); /* must run before exit_mmap */
@@ -609,7 +622,6 @@ void mmput(struct mm_struct *mm)
                        list_del(&mm->mmlist);
                        spin_unlock(&mmlist_lock);
                }
-               put_swap_token(mm);
                if (mm->binfmt)
                        module_put(mm->binfmt->module);
                mmdrop(mm);
@@ -617,26 +629,6 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
-/*
- * We added or removed a vma mapping the executable. The vmas are only mapped
- * during exec and are not mapped with the mmap system call.
- * Callers must hold down_write() on the mm's mmap_sem for these
- */
-void added_exe_file_vma(struct mm_struct *mm)
-{
-       mm->num_exe_file_vmas++;
-}
-
-void removed_exe_file_vma(struct mm_struct *mm)
-{
-       mm->num_exe_file_vmas--;
-       if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
-               fput(mm->exe_file);
-               mm->exe_file = NULL;
-       }
-
-}
-
 void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 {
        if (new_exe_file)
@@ -644,15 +636,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
        if (mm->exe_file)
                fput(mm->exe_file);
        mm->exe_file = new_exe_file;
-       mm->num_exe_file_vmas = 0;
 }
 
 struct file *get_mm_exe_file(struct mm_struct *mm)
 {
        struct file *exe_file;
 
-       /* We need mmap_sem to protect against races with removal of
-        * VM_EXECUTABLE vmas */
+       /* We need mmap_sem to protect against races with removal of exe_file */
        down_read(&mm->mmap_sem);
        exe_file = mm->exe_file;
        if (exe_file)
@@ -777,12 +767,11 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
                exit_pi_state_list(tsk);
 #endif
 
+       uprobe_free_utask(tsk);
+
        /* Get rid of any cached register state */
        deactivate_mm(tsk, mm);
 
-       if (tsk->vfork_done)
-               complete_vfork_done(tsk);
-
        /*
         * If we're exiting normally, clear a user-space tid field if
         * requested.  We leave this alone when dying by signal, to leave
@@ -803,6 +792,13 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
                }
                tsk->clear_child_tid = NULL;
        }
+
+       /*
+        * All done, finally we can wake up parent and return this mm to him.
+        * Also kthread_stop() uses this completion for synchronization.
+        */
+       if (tsk->vfork_done)
+               complete_vfork_done(tsk);
 }
 
 /*
@@ -824,14 +820,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
        memcpy(mm, oldmm, sizeof(*mm));
        mm_init_cpumask(mm);
 
-       /* Initializing for Swap token stuff */
-       mm->token_priority = 0;
-       mm->last_interval = 0;
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        mm->pmd_huge_pte = NULL;
 #endif
-
        if (!mm_init(mm, tsk))
                goto fail_nomem;
 
@@ -905,10 +896,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
                goto fail_nomem;
 
 good_mm:
-       /* Initializing for Swap token stuff */
-       mm->token_priority = 0;
-       mm->last_interval = 0;
-
        tsk->mm = mm;
        tsk->active_mm = mm;
        return 0;
@@ -976,9 +963,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
         * Share io context with parent, if CLONE_IO is set
         */
        if (clone_flags & CLONE_IO) {
-               tsk->io_context = ioc_task_link(ioc);
-               if (unlikely(!tsk->io_context))
-                       return -ENOMEM;
+               ioc_task_link(ioc);
+               tsk->io_context = ioc;
        } else if (ioprio_valid(ioc->ioprio)) {
                new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
                if (unlikely(!new_ioc))
@@ -1077,7 +1063,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
        init_rwsem(&sig->group_rwsem);
 #endif
 
-       sig->oom_adj = current->signal->oom_adj;
        sig->oom_score_adj = current->signal->oom_score_adj;
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
@@ -1144,7 +1129,6 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
  */
 static struct task_struct *copy_process(unsigned long clone_flags,
                                        unsigned long stack_start,
-                                       struct pt_regs *regs,
                                        unsigned long stack_size,
                                        int __user *child_tidptr,
                                        struct pid *pid,
@@ -1152,7 +1136,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 {
        int retval;
        struct task_struct *p;
-       int cgroup_callbacks_done = 0;
 
        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                return ERR_PTR(-EINVAL);
@@ -1192,6 +1175,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                goto fork_out;
 
        ftrace_graph_init_task(p);
+       get_seccomp_filter(p);
 
        rt_mutex_init_task(p);
 
@@ -1238,7 +1222,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        p->utime = p->stime = p->gtime = 0;
        p->utimescaled = p->stimescaled = 0;
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
-       p->prev_utime = p->prev_stime = 0;
+       p->prev_cputime.utime = p->prev_cputime.stime = 0;
 #endif
 #if defined(SPLIT_RSS_COUNTING)
        memset(&p->rss_stat, 0, sizeof(p->rss_stat));
@@ -1275,11 +1259,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
        p->irq_events = 0;
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-       p->hardirqs_enabled = 1;
-#else
        p->hardirqs_enabled = 0;
-#endif
        p->hardirq_enable_ip = 0;
        p->hardirq_enable_event = 0;
        p->hardirq_disable_ip = _THIS_IP_;
@@ -1301,7 +1281,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
        p->blocked_on = NULL; /* not blocked yet */
 #endif
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
        p->memcg_batch.do_batch = 0;
        p->memcg_batch.memcg = NULL;
 #endif
@@ -1340,7 +1320,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        retval = copy_io(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_namespaces;
-       retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
+       retval = copy_thread(clone_flags, stack_start, stack_size, p);
        if (retval)
                goto bad_fork_cleanup_io;
 
@@ -1372,6 +1352,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        INIT_LIST_HEAD(&p->pi_state_list);
        p->pi_state_cache = NULL;
 #endif
+       uprobe_copy_process(p);
        /*
         * sigaltstack should be cleared when sharing the same VM
         */
@@ -1410,12 +1391,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         */
        p->group_leader = p;
        INIT_LIST_HEAD(&p->thread_group);
-
-       /* Now that the task is set up, run cgroup callbacks if
-        * necessary. We need to run them before the task is visible
-        * on the tasklist. */
-       cgroup_fork_callbacks(p);
-       cgroup_callbacks_done = 1;
+       p->task_works = NULL;
 
        /* Need tasklist lock for parent etc handling! */
        write_lock_irq(&tasklist_lock);
@@ -1494,6 +1470,8 @@ bad_fork_cleanup_io:
        if (p->io_context)
                exit_io_context(p);
 bad_fork_cleanup_namespaces:
+       if (unlikely(clone_flags & CLONE_NEWPID))
+               pid_ns_release_proc(p->nsproxy->pid_ns);
        exit_task_namespaces(p);
 bad_fork_cleanup_mm:
        if (p->mm)
@@ -1519,7 +1497,7 @@ bad_fork_cleanup_cgroup:
 #endif
        if (clone_flags & CLONE_THREAD)
                threadgroup_change_end(current);
-       cgroup_exit(p, cgroup_callbacks_done);
+       cgroup_exit(p, 0);
        delayacct_tsk_free(p);
        module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
@@ -1531,12 +1509,6 @@ fork_out:
        return ERR_PTR(retval);
 }
 
-noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
-{
-       memset(regs, 0, sizeof(struct pt_regs));
-       return regs;
-}
-
 static inline void init_idle_pids(struct pid_link *links)
 {
        enum pid_type type;
@@ -1550,10 +1522,7 @@ static inline void init_idle_pids(struct pid_link *links)
 struct task_struct * __cpuinit fork_idle(int cpu)
 {
        struct task_struct *task;
-       struct pt_regs regs;
-
-       task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
-                           &init_struct_pid, 0);
+       task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
        if (!IS_ERR(task)) {
                init_idle_pids(task->pids);
                init_idle(task, cpu);
@@ -1570,7 +1539,6 @@ struct task_struct * __cpuinit fork_idle(int cpu)
  */
 long do_fork(unsigned long clone_flags,
              unsigned long stack_start,
-             struct pt_regs *regs,
              unsigned long stack_size,
              int __user *parent_tidptr,
              int __user *child_tidptr)
@@ -1600,7 +1568,7 @@ long do_fork(unsigned long clone_flags,
         * requested, no event is reported; otherwise, report if the event
         * for the type of forking is enabled.
         */
-       if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
+       if (!(clone_flags & CLONE_UNTRACED)) {
                if (clone_flags & CLONE_VFORK)
                        trace = PTRACE_EVENT_VFORK;
                else if ((clone_flags & CSIGNAL) != SIGCHLD)
@@ -1612,7 +1580,7 @@ long do_fork(unsigned long clone_flags,
                        trace = 0;
        }
 
-       p = copy_process(clone_flags, stack_start, regs, stack_size,
+       p = copy_process(clone_flags, stack_start, stack_size,
                         child_tidptr, NULL, trace);
        /*
         * Do this prior waking up the new thread - the thread pointer
@@ -1650,6 +1618,60 @@ long do_fork(unsigned long clone_flags,
        return nr;
 }
 
+#ifdef CONFIG_GENERIC_KERNEL_THREAD
+/*
+ * Create a kernel thread.
+ */
+pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{
+       return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
+               (unsigned long)arg, NULL, NULL);
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_FORK
+SYSCALL_DEFINE0(fork)
+{
+#ifdef CONFIG_MMU
+       return do_fork(SIGCHLD, 0, 0, NULL, NULL);
+#else
+       /* can not support in nommu mode */
+       return(-EINVAL);
+#endif
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_VFORK
+SYSCALL_DEFINE0(vfork)
+{
+       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 
+                       0, NULL, NULL);
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_CLONE
+#ifdef CONFIG_CLONE_BACKWARDS
+SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
+                int __user *, parent_tidptr,
+                int, tls_val,
+                int __user *, child_tidptr)
+#elif defined(CONFIG_CLONE_BACKWARDS2)
+SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
+                int __user *, parent_tidptr,
+                int __user *, child_tidptr,
+                int, tls_val)
+#else
+SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
+                int __user *, parent_tidptr,
+                int __user *, child_tidptr,
+                int, tls_val)
+#endif
+{
+       return do_fork(clone_flags, newsp, 0,
+               parent_tidptr, child_tidptr);
+}
+#endif
+
 #ifndef ARCH_MIN_MMSTRUCT_ALIGN
 #define ARCH_MIN_MMSTRUCT_ALIGN 0
 #endif