Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/signal
diff --git a/kernel/fork.c b/kernel/fork.c
index a9e99f3..3c31e87 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@
 #include <linux/cgroup.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
+#include <linux/seccomp.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
@@ -47,6 +48,7 @@
 #include <linux/audit.h>
 #include <linux/memcontrol.h>
 #include <linux/ftrace.h>
+#include <linux/proc_fs.h>
 #include <linux/profile.h>
 #include <linux/rmap.h>
 #include <linux/ksm.h>
@@ -67,6 +69,7 @@
 #include <linux/oom.h>
 #include <linux/khugepaged.h>
 #include <linux/signalfd.h>
+#include <linux/uprobes.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -111,24 +114,40 @@ int nr_processes(void)
        return total;
 }
 
-#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-# define alloc_task_struct_node(node)          \
-               kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
-# define free_task_struct(tsk)                 \
-               kmem_cache_free(task_struct_cachep, (tsk))
+void __weak arch_release_task_struct(struct task_struct *tsk)
+{
+}
+
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 static struct kmem_cache *task_struct_cachep;
+
+static inline struct task_struct *alloc_task_struct_node(int node)
+{
+       return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
+}
+
+static inline void free_task_struct(struct task_struct *tsk)
+{
+       kmem_cache_free(task_struct_cachep, tsk);
+}
 #endif
 
-#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
+void __weak arch_release_thread_info(struct thread_info *ti)
+{
+}
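
The two __weak stubs above give every architecture a no-op default for releasing per-task state; an architecture that keeps extra state overrides a hook simply by providing a non-weak definition of the same name, which the linker prefers at link time. A minimal arch-side sketch (the cleanup helper is hypothetical):

    /* arch/<arch>/kernel/process.c -- hypothetical override; a strong
     * definition takes precedence over the __weak default in kernel/fork.c */
    void arch_release_thread_info(struct thread_info *ti)
    {
            free_my_arch_state(ti);         /* hypothetical per-arch cleanup */
    }
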
+
+#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+
+/*
+ * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
+ * kmem_cache-based allocator.
+ */
+# if THREAD_SIZE >= PAGE_SIZE
 static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
                                                  int node)
 {
-#ifdef CONFIG_DEBUG_STACK_USAGE
-       gfp_t mask = GFP_KERNEL | __GFP_ZERO;
-#else
-       gfp_t mask = GFP_KERNEL;
-#endif
-       struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
+       struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+                                            THREAD_SIZE_ORDER);
 
        return page ? page_address(page) : NULL;
 }
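
The open-coded GFP mask (conditionally adding __GFP_ZERO under CONFIG_DEBUG_STACK_USAGE) is folded into a single THREADINFO_GFP definition, so every thread_info allocation site agrees. Around this era the header definition looked roughly like the following (a sketch of <linux/thread_info.h>, quoted from memory):

    #ifdef CONFIG_DEBUG_STACK_USAGE
    # define THREADINFO_GFP         (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
    #else
    # define THREADINFO_GFP         (GFP_KERNEL | __GFP_NOTRACK)
    #endif
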
@@ -137,6 +156,27 @@ static inline void free_thread_info(struct thread_info *ti)
 {
        free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
 }
+# else
+static struct kmem_cache *thread_info_cache;
+
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+                                                 int node)
+{
+       return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
+}
+
+static void free_thread_info(struct thread_info *ti)
+{
+       kmem_cache_free(thread_info_cache, ti);
+}
+
+void thread_info_cache_init(void)
+{
+       thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
+                                             THREAD_SIZE, 0, NULL);
+       BUG_ON(thread_info_cache == NULL);
+}
+# endif
 #endif
 
 /* SLAB cache for signal_struct structures (tsk->signal) */
@@ -167,9 +207,12 @@ static void account_kernel_stack(struct thread_info *ti, int account)
 void free_task(struct task_struct *tsk)
 {
        account_kernel_stack(tsk->stack, -1);
+       arch_release_thread_info(tsk->stack);
        free_thread_info(tsk->stack);
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
+       put_seccomp_filter(tsk);
+       arch_release_task_struct(tsk);
        free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -193,6 +236,7 @@ void __put_task_struct(struct task_struct *tsk)
        WARN_ON(atomic_read(&tsk->usage));
        WARN_ON(tsk == current);
 
+       security_task_free(tsk);
        exit_creds(tsk);
        delayacct_tsk_free(tsk);
        put_signal_struct(tsk->signal);
@@ -202,17 +246,11 @@ void __put_task_struct(struct task_struct *tsk)
 }
 EXPORT_SYMBOL_GPL(__put_task_struct);
 
-/*
- * macro override instead of weak attribute alias, to workaround
- * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
- */
-#ifndef arch_task_cache_init
-#define arch_task_cache_init()
-#endif
+void __init __weak arch_task_cache_init(void) { }
 
 void __init fork_init(unsigned long mempages)
 {
-#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN     L1_CACHE_BYTES
 #endif
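
With CONFIG_ARCH_TASK_STRUCT_ALLOCATOR unset, fork_init() continues just past this hunk by creating the slab cache that the new alloc_task_struct_node() helper draws from; roughly (a sketch of the lines that follow, not part of this diff):

    /* cache setup that follows in fork_init() */
    task_struct_cachep =
            kmem_cache_create("task_struct", sizeof(struct task_struct),
                              ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
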
@@ -259,21 +297,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
        int node = tsk_fork_get_node(orig);
        int err;
 
-       prepare_to_copy(orig);
-
        tsk = alloc_task_struct_node(node);
        if (!tsk)
                return NULL;
 
        ti = alloc_thread_info_node(tsk, node);
-       if (!ti) {
-               free_task_struct(tsk);
-               return NULL;
-       }
+       if (!ti)
+               goto free_tsk;
 
        err = arch_dup_task_struct(tsk, orig);
        if (err)
-               goto out;
+               goto free_ti;
 
        tsk->stack = ti;
 
@@ -296,13 +330,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
        tsk->btrace_seq = 0;
 #endif
        tsk->splice_pipe = NULL;
+       tsk->task_frag.page = NULL;
 
        account_kernel_stack(ti, 1);
 
        return tsk;
 
-out:
+free_ti:
        free_thread_info(ti);
+free_tsk:
        free_task_struct(tsk);
        return NULL;
 }
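
Replacing the single out: label with free_ti:/free_tsk: is the standard kernel unwind idiom: each label releases exactly one resource, and an error path jumps to the label matching how far setup got, falling through the remaining labels. The shape in the abstract (hypothetical names):

    a = alloc_a();
    if (!a)
            return NULL;
    b = alloc_b();
    if (!b)
            goto free_a;
    if (setup(a, b))
            goto free_b;
    return a;               /* success */
    free_b:
            free_b(b);
    free_a:
            free_a(a);
    return NULL;
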
@@ -316,8 +352,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
        unsigned long charge;
        struct mempolicy *pol;
 
+       uprobe_start_dup_mmap();
        down_write(&oldmm->mmap_sem);
        flush_cache_dup_mm(oldmm);
+       uprobe_dup_mmap(oldmm, mm);
        /*
         * Not linked in yet - no deadlock potential:
         */
@@ -346,16 +384,15 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                struct file *file;
 
                if (mpnt->vm_flags & VM_DONTCOPY) {
-                       long pages = vma_pages(mpnt);
-                       mm->total_vm -= pages;
                        vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
-                                                               -pages);
+                                                       -vma_pages(mpnt));
                        continue;
                }
                charge = 0;
                if (mpnt->vm_flags & VM_ACCOUNT) {
-                       unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
-                       if (security_vm_enough_memory(len))
+                       unsigned long len = vma_pages(mpnt);
+
+                       if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
                                goto fail_nomem;
                        charge = len;
                }
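
Both changes above fold open-coded size arithmetic into vma_pages(); widening len from unsigned int to unsigned long also stops the page count of a very large mapping being truncated on 64-bit. vma_pages() is just the shift, per include/linux/mm.h:

    static inline unsigned long vma_pages(struct vm_area_struct *vma)
    {
            return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
    }
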
@@ -387,7 +424,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                                mapping->i_mmap_writable++;
                        flush_dcache_mmap_lock(mapping);
                        /* insert tmp into the share list, just after mpnt */
-                       vma_prio_tree_add(tmp, mpnt);
+                       if (unlikely(tmp->vm_flags & VM_NONLINEAR))
+                               vma_nonlinear_insert(tmp,
+                                               &mapping->i_mmap_nonlinear);
+                       else
+                               vma_interval_tree_insert_after(tmp, mpnt,
+                                                       &mapping->i_mmap);
                        flush_dcache_mmap_unlock(mapping);
                        mutex_unlock(&mapping->i_mmap_mutex);
                }
@@ -428,6 +470,7 @@ out:
        up_write(&mm->mmap_sem);
        flush_tlb_mm(oldmm);
        up_write(&oldmm->mmap_sem);
+       uprobe_end_dup_mmap();
        return retval;
 fail_nomem_anon_vma_fork:
        mpol_put(pol);
@@ -568,6 +611,7 @@ void mmput(struct mm_struct *mm)
        might_sleep();
 
        if (atomic_dec_and_test(&mm->mm_users)) {
+               uprobe_clear_state(mm);
                exit_aio(mm);
                ksm_exit(mm);
                khugepaged_exit(mm); /* must run before exit_mmap */
@@ -578,7 +622,6 @@ void mmput(struct mm_struct *mm)
                        list_del(&mm->mmlist);
                        spin_unlock(&mmlist_lock);
                }
-               put_swap_token(mm);
                if (mm->binfmt)
                        module_put(mm->binfmt->module);
                mmdrop(mm);
@@ -586,26 +629,6 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
-/*
- * We added or removed a vma mapping the executable. The vmas are only mapped
- * during exec and are not mapped with the mmap system call.
- * Callers must hold down_write() on the mm's mmap_sem for these
- */
-void added_exe_file_vma(struct mm_struct *mm)
-{
-       mm->num_exe_file_vmas++;
-}
-
-void removed_exe_file_vma(struct mm_struct *mm)
-{
-       mm->num_exe_file_vmas--;
-       if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
-               fput(mm->exe_file);
-               mm->exe_file = NULL;
-       }
-
-}
-
 void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 {
        if (new_exe_file)
@@ -613,15 +636,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
        if (mm->exe_file)
                fput(mm->exe_file);
        mm->exe_file = new_exe_file;
-       mm->num_exe_file_vmas = 0;
 }
 
 struct file *get_mm_exe_file(struct mm_struct *mm)
 {
        struct file *exe_file;
 
-       /* We need mmap_sem to protect against races with removal of
-        * VM_EXECUTABLE vmas */
+       /* We need mmap_sem to protect against races with removal of exe_file */
        down_read(&mm->mmap_sem);
        exe_file = mm->exe_file;
        if (exe_file)
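
get_mm_exe_file() hands back a counted reference taken under mmap_sem (the lines just past this hunk call get_file() before unlocking), so a caller must balance it with fput(); caller-side sketch:

    struct file *exe = get_mm_exe_file(mm);
    if (exe) {
            /* ... inspect exe->f_path, etc. ... */
            fput(exe);      /* drop the reference taken under mmap_sem */
    }
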
@@ -746,12 +767,11 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
                exit_pi_state_list(tsk);
 #endif
 
+       uprobe_free_utask(tsk);
+
        /* Get rid of any cached register state */
        deactivate_mm(tsk, mm);
 
-       if (tsk->vfork_done)
-               complete_vfork_done(tsk);
-
        /*
         * If we're exiting normally, clear a user-space tid field if
         * requested.  We leave this alone when dying by signal, to leave
@@ -772,6 +792,13 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
                }
                tsk->clear_child_tid = NULL;
        }
+
+       /*
+        * All done, finally we can wake up the parent and return this mm to it.
+        * Also kthread_stop() uses this completion for synchronization.
+        */
+       if (tsk->vfork_done)
+               complete_vfork_done(tsk);
 }
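
Moving complete_vfork_done() to the end of mm_release() means the parent is unblocked only after the child tid has been cleared and futex-woken; as the new comment notes, kthread_stop() also rides on this completion. The userspace contract is unchanged: a vfork() parent stays blocked until the child execs or exits, e.g.:

    #include <unistd.h>

    pid_t pid = vfork();
    if (pid == 0) {                 /* child: may only exec or _exit */
            execl("/bin/true", "true", (char *)NULL);
            _exit(127);
    }
    /* parent resumes here only once the child has exec'd or exited */
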
 
 /*
@@ -793,14 +820,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
        memcpy(mm, oldmm, sizeof(*mm));
        mm_init_cpumask(mm);
 
-       /* Initializing for Swap token stuff */
-       mm->token_priority = 0;
-       mm->last_interval = 0;
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        mm->pmd_huge_pte = NULL;
 #endif
-
        if (!mm_init(mm, tsk))
                goto fail_nomem;
 
@@ -874,10 +896,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
                goto fail_nomem;
 
 good_mm:
-       /* Initializing for Swap token stuff */
-       mm->token_priority = 0;
-       mm->last_interval = 0;
-
        tsk->mm = mm;
        tsk->active_mm = mm;
        return 0;
@@ -945,9 +963,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
         * Share io context with parent, if CLONE_IO is set
         */
        if (clone_flags & CLONE_IO) {
-               tsk->io_context = ioc_task_link(ioc);
-               if (unlikely(!tsk->io_context))
-                       return -ENOMEM;
+               ioc_task_link(ioc);
+               tsk->io_context = ioc;
        } else if (ioprio_valid(ioc->ioprio)) {
                new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
                if (unlikely(!new_ioc))
@@ -1046,10 +1063,12 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
        init_rwsem(&sig->group_rwsem);
 #endif
 
-       sig->oom_adj = current->signal->oom_adj;
        sig->oom_score_adj = current->signal->oom_score_adj;
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
+       sig->has_child_subreaper = current->signal->has_child_subreaper ||
+                                  current->signal->is_child_subreaper;
+
        mutex_init(&sig->cred_guard_mutex);
 
        return 0;
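
The new has_child_subreaper bit is inherited down the tree so that reparenting can skip the walk up the ancestor chain for tasks that never had a subreaper above them. The flag it derives from is the one a service manager sets on itself via prctl(2), e.g.:

    #include <stdio.h>
    #include <sys/prctl.h>

    /* userspace sketch: adopt orphaned descendants instead of init */
    if (prctl(PR_SET_CHILD_SUBREAPER, 1) != 0)
            perror("prctl");
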
@@ -1110,7 +1129,6 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
  */
 static struct task_struct *copy_process(unsigned long clone_flags,
                                        unsigned long stack_start,
-                                       struct pt_regs *regs,
                                        unsigned long stack_size,
                                        int __user *child_tidptr,
                                        struct pid *pid,
@@ -1118,7 +1136,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 {
        int retval;
        struct task_struct *p;
-       int cgroup_callbacks_done = 0;
 
        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                return ERR_PTR(-EINVAL);
@@ -1158,6 +1175,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                goto fork_out;
 
        ftrace_graph_init_task(p);
+       get_seccomp_filter(p);
 
        rt_mutex_init_task(p);
 
@@ -1204,7 +1222,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        p->utime = p->stime = p->gtime = 0;
        p->utimescaled = p->stimescaled = 0;
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
-       p->prev_utime = p->prev_stime = 0;
+       p->prev_cputime.utime = p->prev_cputime.stime = 0;
 #endif
 #if defined(SPLIT_RSS_COUNTING)
        memset(&p->rss_stat, 0, sizeof(p->rss_stat));
@@ -1237,14 +1255,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_CPUSETS
        p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
        p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
+       seqcount_init(&p->mems_allowed_seq);
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
        p->irq_events = 0;
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-       p->hardirqs_enabled = 1;
-#else
        p->hardirqs_enabled = 0;
-#endif
        p->hardirq_enable_ip = 0;
        p->hardirq_enable_event = 0;
        p->hardirq_disable_ip = _THIS_IP_;
@@ -1266,7 +1281,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
        p->blocked_on = NULL; /* not blocked yet */
 #endif
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
        p->memcg_batch.do_batch = 0;
        p->memcg_batch.memcg = NULL;
 #endif
@@ -1305,7 +1320,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        retval = copy_io(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_namespaces;
-       retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
+       retval = copy_thread(clone_flags, stack_start, stack_size, p);
        if (retval)
                goto bad_fork_cleanup_io;
 
@@ -1337,6 +1352,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        INIT_LIST_HEAD(&p->pi_state_list);
        p->pi_state_cache = NULL;
 #endif
+       uprobe_copy_process(p);
        /*
         * sigaltstack should be cleared when sharing the same VM
         */
@@ -1375,12 +1391,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         */
        p->group_leader = p;
        INIT_LIST_HEAD(&p->thread_group);
-
-       /* Now that the task is set up, run cgroup callbacks if
-        * necessary. We need to run them before the task is visible
-        * on the tasklist. */
-       cgroup_fork_callbacks(p);
-       cgroup_callbacks_done = 1;
+       p->task_works = NULL;
 
        /* Need tasklist lock for parent etc handling! */
        write_lock_irq(&tasklist_lock);
@@ -1459,6 +1470,8 @@ bad_fork_cleanup_io:
        if (p->io_context)
                exit_io_context(p);
 bad_fork_cleanup_namespaces:
+       if (unlikely(clone_flags & CLONE_NEWPID))
+               pid_ns_release_proc(p->nsproxy->pid_ns);
        exit_task_namespaces(p);
 bad_fork_cleanup_mm:
        if (p->mm)
@@ -1484,7 +1497,7 @@ bad_fork_cleanup_cgroup:
 #endif
        if (clone_flags & CLONE_THREAD)
                threadgroup_change_end(current);
-       cgroup_exit(p, cgroup_callbacks_done);
+       cgroup_exit(p, 0);
        delayacct_tsk_free(p);
        module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
@@ -1496,12 +1509,6 @@ fork_out:
        return ERR_PTR(retval);
 }
 
-noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
-{
-       memset(regs, 0, sizeof(struct pt_regs));
-       return regs;
-}
-
 static inline void init_idle_pids(struct pid_link *links)
 {
        enum pid_type type;
@@ -1515,10 +1522,7 @@ static inline void init_idle_pids(struct pid_link *links)
 struct task_struct * __cpuinit fork_idle(int cpu)
 {
        struct task_struct *task;
-       struct pt_regs regs;
-
-       task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
-                           &init_struct_pid, 0);
+       task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
        if (!IS_ERR(task)) {
                init_idle_pids(task->pids);
                init_idle(task, cpu);
@@ -1535,7 +1539,6 @@ struct task_struct * __cpuinit fork_idle(int cpu)
  */
 long do_fork(unsigned long clone_flags,
              unsigned long stack_start,
-             struct pt_regs *regs,
              unsigned long stack_size,
              int __user *parent_tidptr,
              int __user *child_tidptr)
@@ -1565,7 +1568,7 @@ long do_fork(unsigned long clone_flags,
         * requested, no event is reported; otherwise, report if the event
         * for the type of forking is enabled.
         */
-       if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
+       if (!(clone_flags & CLONE_UNTRACED)) {
                if (clone_flags & CLONE_VFORK)
                        trace = PTRACE_EVENT_VFORK;
                else if ((clone_flags & CSIGNAL) != SIGCHLD)
@@ -1577,7 +1580,7 @@ long do_fork(unsigned long clone_flags,
                        trace = 0;
        }
 
-       p = copy_process(clone_flags, stack_start, regs, stack_size,
+       p = copy_process(clone_flags, stack_start, stack_size,
                         child_tidptr, NULL, trace);
        /*
         * Do this prior to waking up the new thread - the thread pointer
@@ -1615,6 +1618,60 @@ long do_fork(unsigned long clone_flags,
        return nr;
 }
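
With regs gone, do_fork() can no longer test user_mode(regs) to suppress ptrace events for kernel threads; instead, in-kernel callers (see kernel_thread() below) pass CLONE_UNTRACED explicitly. The PTRACE_EVENT_* value chosen above is only delivered to a tracer that opted in, e.g.:

    #include <sys/ptrace.h>

    /* tracer-side sketch: request fork/vfork/clone stop events */
    ptrace(PTRACE_SETOPTIONS, child_pid, 0,
           PTRACE_O_TRACEFORK | PTRACE_O_TRACEVFORK | PTRACE_O_TRACECLONE);
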
 
+#ifdef CONFIG_GENERIC_KERNEL_THREAD
+/*
+ * Create a kernel thread.
+ */
+pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{
+       return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
+               (unsigned long)arg, NULL, NULL);
+}
+#endif
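
Under CONFIG_GENERIC_KERNEL_THREAD the function pointer and its argument travel in do_fork()'s stack_start and stack_size slots; the architecture's copy_thread() recognizes a kernel thread and arranges for fn(arg) to run on first schedule. A caller does something like:

    static int worker(void *data)
    {
            /* ... do work; the return value becomes the exit code ... */
            return 0;
    }

    pid_t pid = kernel_thread(worker, NULL, CLONE_FS | CLONE_FILES);

(Most in-tree users go through the kthread API rather than calling this directly.)
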
+
+#ifdef __ARCH_WANT_SYS_FORK
+SYSCALL_DEFINE0(fork)
+{
+#ifdef CONFIG_MMU
+       return do_fork(SIGCHLD, 0, 0, NULL, NULL);
+#else
+       /* cannot support in nommu mode */
+       return -EINVAL;
+#endif
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_VFORK
+SYSCALL_DEFINE0(vfork)
+{
+       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
+                       0, NULL, NULL);
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_CLONE
+#ifdef CONFIG_CLONE_BACKWARDS
+SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
+                int __user *, parent_tidptr,
+                int, tls_val,
+                int __user *, child_tidptr)
+#elif defined(CONFIG_CLONE_BACKWARDS2)
+SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
+                int __user *, parent_tidptr,
+                int __user *, child_tidptr,
+                int, tls_val)
+#else
+SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
+                int __user *, parent_tidptr,
+                int __user *, child_tidptr,
+                int, tls_val)
+#endif
+{
+       return do_fork(clone_flags, newsp, 0,
+               parent_tidptr, child_tidptr);
+}
+#endif
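
Three SYSCALL_DEFINE5 variants exist because architectures disagree about clone's register argument order: CONFIG_CLONE_BACKWARDS puts tls_val before child_tidptr, while CONFIG_CLONE_BACKWARDS2 (s390) passes the new stack before the flags. Userspace is insulated by the C library, whose clone(3) wrapper reorders as needed; a minimal sketch:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdlib.h>

    static int child_fn(void *arg) { return 0; }

    /* the stack argument is the high end of a mapped region (stacks grow down
     * on the common architectures) */
    char *stack = malloc(64 * 1024);
    int pid = clone(child_fn, stack + 64 * 1024, CLONE_VM | SIGCHLD, NULL);
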
+
 #ifndef ARCH_MIN_MMSTRUCT_ALIGN
 #define ARCH_MIN_MMSTRUCT_ALIGN 0
 #endif