media: tegra: Tegra videobuf2
diff --git a/kernel/fork.c b/kernel/fork.c
index 856eac3..f65fa06 100644
@@ -37,22 +37,23 @@
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
-#include <linux/tracehook.h>
 #include <linux/futex.h>
 #include <linux/compat.h>
+#include <linux/kthread.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/rcupdate.h>
 #include <linux/ptrace.h>
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
 #include <linux/tty.h>
-#include <linux/proc_fs.h>
 #include <linux/blkdev.h>
 #include <linux/fs_struct.h>
 #include <linux/magic.h>
 #include <linux/perf_event.h>
 #include <linux/posix-timers.h>
 #include <linux/user-return-notifier.h>
+#include <linux/oom.h>
+#include <linux/khugepaged.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -79,7 +80,7 @@
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
 unsigned long total_forks;     /* Handle normal Linux uptimes. */
-int nr_threads;                /* The idle threads do not count.. */
+int nr_threads;                        /* The idle threads do not count.. */
 
 int max_threads;               /* tunable limit on nr_threads */
 
@@ -107,20 +108,25 @@ int nr_processes(void)
 }
 
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-# define alloc_task_struct()   kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
-# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
+# define alloc_task_struct_node(node)          \
+               kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
+# define free_task_struct(tsk)                 \
+               kmem_cache_free(task_struct_cachep, (tsk))
 static struct kmem_cache *task_struct_cachep;
 #endif
 
 #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
-static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+                                                 int node)
 {
 #ifdef CONFIG_DEBUG_STACK_USAGE
        gfp_t mask = GFP_KERNEL | __GFP_ZERO;
 #else
        gfp_t mask = GFP_KERNEL;
 #endif
-       return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
+       struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
+
+       return page ? page_address(page) : NULL;
 }
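
The hunk above makes the kernel-stack allocation NUMA-aware: instead of __get_free_pages(), it asks alloc_pages_node() for pages on a caller-supplied node and maps them back with page_address(). A minimal sketch of the same pattern outside this diff (the function name is illustrative, not from the patch):

	/* Sketch: allocate an order-N, node-local buffer; passing
	 * NUMA_NO_NODE (-1) lets the allocator fall back to the
	 * current node. */
	static void *alloc_buf_on_node(int node, unsigned int order)
	{
		struct page *page = alloc_pages_node(node, GFP_KERNEL, order);

		return page ? page_address(page) : NULL;
	}
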
 
 static inline void free_thread_info(struct thread_info *ti)
@@ -147,6 +153,9 @@ struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
+/* Notifier list called when a task struct is freed */
+static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
+
 static void account_kernel_stack(struct thread_info *ti, int account)
 {
        struct zone *zone = page_zone(virt_to_page(ti));
@@ -168,6 +177,7 @@ EXPORT_SYMBOL(free_task);
 static inline void free_signal_struct(struct signal_struct *sig)
 {
        taskstats_tgid_free(sig);
+       sched_autogroup_exit(sig);
        kmem_cache_free(signal_cachep, sig);
 }
 
@@ -177,6 +187,18 @@ static inline void put_signal_struct(struct signal_struct *sig)
                free_signal_struct(sig);
 }
 
+int task_free_register(struct notifier_block *n)
+{
+       return atomic_notifier_chain_register(&task_free_notifier, n);
+}
+EXPORT_SYMBOL(task_free_register);
+
+int task_free_unregister(struct notifier_block *n)
+{
+       return atomic_notifier_chain_unregister(&task_free_notifier, n);
+}
+EXPORT_SYMBOL(task_free_unregister);
+
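
task_free_register()/task_free_unregister() are a new out-of-tree (Android-style) hook around task_struct teardown. A hypothetical in-kernel consumer would look roughly like the sketch below; the callback name and its body are illustrative only:

	#include <linux/kernel.h>
	#include <linux/notifier.h>
	#include <linux/sched.h>

	/* Hypothetical consumer: runs in atomic context each time a
	 * task_struct is about to be freed. */
	static int my_task_free_cb(struct notifier_block *nb,
				   unsigned long action, void *data)
	{
		struct task_struct *tsk = data;

		pr_debug("task %d (%s) is being freed\n", tsk->pid, tsk->comm);
		return NOTIFY_OK;
	}

	static struct notifier_block my_task_free_nb = {
		.notifier_call = my_task_free_cb,
	};

	/* call task_free_register(&my_task_free_nb) at module init and
	 * task_free_unregister(&my_task_free_nb) at module exit */
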
 void __put_task_struct(struct task_struct *tsk)
 {
        WARN_ON(!tsk->exit_state);
@@ -187,9 +209,11 @@ void __put_task_struct(struct task_struct *tsk)
        delayacct_tsk_free(tsk);
        put_signal_struct(tsk->signal);
 
+       atomic_notifier_call_chain(&task_free_notifier, 0, tsk);
        if (!profile_handoff_task(tsk))
                free_task(tsk);
 }
+EXPORT_SYMBOL_GPL(__put_task_struct);
 
 /*
  * macro override instead of weak attribute alias, to workaround
@@ -224,7 +248,7 @@ void __init fork_init(unsigned long mempages)
        /*
         * we need to allow at least 20 threads to boot a system
         */
-       if(max_threads < 20)
+       if (max_threads < 20)
                max_threads = 20;
 
        init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
@@ -245,22 +269,22 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
        struct task_struct *tsk;
        struct thread_info *ti;
        unsigned long *stackend;
-
+       int node = tsk_fork_get_node(orig);
        int err;
 
        prepare_to_copy(orig);
 
-       tsk = alloc_task_struct();
+       tsk = alloc_task_struct_node(node);
        if (!tsk)
                return NULL;
 
-       ti = alloc_thread_info(tsk);
+       ti = alloc_thread_info_node(tsk, node);
        if (!ti) {
                free_task_struct(tsk);
                return NULL;
        }
 
-       err = arch_dup_task_struct(tsk, orig);
+       err = arch_dup_task_struct(tsk, orig);
        if (err)
                goto out;
 
@@ -272,6 +296,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 
        setup_thread_stack(tsk, orig);
        clear_user_return_notifier(tsk);
+       clear_tsk_need_resched(tsk);
        stackend = end_of_stack(tsk);
        *stackend = STACK_END_MAGIC;    /* for overflow detection */
 
@@ -279,9 +304,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
        tsk->stack_canary = get_random_int();
 #endif
 
-       /* One for us, one for whoever does the "release_task()" (usually parent) */
-       atomic_set(&tsk->usage,2);
-       atomic_set(&tsk->fs_excl, 0);
+       /*
+        * One for us, one for whoever does the "release_task()" (usually
+        * parent)
+        */
+       atomic_set(&tsk->usage, 2);
 #ifdef CONFIG_BLK_DEV_IO_TRACE
        tsk->btrace_seq = 0;
 #endif
@@ -300,7 +327,7 @@ out:
 #ifdef CONFIG_MMU
 static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 {
-       struct vm_area_struct *mpnt, *tmp, **pprev;
+       struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
        struct rb_node **rb_link, *rb_parent;
        int retval;
        unsigned long charge;
@@ -327,7 +354,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
        retval = ksm_fork(mm, oldmm);
        if (retval)
                goto out;
+       retval = khugepaged_fork(mm, oldmm);
+       if (retval)
+               goto out;
 
+       prev = NULL;
        for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
                struct file *file;
 
@@ -355,11 +386,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                if (IS_ERR(pol))
                        goto fail_nomem_policy;
                vma_set_policy(tmp, pol);
+               tmp->vm_mm = mm;
                if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
                tmp->vm_flags &= ~VM_LOCKED;
-               tmp->vm_mm = mm;
-               tmp->vm_next = NULL;
+               tmp->vm_next = tmp->vm_prev = NULL;
                file = tmp->vm_file;
                if (file) {
                        struct inode *inode = file->f_path.dentry->d_inode;
@@ -368,15 +399,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                        get_file(file);
                        if (tmp->vm_flags & VM_DENYWRITE)
                                atomic_dec(&inode->i_writecount);
-                       spin_lock(&mapping->i_mmap_lock);
+                       mutex_lock(&mapping->i_mmap_mutex);
                        if (tmp->vm_flags & VM_SHARED)
                                mapping->i_mmap_writable++;
-                       tmp->vm_truncate_count = mpnt->vm_truncate_count;
                        flush_dcache_mmap_lock(mapping);
                        /* insert tmp into the share list, just after mpnt */
                        vma_prio_tree_add(tmp, mpnt);
                        flush_dcache_mmap_unlock(mapping);
-                       spin_unlock(&mapping->i_mmap_lock);
+                       mutex_unlock(&mapping->i_mmap_mutex);
                }
 
                /*
@@ -392,6 +422,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                 */
                *pprev = tmp;
                pprev = &tmp->vm_next;
+               tmp->vm_prev = prev;
+               prev = tmp;
 
                __vma_link_rb(mm, tmp, rb_link, rb_parent);
                rb_link = &tmp->vm_rb.rb_right;
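
The two added lines in this hunk thread each copied VMA onto a doubly linked list as the loop walks oldmm->mmap. Stripped of the surrounding fork logic, the pattern is the following sketch, where copy_of() is a hypothetical stand-in for the duplication and fixups shown above:

	prev = NULL;
	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
		tmp = copy_of(mpnt);		/* hypothetical helper */
		tmp->vm_next = tmp->vm_prev = NULL;
		*pprev = tmp;			/* forward link from predecessor */
		pprev = &tmp->vm_next;
		tmp->vm_prev = prev;		/* new in this patch: back link */
		prev = tmp;
	}
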
@@ -424,7 +456,7 @@ fail_nomem:
        goto out;
 }
 
-static inline int mm_alloc_pgd(struct mm_struct * mm)
+static inline int mm_alloc_pgd(struct mm_struct *mm)
 {
        mm->pgd = pgd_alloc(mm);
        if (unlikely(!mm->pgd))
@@ -432,7 +464,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm)
        return 0;
 }
 
-static inline void mm_free_pgd(struct mm_struct * mm)
+static inline void mm_free_pgd(struct mm_struct *mm)
 {
        pgd_free(mm, mm->pgd);
 }
@@ -469,7 +501,7 @@ static void mm_init_aio(struct mm_struct *mm)
 #endif
 }
 
-static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
+static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 {
        atomic_set(&mm->mm_users, 1);
        atomic_set(&mm->mm_count, 1);
@@ -485,6 +517,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
        mm->cached_hole_size = ~0UL;
        mm_init_aio(mm);
        mm_init_owner(mm, p);
+       atomic_set(&mm->oom_disable_count, 0);
 
        if (likely(!mm_alloc_pgd(mm))) {
                mm->def_flags = 0;
@@ -499,16 +532,17 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 /*
  * Allocate and initialize an mm_struct.
  */
-struct mm_struct * mm_alloc(void)
+struct mm_struct *mm_alloc(void)
 {
-       struct mm_struct * mm;
+       struct mm_struct *mm;
 
        mm = allocate_mm();
-       if (mm) {
-               memset(mm, 0, sizeof(*mm));
-               mm = mm_init(mm, current);
-       }
-       return mm;
+       if (!mm)
+               return NULL;
+
+       memset(mm, 0, sizeof(*mm));
+       mm_init_cpumask(mm);
+       return mm_init(mm, current);
 }
 
 /*
@@ -522,6 +556,9 @@ void __mmdrop(struct mm_struct *mm)
        mm_free_pgd(mm);
        destroy_context(mm);
        mmu_notifier_mm_destroy(mm);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       VM_BUG_ON(mm->pmd_huge_pte);
+#endif
        free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
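
For orientation: mmput() below drops mm_users (the address-space users), while __mmdrop() above runs only when the separate mm_count reference falls to zero. A hedged sketch of the distinction, in the lazy-TLB style:

	/* Sketch: pin only the mm_struct itself, not its mappings;
	 * contrast with atomic_inc(&mm->mm_users) + mmput(). */
	static void inspect_mm(struct mm_struct *mm)
	{
		atomic_inc(&mm->mm_count);	/* struct stays valid */
		/* mm fields may be read here even after the last mmput() */
		mmdrop(mm);			/* final drop ends in __mmdrop() */
	}
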
@@ -536,6 +573,7 @@ void mmput(struct mm_struct *mm)
        if (atomic_dec_and_test(&mm->mm_users)) {
                exit_aio(mm);
                ksm_exit(mm);
+               khugepaged_exit(mm); /* must run before exit_mmap */
                exit_mmap(mm);
                set_mm_exe_file(mm, NULL);
                if (!list_empty(&mm->mmlist)) {
@@ -551,6 +589,57 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
+/*
+ * We added or removed a vma mapping the executable. The vmas are only mapped
+ * during exec and are not mapped with the mmap system call.
+ * Callers must hold down_write() on the mm's mmap_sem for these
+ */
+void added_exe_file_vma(struct mm_struct *mm)
+{
+       mm->num_exe_file_vmas++;
+}
+
+void removed_exe_file_vma(struct mm_struct *mm)
+{
+       mm->num_exe_file_vmas--;
+       if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
+               fput(mm->exe_file);
+               mm->exe_file = NULL;
+       }
+
+}
+
+void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+       if (new_exe_file)
+               get_file(new_exe_file);
+       if (mm->exe_file)
+               fput(mm->exe_file);
+       mm->exe_file = new_exe_file;
+       mm->num_exe_file_vmas = 0;
+}
+
+struct file *get_mm_exe_file(struct mm_struct *mm)
+{
+       struct file *exe_file;
+
+       /* We need mmap_sem to protect against races with removal of
+        * VM_EXECUTABLE vmas */
+       down_read(&mm->mmap_sem);
+       exe_file = mm->exe_file;
+       if (exe_file)
+               get_file(exe_file);
+       up_read(&mm->mmap_sem);
+       return exe_file;
+}
+
+static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+       /* It's safe to write the exe_file pointer without exe_file_lock because
+        * this is called during fork when the task is not yet in /proc */
+       newmm->exe_file = get_mm_exe_file(oldmm);
+}
+
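
A caller of get_mm_exe_file() above owns the returned reference and must balance it with fput(). An illustrative consumer (not part of this diff):

	/* Sketch: report the name of the mm's executable. */
	static void report_exe(struct mm_struct *mm)
	{
		struct file *exe_file = get_mm_exe_file(mm);	/* takes a ref */

		if (!exe_file)
			return;
		pr_info("exe: %s\n", exe_file->f_path.dentry->d_name.name);
		fput(exe_file);		/* balance get_mm_exe_file() */
	}
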
 /**
  * get_task_mm - acquire a reference to the task's mm
  *
@@ -657,11 +746,16 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
                goto fail_nomem;
 
        memcpy(mm, oldmm, sizeof(*mm));
+       mm_init_cpumask(mm);
 
        /* Initializing for Swap token stuff */
        mm->token_priority = 0;
        mm->last_interval = 0;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       mm->pmd_huge_pte = NULL;
+#endif
+
        if (!mm_init(mm, tsk))
                goto fail_nomem;
 
@@ -700,9 +794,9 @@ fail_nocontext:
        return NULL;
 }
 
-static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
+static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 {
-       struct mm_struct * mm, *oldmm;
+       struct mm_struct *mm, *oldmm;
        int retval;
 
        tsk->min_flt = tsk->maj_flt = 0;
@@ -738,6 +832,8 @@ good_mm:
        /* Initializing for Swap token stuff */
        mm->token_priority = 0;
        mm->last_interval = 0;
+       if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+               atomic_inc(&mm->oom_disable_count);
 
        tsk->mm = mm;
        tsk->active_mm = mm;
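
The oom_disable_count bumped in this hunk is intended for the OOM killer's victim selection, roughly as sketched below (a paraphrase, not the literal oom_kill.c code):

	/* Sketch of the consumer side: skip any task whose mm is shared
	 * with an OOM-disabled (oom_score_adj == OOM_SCORE_ADJ_MIN) task. */
	for_each_process(p) {
		if (p->mm && atomic_read(&p->mm->oom_disable_count))
			continue;	/* unkillable via this shared mm */
		/* otherwise score the task as usual */
	}
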
@@ -767,7 +863,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
        return 0;
 }
 
-static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
+static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
 {
        struct files_struct *oldf, *newf;
        int error = 0;
@@ -897,9 +993,17 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
        posix_cpu_timers_init_group(sig);
 
        tty_audit_fork(sig);
+       sched_autogroup_fork(sig);
+
+#ifdef CONFIG_CGROUPS
+       init_rwsem(&sig->threadgroup_fork_lock);
+#endif
 
        sig->oom_adj = current->signal->oom_adj;
        sig->oom_score_adj = current->signal->oom_score_adj;
+       sig->oom_score_adj_min = current->signal->oom_score_adj_min;
+
+       mutex_init(&sig->cred_guard_mutex);
 
        return 0;
 }
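
The rwsem initialized here pairs with the threadgroup_fork_read_lock()/threadgroup_fork_read_unlock() calls added to copy_process() later in this diff. A hedged sketch of the intended write side (e.g. a cgroup attach path that needs a stable thread list):

	down_write(&tsk->signal->threadgroup_fork_lock);
	/* no new CLONE_THREAD sibling can appear while this is held */
	up_write(&tsk->signal->threadgroup_fork_lock);
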
@@ -926,7 +1030,7 @@ static void rt_mutex_init_task(struct task_struct *p)
 {
        raw_spin_lock_init(&p->pi_lock);
 #ifdef CONFIG_RT_MUTEXES
-       plist_head_init_raw(&p->pi_waiters, &p->pi_lock);
+       plist_head_init(&p->pi_waiters);
        p->pi_blocked_on = NULL;
 #endif
 }
@@ -1023,6 +1127,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                    p->real_cred->user != INIT_USER)
                        goto bad_fork_free;
        }
+       current->flags &= ~PF_NPROC_EXCEEDED;
 
        retval = copy_creds(p, clone_flags);
        if (retval < 0)
@@ -1071,22 +1176,27 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
        posix_cpu_timers_init(p);
 
-       p->lock_depth = -1;             /* -1 = no lock */
        do_posix_clock_monotonic_gettime(&p->start_time);
        p->real_start_time = p->start_time;
        monotonic_to_bootbased(&p->real_start_time);
        p->io_context = NULL;
        p->audit_context = NULL;
+       if (clone_flags & CLONE_THREAD)
+               threadgroup_fork_read_lock(current);
        cgroup_fork(p);
 #ifdef CONFIG_NUMA
        p->mempolicy = mpol_dup(p->mempolicy);
-       if (IS_ERR(p->mempolicy)) {
-               retval = PTR_ERR(p->mempolicy);
-               p->mempolicy = NULL;
-               goto bad_fork_cleanup_cgroup;
-       }
+       if (IS_ERR(p->mempolicy)) {
+               retval = PTR_ERR(p->mempolicy);
+               p->mempolicy = NULL;
+               goto bad_fork_cleanup_cgroup;
+       }
        mpol_fix_fork_child_flag(p);
 #endif
+#ifdef CONFIG_CPUSETS
+       p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
+       p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
+#endif
 #ifdef CONFIG_TRACE_IRQFLAGS
        p->irq_events = 0;
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
@@ -1121,30 +1231,38 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #endif
 
        /* Perform scheduler related setup. Assign this task to a CPU. */
-       sched_fork(p, clone_flags);
+       sched_fork(p);
 
        retval = perf_event_init_task(p);
        if (retval)
                goto bad_fork_cleanup_policy;
-
-       if ((retval = audit_alloc(p)))
+       retval = audit_alloc(p);
+       if (retval)
                goto bad_fork_cleanup_policy;
        /* copy all the process information */
-       if ((retval = copy_semundo(clone_flags, p)))
+       retval = copy_semundo(clone_flags, p);
+       if (retval)
                goto bad_fork_cleanup_audit;
-       if ((retval = copy_files(clone_flags, p)))
+       retval = copy_files(clone_flags, p);
+       if (retval)
                goto bad_fork_cleanup_semundo;
-       if ((retval = copy_fs(clone_flags, p)))
+       retval = copy_fs(clone_flags, p);
+       if (retval)
                goto bad_fork_cleanup_files;
-       if ((retval = copy_sighand(clone_flags, p)))
+       retval = copy_sighand(clone_flags, p);
+       if (retval)
                goto bad_fork_cleanup_fs;
-       if ((retval = copy_signal(clone_flags, p)))
+       retval = copy_signal(clone_flags, p);
+       if (retval)
                goto bad_fork_cleanup_sighand;
-       if ((retval = copy_mm(clone_flags, p)))
+       retval = copy_mm(clone_flags, p);
+       if (retval)
                goto bad_fork_cleanup_signal;
-       if ((retval = copy_namespaces(clone_flags, p)))
+       retval = copy_namespaces(clone_flags, p);
+       if (retval)
                goto bad_fork_cleanup_mm;
-       if ((retval = copy_io(clone_flags, p)))
+       retval = copy_io(clone_flags, p);
+       if (retval)
                goto bad_fork_cleanup_namespaces;
        retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
        if (retval)
@@ -1155,12 +1273,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                pid = alloc_pid(p->nsproxy->pid_ns);
                if (!pid)
                        goto bad_fork_cleanup_io;
-
-               if (clone_flags & CLONE_NEWPID) {
-                       retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
-                       if (retval < 0)
-                               goto bad_fork_free_pid;
-               }
        }
 
        p->pid = pid_nr(pid);
@@ -1168,17 +1280,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        if (clone_flags & CLONE_THREAD)
                p->tgid = current->tgid;
 
-       if (current->nsproxy != p->nsproxy) {
-               retval = ns_cgroup_clone(p, pid);
-               if (retval)
-                       goto bad_fork_free_pid;
-       }
-
        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
        /*
         * Clear TID on mm_release()?
         */
-       p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+       p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
+#ifdef CONFIG_BLOCK
+       p->plug = NULL;
+#endif
 #ifdef CONFIG_FUTEX
        p->robust_list = NULL;
 #ifdef CONFIG_COMPAT
@@ -1243,7 +1352,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         * its process group.
         * A fatal signal pending means that current will exit, so the new
         * thread can't slip out of an OOM kill (or normal SIGKILL).
-        */
+       */
        recalc_sigpending();
        if (signal_pending(current)) {
                spin_unlock(&current->sighand->siglock);
@@ -1261,10 +1370,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        }
 
        if (likely(p->pid)) {
-               tracehook_finish_clone(p, clone_flags, trace);
+               ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
 
                if (thread_group_leader(p)) {
-                       if (clone_flags & CLONE_NEWPID)
+                       if (is_child_reaper(pid))
                                p->nsproxy->pid_ns->child_reaper = p;
 
                        p->signal->leader_pid = pid;
@@ -1273,7 +1382,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                        attach_pid(p, PIDTYPE_SID, task_session(current));
                        list_add_tail(&p->sibling, &p->real_parent->children);
                        list_add_tail_rcu(&p->tasks, &init_task.tasks);
-                       __get_cpu_var(process_counts)++;
+                       __this_cpu_inc(process_counts);
                }
                attach_pid(p, PIDTYPE_PID, pid);
                nr_threads++;
@@ -1284,6 +1393,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        write_unlock_irq(&tasklist_lock);
        proc_fork_connector(p);
        cgroup_post_fork(p);
+       if (clone_flags & CLONE_THREAD)
+               threadgroup_fork_read_unlock(current);
        perf_event_fork(p);
        return p;
 
@@ -1296,8 +1407,13 @@ bad_fork_cleanup_io:
 bad_fork_cleanup_namespaces:
        exit_task_namespaces(p);
 bad_fork_cleanup_mm:
-       if (p->mm)
+       if (p->mm) {
+               task_lock(p);
+               if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+                       atomic_dec(&p->mm->oom_disable_count);
+               task_unlock(p);
                mmput(p->mm);
+       }
 bad_fork_cleanup_signal:
        if (!(clone_flags & CLONE_THREAD))
                free_signal_struct(p->signal);
@@ -1317,6 +1433,8 @@ bad_fork_cleanup_policy:
        mpol_put(p->mempolicy);
 bad_fork_cleanup_cgroup:
 #endif
+       if (clone_flags & CLONE_THREAD)
+               threadgroup_fork_read_unlock(current);
        cgroup_exit(p, cgroup_callbacks_done);
        delayacct_tsk_free(p);
        module_put(task_thread_info(p)->exec_domain->module);
@@ -1393,28 +1511,23 @@ long do_fork(unsigned long clone_flags,
        }
 
        /*
-        * We hope to recycle these flags after 2.6.26
+        * Determine whether and which event to report to ptracer.  When
+        * called from kernel_thread or CLONE_UNTRACED is explicitly
+        * requested, no event is reported; otherwise, report if the event
+        * for the type of forking is enabled.
         */
-       if (unlikely(clone_flags & CLONE_STOPPED)) {
-               static int __read_mostly count = 100;
-
-               if (count > 0 && printk_ratelimit()) {
-                       char comm[TASK_COMM_LEN];
+       if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
+               if (clone_flags & CLONE_VFORK)
+                       trace = PTRACE_EVENT_VFORK;
+               else if ((clone_flags & CSIGNAL) != SIGCHLD)
+                       trace = PTRACE_EVENT_CLONE;
+               else
+                       trace = PTRACE_EVENT_FORK;
 
-                       count--;
-                       printk(KERN_INFO "fork(): process `%s' used deprecated "
-                                       "clone flags 0x%lx\n",
-                               get_task_comm(comm, current),
-                               clone_flags & CLONE_STOPPED);
-               }
+               if (likely(!ptrace_event_enabled(current, trace)))
+                       trace = 0;
        }
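
On the tracer side, the events selected above only fire once the matching ptrace option has been set. A hedged userspace sketch (error handling elided; pid is an already-running child):

	#include <sys/types.h>
	#include <sys/ptrace.h>
	#include <sys/wait.h>

	static void trace_forks(pid_t pid)
	{
		ptrace(PTRACE_ATTACH, pid, NULL, NULL);
		waitpid(pid, NULL, 0);
		ptrace(PTRACE_SETOPTIONS, pid, NULL,
		       (void *)(PTRACE_O_TRACEFORK | PTRACE_O_TRACEVFORK |
				PTRACE_O_TRACECLONE));
		ptrace(PTRACE_CONT, pid, NULL, NULL);
		/* a stop with status>>8 == (SIGTRAP | PTRACE_EVENT_FORK << 8)
		 * corresponds to the PTRACE_EVENT_FORK case selected above */
	}
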
 
-       /*
-        * When called from kernel_thread, don't do user tracing stuff.
-        */
-       if (likely(user_mode(regs)))
-               trace = tracehook_prepare_clone(clone_flags);
-
        p = copy_process(clone_flags, stack_start, regs, stack_size,
                         child_tidptr, NULL, trace);
        /*
@@ -1437,35 +1550,26 @@ long do_fork(unsigned long clone_flags,
                }
 
                audit_finish_fork(p);
-               tracehook_report_clone(regs, clone_flags, nr, p);
 
                /*
                 * We set PF_STARTING at creation in case tracing wants to
                 * use this to distinguish a fully live task from one that
-                * hasn't gotten to tracehook_report_clone() yet.  Now we
-                * clear it and set the child going.
+                * hasn't finished SIGSTOP raising yet.  Now we clear it
+                * and set the child going.
                 */
                p->flags &= ~PF_STARTING;
 
-               if (unlikely(clone_flags & CLONE_STOPPED)) {
-                       /*
-                        * We'll start up with an immediate SIGSTOP.
-                        */
-                       sigaddset(&p->pending.signal, SIGSTOP);
-                       set_tsk_thread_flag(p, TIF_SIGPENDING);
-                       __set_task_state(p, TASK_STOPPED);
-               } else {
-                       wake_up_new_task(p, clone_flags);
-               }
+               wake_up_new_task(p);
 
-               tracehook_report_clone_complete(trace, regs,
-                                               clone_flags, nr, p);
+               /* forking complete and child started to run, tell ptracer */
+               if (unlikely(trace))
+                       ptrace_event(trace, nr);
 
                if (clone_flags & CLONE_VFORK) {
                        freezer_do_not_count();
                        wait_for_completion(&vfork);
                        freezer_count();
-                       tracehook_report_vfork_done(p, nr);
+                       ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
                }
        } else {
                nr = PTR_ERR(p);
@@ -1500,46 +1604,40 @@ void __init proc_caches_init(void)
        fs_cachep = kmem_cache_create("fs_cache",
                        sizeof(struct fs_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+       /*
+        * FIXME! The "sizeof(struct mm_struct)" currently includes the
+        * whole struct cpumask for the OFFSTACK case. We could change
+        * this to *only* allocate as much of it as required by the
+        * maximum number of CPUs we can ever have.  The cpumask_allocation
+        * is at the end of the structure, exactly for that reason.
+        */
        mm_cachep = kmem_cache_create("mm_struct",
                        sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
        vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
        mmap_init();
+       nsproxy_cache_init();
 }
 
 /*
- * Check constraints on flags passed to the unshare system call and
- * force unsharing of additional process context as appropriate.
+ * Check constraints on flags passed to the unshare system call.
  */
-static void check_unshare_flags(unsigned long *flags_ptr)
+static int check_unshare_flags(unsigned long unshare_flags)
 {
+       if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+                               CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
+                               CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+               return -EINVAL;
        /*
-        * If unsharing a thread from a thread group, must also
-        * unshare vm.
-        */
-       if (*flags_ptr & CLONE_THREAD)
-               *flags_ptr |= CLONE_VM;
-
-       /*
-        * If unsharing vm, must also unshare signal handlers.
-        */
-       if (*flags_ptr & CLONE_VM)
-               *flags_ptr |= CLONE_SIGHAND;
-
-       /*
-        * If unsharing namespace, must also unshare filesystem information.
+        * Not implemented, but pretend it works if there is nothing to
+        * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
+        * needs to unshare vm.
         */
-       if (*flags_ptr & CLONE_NEWNS)
-               *flags_ptr |= CLONE_FS;
-}
-
-/*
- * Unsharing of tasks created with CLONE_THREAD is not supported yet
- */
-static int unshare_thread(unsigned long unshare_flags)
-{
-       if (unshare_flags & CLONE_THREAD)
-               return -EINVAL;
+       if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
+               /* FIXME: get_task_mm() increments ->mm_users */
+               if (atomic_read(&current->mm->mm_users) > 1)
+                       return -EINVAL;
+       }
 
        return 0;
 }
@@ -1566,34 +1664,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 }
 
 /*
- * Unsharing of sighand is not supported yet
- */
-static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
-{
-       struct sighand_struct *sigh = current->sighand;
-
-       if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
-               return -EINVAL;
-       else
-               return 0;
-}
-
-/*
- * Unshare vm if it is being shared
- */
-static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
-{
-       struct mm_struct *mm = current->mm;
-
-       if ((unshare_flags & CLONE_VM) &&
-           (mm && atomic_read(&mm->mm_users) > 1)) {
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-/*
  * Unshare file descriptor table if it is being shared
  */
 static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
@@ -1621,45 +1691,39 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
  */
 SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 {
-       int err = 0;
        struct fs_struct *fs, *new_fs = NULL;
-       struct sighand_struct *new_sigh = NULL;
-       struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
        struct files_struct *fd, *new_fd = NULL;
        struct nsproxy *new_nsproxy = NULL;
        int do_sysvsem = 0;
+       int err;
 
-       check_unshare_flags(&unshare_flags);
-
-       /* Return -EINVAL for all unsupported flags */
-       err = -EINVAL;
-       if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
-                               CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
-                               CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+       err = check_unshare_flags(unshare_flags);
+       if (err)
                goto bad_unshare_out;
 
        /*
+        * If unsharing namespace, must also unshare filesystem information.
+        */
+       if (unshare_flags & CLONE_NEWNS)
+               unshare_flags |= CLONE_FS;
+       /*
         * CLONE_NEWIPC must also detach from the undolist: after switching
         * to a new ipc namespace, the semaphore arrays from the old
         * namespace are unreachable.
         */
        if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
                do_sysvsem = 1;
-       if ((err = unshare_thread(unshare_flags)))
+       err = unshare_fs(unshare_flags, &new_fs);
+       if (err)
                goto bad_unshare_out;
-       if ((err = unshare_fs(unshare_flags, &new_fs)))
-               goto bad_unshare_cleanup_thread;
-       if ((err = unshare_sighand(unshare_flags, &new_sigh)))
+       err = unshare_fd(unshare_flags, &new_fd);
+       if (err)
                goto bad_unshare_cleanup_fs;
-       if ((err = unshare_vm(unshare_flags, &new_mm)))
-               goto bad_unshare_cleanup_sigh;
-       if ((err = unshare_fd(unshare_flags, &new_fd)))
-               goto bad_unshare_cleanup_vm;
-       if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
-                       new_fs)))
+       err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
+       if (err)
                goto bad_unshare_cleanup_fd;
 
-       if (new_fs ||  new_mm || new_fd || do_sysvsem || new_nsproxy) {
+       if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
                if (do_sysvsem) {
                        /*
                         * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1685,15 +1749,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
                        spin_unlock(&fs->lock);
                }
 
-               if (new_mm) {
-                       mm = current->mm;
-                       active_mm = current->active_mm;
-                       current->mm = new_mm;
-                       current->active_mm = new_mm;
-                       activate_mm(active_mm, new_mm);
-                       new_mm = mm;
-               }
-
                if (new_fd) {
                        fd = current->files;
                        current->files = new_fd;
@@ -1710,20 +1765,10 @@ bad_unshare_cleanup_fd:
        if (new_fd)
                put_files_struct(new_fd);
 
-bad_unshare_cleanup_vm:
-       if (new_mm)
-               mmput(new_mm);
-
-bad_unshare_cleanup_sigh:
-       if (new_sigh)
-               if (atomic_dec_and_test(&new_sigh->count))
-                       kmem_cache_free(sighand_cachep, new_sigh);
-
 bad_unshare_cleanup_fs:
        if (new_fs)
                free_fs_struct(new_fs);
 
-bad_unshare_cleanup_thread:
 bad_unshare_out:
        return err;
 }
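
For completeness, the reshaped syscall is exercised from userspace as in this hedged sketch (CLONE_NEWNS requires CAP_SYS_ADMIN; as sys_unshare() above arranges, it implies CLONE_FS, so no extra flag is needed):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		/* give this process a private mount namespace */
		if (unshare(CLONE_NEWNS) == -1) {
			perror("unshare");
			return 1;
		}
		/* mounts from here on are invisible to the parent namespace */
		return 0;
	}
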