Merge git://git.samba.org/sfrench/cifs-2.6
index 526a039..92ce83a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -42,7 +42,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/module.h>
 #include <linux/namei.h>
-#include <linux/proc_fs.h>
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/fs_struct.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
+#include <linux/compat.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/tlb.h>
+
+#include <trace/events/task.h>
 #include "internal.h"
 
 int core_uses_pid;
@@ -166,8 +168,13 @@ out:
 }
 
 #ifdef CONFIG_MMU
-
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+/*
+ * The nascent bprm->mm is not visible until exec_mmap(), but it can
+ * use a lot of memory, so account these pages in current->mm
+ * temporarily for oom_badness()->get_mm_rss(). Once exec succeeds or
+ * fails, we change the counter back via acct_arg_size(0).
+ */
+static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
        struct mm_struct *mm = current->mm;
        long diff = (long)(pages - bprm->vma_pages);
@@ -176,17 +183,10 @@ void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
                return;
 
        bprm->vma_pages = pages;
-
-#ifdef SPLIT_RSS_COUNTING
-       add_mm_counter(mm, MM_ANONPAGES, diff);
-#else
-       spin_lock(&mm->page_table_lock);
        add_mm_counter(mm, MM_ANONPAGES, diff);
-       spin_unlock(&mm->page_table_lock);
-#endif
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                int write)
 {
        struct page *page;
@@ -194,7 +194,7 @@ struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 
 #ifdef CONFIG_STACK_GROWSUP
        if (write) {
-               ret = expand_stack_downwards(bprm->vma, pos);
+               ret = expand_downwards(bprm->vma, pos);
                if (ret < 0)
                        return NULL;
        }
@@ -272,7 +272,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
         * use STACK_TOP because that can depend on attributes which aren't
         * configured yet.
         */
-       BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
+       BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
        vma->vm_end = STACK_TOP_MAX;
        vma->vm_start = vma->vm_end - PAGE_SIZE;
        vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
@@ -305,11 +305,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len)
 
 #else
 
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                int write)
 {
        struct page *page;
@@ -399,17 +399,36 @@ err:
 }
 
 struct user_arg_ptr {
-       const char __user *const __user *native;
+#ifdef CONFIG_COMPAT
+       bool is_compat;
+#endif
+       union {
+               const char __user *const __user *native;
+#ifdef CONFIG_COMPAT
+               compat_uptr_t __user *compat;
+#endif
+       } ptr;
 };
 
 static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
 {
-       const char __user *ptr;
+       const char __user *native;
+
+#ifdef CONFIG_COMPAT
+       if (unlikely(argv.is_compat)) {
+               compat_uptr_t compat;
+
+               if (get_user(compat, argv.ptr.compat + nr))
+                       return ERR_PTR(-EFAULT);
+
+               return compat_ptr(compat);
+       }
+#endif
 
-       if (get_user(ptr, argv.native + nr))
+       if (get_user(native, argv.ptr.native + nr))
                return ERR_PTR(-EFAULT);
 
-       return ptr;
+       return native;
 }
 
 /*
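
A note on the user_arg_ptr union above: on a 64-bit kernel a 32-bit task passes argv as a packed array of 32-bit values, so the walk in get_user_arg_ptr() must advance in 4-byte steps, which is exactly what the typed argv.ptr.compat + nr arithmetic gives. A minimal host-side sketch of the two strides (plain C, with compat_uptr_t re-declared locally for illustration; not kernel code):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t compat_uptr_t;	/* mirrors the kernel's 32-bit user pointer */

int main(void)
{
	int nr = 3;

	/* byte offset of argv[nr] under each layout: 24 vs 12 on LP64 */
	printf("native: %zu bytes\n", nr * sizeof(const char *));
	printf("compat: %zu bytes\n", nr * sizeof(compat_uptr_t));
	return 0;
}
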
@@ -419,7 +438,7 @@ static int count(struct user_arg_ptr argv, int max)
 {
        int i = 0;
 
-       if (argv.native != NULL) {
+       if (argv.ptr.native != NULL) {
                for (;;) {
                        const char __user *p = get_user_arg_ptr(argv, i);
 
@@ -542,7 +561,7 @@ int copy_strings_kernel(int argc, const char *const *__argv,
        int r;
        mm_segment_t oldfs = get_fs();
        struct user_arg_ptr argv = {
-               .native = (const char __user *const  __user *)__argv,
+               .ptr.native = (const char __user *const  __user *)__argv,
        };
 
        set_fs(KERNEL_DS);
@@ -575,7 +594,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
        unsigned long length = old_end - old_start;
        unsigned long new_start = old_start - shift;
        unsigned long new_end = old_end - shift;
-       struct mmu_gather *tlb;
+       struct mmu_gather tlb;
 
        BUG_ON(new_start > new_end);
 
@@ -601,12 +620,12 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
                return -ENOMEM;
 
        lru_add_drain();
-       tlb = tlb_gather_mmu(mm, 0);
+       tlb_gather_mmu(&tlb, mm, 0);
        if (new_end > old_start) {
                /*
                 * when the old and new regions overlap, clear from new_end.
                 */
-               free_pgd_range(tlb, new_end, old_end, new_end,
+               free_pgd_range(&tlb, new_end, old_end, new_end,
                        vma->vm_next ? vma->vm_next->vm_start : 0);
        } else {
                /*
@@ -615,10 +634,10 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
                 * have constraints on va-space that make this illegal (IA64) -
                 * for the others it's just a little faster.
                 */
-               free_pgd_range(tlb, old_start, old_end, new_end,
+               free_pgd_range(&tlb, old_start, old_end, new_end,
                        vma->vm_next ? vma->vm_next->vm_start : 0);
        }
-       tlb_finish_mmu(tlb, new_end, old_end);
+       tlb_finish_mmu(&tlb, new_end, old_end);
 
        /*
         * Shrink the vma to just the new range.  Always succeeds.
@@ -824,10 +843,6 @@ static int exec_mmap(struct mm_struct *mm)
        tsk->mm = mm;
        tsk->active_mm = mm;
        activate_mm(active_mm, mm);
-       if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
-               atomic_dec(&old_mm->oom_disable_count);
-               atomic_inc(&tsk->mm->oom_disable_count);
-       }
        task_unlock(tsk);
        arch_pick_mmap_layout(mm);
        if (old_mm) {
@@ -939,9 +954,18 @@ static int de_thread(struct task_struct *tsk)
                leader->group_leader = tsk;
 
                tsk->exit_signal = SIGCHLD;
+               leader->exit_signal = -1;
 
                BUG_ON(leader->exit_state != EXIT_ZOMBIE);
                leader->exit_state = EXIT_DEAD;
+
+               /*
+                * We are going to release_task()->ptrace_unlink() silently,
+                * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
+                * the tracer won't block again waiting for this thread.
+                */
+               if (unlikely(leader->ptrace))
+                       __wake_up_parent(leader, leader->parent);
                write_unlock_irq(&tasklist_lock);
 
                release_task(leader);
@@ -1026,11 +1050,14 @@ char *get_task_comm(char *buf, struct task_struct *tsk)
        task_unlock(tsk);
        return buf;
 }
+EXPORT_SYMBOL_GPL(get_task_comm);
 
 void set_task_comm(struct task_struct *tsk, char *buf)
 {
        task_lock(tsk);
 
+       trace_task_rename(tsk, buf);
+
        /*
         * Threads may access current->comm without holding
         * the task lock, so write the string carefully.
@@ -1044,6 +1071,21 @@ void set_task_comm(struct task_struct *tsk, char *buf)
        perf_event_comm(tsk);
 }
 
+static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
+{
+       int i, ch;
+
+       /* Copies the binary name from after the last slash */
+       for (i = 0; (ch = *(fn++)) != '\0';) {
+               if (ch == '/')
+                       i = 0; /* overwrite what we wrote */
+               else
+                       if (i < len - 1)
+                               tcomm[i++] = ch;
+       }
+       tcomm[i] = '\0';
+}
+
 int flush_old_exec(struct linux_binprm * bprm)
 {
        int retval;
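
The new filename_to_taskname() helper is easy to exercise in isolation. A userspace sketch (helper body copied from the hunk above; TASK_COMM_LEN hard-coded to the kernel's 16) shows the basename extraction and the 15-character truncation:

#include <stdio.h>

#define TASK_COMM_LEN 16	/* kernel value: 15 characters plus NUL */

static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
{
	int i, ch;

	for (i = 0; (ch = *(fn++)) != '\0';) {
		if (ch == '/')
			i = 0;			/* restart after each slash */
		else if (i < len - 1)
			tcomm[i++] = ch;
	}
	tcomm[i] = '\0';
}

int main(void)
{
	char comm[TASK_COMM_LEN];

	filename_to_taskname(comm, "/usr/bin/python2.7", sizeof(comm));
	printf("%s\n", comm);	/* "python2.7" */
	filename_to_taskname(comm, "/opt/a-very-long-binary-name", sizeof(comm));
	printf("%s\n", comm);	/* "a-very-long-bin" (truncated to 15) */
	return 0;
}
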
@@ -1058,6 +1100,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 
        set_mm_exe_file(bprm->mm, bprm->file);
 
+       filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm));
        /*
         * Release all of the old mmap stuff
         */
@@ -1068,6 +1111,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 
        bprm->mm = NULL;                /* We're using it now */
 
+       set_fs(USER_DS);
        current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
        flush_thread();
        current->personality &= ~bprm->per_clear;
@@ -1079,12 +1123,15 @@ out:
 }
 EXPORT_SYMBOL(flush_old_exec);
 
-void setup_new_exec(struct linux_binprm * bprm)
+void would_dump(struct linux_binprm *bprm, struct file *file)
 {
-       int i, ch;
-       const char *name;
-       char tcomm[sizeof(current->comm)];
+       if (inode_permission(file->f_path.dentry->d_inode, MAY_READ) < 0)
+               bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+}
+EXPORT_SYMBOL(would_dump);
 
+void setup_new_exec(struct linux_binprm * bprm)
+{
        arch_pick_mmap_layout(current->mm);
 
        /* This is the point of no return */
@@ -1095,18 +1142,7 @@ void setup_new_exec(struct linux_binprm * bprm)
        else
                set_dumpable(current->mm, suid_dumpable);
 
-       name = bprm->filename;
-
-       /* Copies the binary name from after last slash */
-       for (i=0; (ch = *(name++)) != '\0';) {
-               if (ch == '/')
-                       i = 0; /* overwrite what we wrote */
-               else
-                       if (i < (sizeof(tcomm) - 1))
-                               tcomm[i++] = ch;
-       }
-       tcomm[i] = '\0';
-       set_task_comm(current, tcomm);
+       set_task_comm(current, bprm->tcomm);
 
        /* Set the new mm task size. We have to do that late because it may
         * depend on TIF_32BIT which is only updated in flush_thread() on
@@ -1118,9 +1154,10 @@ void setup_new_exec(struct linux_binprm * bprm)
        if (bprm->cred->uid != current_euid() ||
            bprm->cred->gid != current_egid()) {
                current->pdeath_signal = 0;
-       } else if (file_permission(bprm->file, MAY_READ) ||
-                  bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) {
-               set_dumpable(current->mm, suid_dumpable);
+       } else {
+               would_dump(bprm, bprm->file);
+               if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
+                       set_dumpable(current->mm, suid_dumpable);
        }
 
        /*
@@ -1193,13 +1230,18 @@ EXPORT_SYMBOL(install_exec_creds);
  * - the caller must hold ->cred_guard_mutex to protect against
  *   PTRACE_ATTACH
  */
-int check_unsafe_exec(struct linux_binprm *bprm)
+static int check_unsafe_exec(struct linux_binprm *bprm)
 {
        struct task_struct *p = current, *t;
        unsigned n_fs;
        int res = 0;
 
-       bprm->unsafe = tracehook_unsafe_exec(p);
+       if (p->ptrace) {
+               if (p->ptrace & PT_PTRACE_CAP)
+                       bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
+               else
+                       bprm->unsafe |= LSM_UNSAFE_PTRACE;
+       }
 
        n_fs = 1;
        spin_lock(&p->fs->lock);
@@ -1327,19 +1369,21 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
        unsigned int depth = bprm->recursion_depth;
        int try,retval;
        struct linux_binfmt *fmt;
+       pid_t old_pid;
 
        retval = security_bprm_check(bprm);
        if (retval)
                return retval;
 
-       /* kernel module loader fixup */
-       /* so we don't try to load run modprobe in kernel space. */
-       set_fs(USER_DS);
-
        retval = audit_bprm(bprm);
        if (retval)
                return retval;
 
+       /* Need to fetch pid before load_binary changes it */
+       rcu_read_lock();
+       old_pid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
+       rcu_read_unlock();
+
        retval = -ENOENT;
        for (try=0; try<2; try++) {
                read_lock(&binfmt_lock);
@@ -1359,7 +1403,8 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
                        bprm->recursion_depth = depth;
                        if (retval >= 0) {
                                if (depth == 0)
-                                       tracehook_report_exec(fmt, bprm, regs);
+                                       ptrace_event(PTRACE_EVENT_EXEC,
+                                                       old_pid);
                                put_binfmt(fmt);
                                allow_write_access(bprm->file);
                                if (bprm->file)
@@ -1379,9 +1424,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
                        }
                }
                read_unlock(&binfmt_lock);
+#ifdef CONFIG_MODULES
                if (retval != -ENOEXEC || bprm->mm == NULL) {
                        break;
-#ifdef CONFIG_MODULES
                } else {
 #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
                        if (printable(bprm->buf[0]) &&
@@ -1389,9 +1434,13 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
                            printable(bprm->buf[2]) &&
                            printable(bprm->buf[3]))
                                break; /* -ENOEXEC */
+                       if (try)
+                               break; /* -ENOEXEC */
                        request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
-#endif
                }
+#else
+               break;
+#endif
        }
        return retval;
 }
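
On the CONFIG_MODULES path above, an image whose first bytes don't look like text triggers request_module() with an alias built from bytes 2-3 of the header. A quick host-side check of the alias formatting (the header bytes here are made up for illustration):

#include <stdio.h>

int main(void)
{
	unsigned char buf[4] = { 0x7f, 'E', 0x64, 0x86 };	/* hypothetical header */
	char alias[16];

	/* same formatting as the request_module() call above */
	snprintf(alias, sizeof(alias), "binfmt-%04x",
		 *(unsigned short *)&buf[2]);
	printf("%s\n", alias);	/* "binfmt-8664" on a little-endian host */
	return 0;
}
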
@@ -1411,6 +1460,23 @@ static int do_execve_common(const char *filename,
        struct files_struct *displaced;
        bool clear_in_exec;
        int retval;
+       const struct cred *cred = current_cred();
+
+       /*
+        * We move the actual failure in case of RLIMIT_NPROC excess from
+        * set*uid() to execve() because too many poorly written programs
+        * don't check setuid() return code.  Here we additionally recheck
+        * whether NPROC limit is still exceeded.
+        */
+       if ((current->flags & PF_NPROC_EXCEEDED) &&
+           atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
+               retval = -EAGAIN;
+               goto out_ret;
+       }
+
+       /* We're below the limit (still or again), so we don't want to make
+        * further execve() calls fail. */
+       current->flags &= ~PF_NPROC_EXCEEDED;
 
        retval = unshare_files(&displaced);
        if (retval)
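
For context, the failure mode this recheck closes is the classic unchecked-setuid pattern in userspace, roughly as below (a hypothetical sketch, not code from this tree):

#include <unistd.h>

/* BROKEN: if uid is already at its RLIMIT_NPROC, setuid() fails with
 * EAGAIN and execve() would otherwise run the program with the caller's
 * full privileges. The execve()-time recheck above makes the exec fail
 * with -EAGAIN instead. */
int spawn_as(uid_t uid, char *const argv[], char *const envp[])
{
	setuid(uid);		/* return value wrongly ignored */
	return execve(argv[0], argv, envp);
}
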
@@ -1516,11 +1582,29 @@ int do_execve(const char *filename,
        const char __user *const __user *__envp,
        struct pt_regs *regs)
 {
-       struct user_arg_ptr argv = { .native = __argv };
-       struct user_arg_ptr envp = { .native = __envp };
+       struct user_arg_ptr argv = { .ptr.native = __argv };
+       struct user_arg_ptr envp = { .ptr.native = __envp };
        return do_execve_common(filename, argv, envp, regs);
 }
 
+#ifdef CONFIG_COMPAT
+int compat_do_execve(char *filename,
+       compat_uptr_t __user *__argv,
+       compat_uptr_t __user *__envp,
+       struct pt_regs *regs)
+{
+       struct user_arg_ptr argv = {
+               .is_compat = true,
+               .ptr.compat = __argv,
+       };
+       struct user_arg_ptr envp = {
+               .is_compat = true,
+               .ptr.compat = __envp,
+       };
+       return do_execve_common(filename, argv, envp, regs);
+}
+#endif
+
 void set_binfmt(struct linux_binfmt *new)
 {
        struct mm_struct *mm = current->mm;
@@ -1580,6 +1664,50 @@ expand_fail:
        return ret;
 }
 
+static void cn_escape(char *str)
+{
+       for (; *str; str++)
+               if (*str == '/')
+                       *str = '!';
+}
+
+static int cn_print_exe_file(struct core_name *cn)
+{
+       struct file *exe_file;
+       char *pathbuf, *path;
+       int ret;
+
+       exe_file = get_mm_exe_file(current->mm);
+       if (!exe_file) {
+               char *commstart = cn->corename + cn->used;
+               ret = cn_printf(cn, "%s (path unknown)", current->comm);
+               cn_escape(commstart);
+               return ret;
+       }
+
+       pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
+       if (!pathbuf) {
+               ret = -ENOMEM;
+               goto put_exe_file;
+       }
+
+       path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
+       if (IS_ERR(path)) {
+               ret = PTR_ERR(path);
+               goto free_buf;
+       }
+
+       cn_escape(path);
+
+       ret = cn_printf(cn, "%s", path);
+
+free_buf:
+       kfree(pathbuf);
+put_exe_file:
+       fput(exe_file);
+       return ret;
+}
+
 /* format_corename will inspect the pattern parameter, and output a
  * name into corename, which must have space for at least
  * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
@@ -1641,15 +1769,24 @@ static int format_corename(struct core_name *cn, long signr)
                                break;
                        }
                        /* hostname */
-                       case 'h':
+                       case 'h': {
+                               char *namestart = cn->corename + cn->used;
                                down_read(&uts_sem);
                                err = cn_printf(cn, "%s",
                                              utsname()->nodename);
                                up_read(&uts_sem);
+                               cn_escape(namestart);
                                break;
+                       }
                        /* executable */
-                       case 'e':
+                       case 'e': {
+                               char *commstart = cn->corename + cn->used;
                                err = cn_printf(cn, "%s", current->comm);
+                               cn_escape(commstart);
+                               break;
+                       }
+                       case 'E':
+                               err = cn_print_exe_file(cn);
                                break;
                        /* core limit size */
                        case 'c':
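
The new '%E' specifier ties into cn_print_exe_file() and cn_escape() above: with core_pattern set to, say, "core.%E", a crash of /usr/local/bin/crashy yields "core.!usr!local!bin!crashy". A host-side sketch of just the escaping step (helper copied from above):

#include <stdio.h>

/* same as the cn_escape() helper above: a raw '/' would add path components */
static void cn_escape(char *str)
{
	for (; *str; str++)
		if (*str == '/')
			*str = '!';
}

int main(void)
{
	char path[] = "/usr/local/bin/crashy";

	cn_escape(path);
	printf("core.%s\n", path);	/* "core.!usr!local!bin!crashy" */
	return 0;
}
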
@@ -1691,6 +1828,7 @@ static int zap_process(struct task_struct *start, int exit_code)
 
        t = start;
        do {
+               task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
                if (t != current && t->mm) {
                        sigaddset(&t->pending.signal, SIGKILL);
                        signal_wake_up(t, 1);
@@ -1917,7 +2055,7 @@ static void wait_for_dump_helpers(struct file *file)
  * is a special value that we use to trap recursive
  * core dumps
  */
-static int umh_pipe_setup(struct subprocess_info *info)
+static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 {
        struct file *rp, *wp;
        struct fdtable *fdt;
@@ -2010,16 +2148,16 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 
        ispipe = format_corename(&cn, signr);
 
-       if (ispipe == -ENOMEM) {
-               printk(KERN_WARNING "format_corename failed\n");
-               printk(KERN_WARNING "Aborting core\n");
-               goto fail_corename;
-       }
-
        if (ispipe) {
                int dump_count;
                char **helper_argv;
 
+               if (ispipe < 0) {
+                       printk(KERN_WARNING "format_corename failed\n");
+                       printk(KERN_WARNING "Aborting core\n");
+                       goto fail_corename;
+               }
+
                if (cprm.limit == 1) {
                        /*
                         * Normally core limits are irrelevant to pipes, since