Merge branch 'smp-hotplug-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[linux-3.10.git] / arch / x86 / kernel / vsyscall_64.c
index dda7dff..9a907a6 100644 (file)
@@ -18,8 +18,7 @@
  *  use the vDSO.
  */
 
-/* Disable profiling for userspace code: */
-#define DISABLE_BRANCH_PROFILING
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/time.h>
 #include <linux/init.h>
@@ -28,7 +27,8 @@
 #include <linux/seqlock.h>
 #include <linux/jiffies.h>
 #include <linux/sysctl.h>
-#include <linux/clocksource.h>
+#include <linux/topology.h>
+#include <linux/timekeeper_internal.h>
 #include <linux/getcpu.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <asm/vgtod.h>
 #include <asm/traps.h>
 
+#define CREATE_TRACE_POINTS
+#include "vsyscall_trace.h"
+
 DEFINE_VVAR(int, vgetcpu_mode);
-DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
+
+static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
+
+static int __init vsyscall_setup(char *str)
 {
-       .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
-};
+       if (str) {
+               if (!strcmp("emulate", str))
+                       vsyscall_mode = EMULATE;
+               else if (!strcmp("native", str))
+                       vsyscall_mode = NATIVE;
+               else if (!strcmp("none", str))
+                       vsyscall_mode = NONE;
+               else
+                       return -EINVAL;
+
+               return 0;
+       }
+
+       return -EINVAL;
+}
+early_param("vsyscall", vsyscall_setup);
 
 void update_vsyscall_tz(void)
 {
-       unsigned long flags;
-
-       write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
-       /* sys_tz has changed */
        vsyscall_gtod_data.sys_tz = sys_tz;
-       write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
-void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
-                       struct clocksource *clock, u32 mult)
+void update_vsyscall(struct timekeeper *tk)
 {
-       unsigned long flags;
+       struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
 
-       write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+       write_seqcount_begin(&vdata->seq);
 
        /* copy vsyscall data */
-       vsyscall_gtod_data.clock.vclock_mode    = clock->archdata.vclock_mode;
-       vsyscall_gtod_data.clock.cycle_last     = clock->cycle_last;
-       vsyscall_gtod_data.clock.mask           = clock->mask;
-       vsyscall_gtod_data.clock.mult           = mult;
-       vsyscall_gtod_data.clock.shift          = clock->shift;
-       vsyscall_gtod_data.wall_time_sec        = wall_time->tv_sec;
-       vsyscall_gtod_data.wall_time_nsec       = wall_time->tv_nsec;
-       vsyscall_gtod_data.wall_to_monotonic    = *wtm;
-       vsyscall_gtod_data.wall_time_coarse     = __current_kernel_time();
-
-       write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
+       vdata->clock.vclock_mode        = tk->clock->archdata.vclock_mode;
+       vdata->clock.cycle_last         = tk->clock->cycle_last;
+       vdata->clock.mask               = tk->clock->mask;
+       vdata->clock.mult               = tk->mult;
+       vdata->clock.shift              = tk->shift;
+
+       vdata->wall_time_sec            = tk->xtime_sec;
+       vdata->wall_time_snsec          = tk->xtime_nsec;
+
+       vdata->monotonic_time_sec       = tk->xtime_sec
+                                       + tk->wall_to_monotonic.tv_sec;
+       vdata->monotonic_time_snsec     = tk->xtime_nsec
+                                       + (tk->wall_to_monotonic.tv_nsec
+                                               << tk->shift);
+       while (vdata->monotonic_time_snsec >=
+                                       (((u64)NSEC_PER_SEC) << tk->shift)) {
+               vdata->monotonic_time_snsec -=
+                                       ((u64)NSEC_PER_SEC) << tk->shift;
+               vdata->monotonic_time_sec++;
+       }
+
+       vdata->wall_time_coarse.tv_sec  = tk->xtime_sec;
+       vdata->wall_time_coarse.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
+
+       vdata->monotonic_time_coarse    = timespec_add(vdata->wall_time_coarse,
+                                                       tk->wall_to_monotonic);
+
+       write_seqcount_end(&vdata->seq);
 }
 
 static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
                              const char *message)
 {
-       static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
-       struct task_struct *tsk;
-
-       if (!show_unhandled_signals || !__ratelimit(&rs))
+       if (!show_unhandled_signals)
                return;
 
-       tsk = current;
-
-       printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
-              level, tsk->comm, task_pid_nr(tsk),
-              message, regs->ip - 2, regs->cs,
-              regs->sp, regs->ax, regs->si, regs->di);
+       pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+                             level, current->comm, task_pid_nr(current),
+                             message, regs->ip, regs->cs,
+                             regs->sp, regs->ax, regs->si, regs->di);
 }
 
 static int addr_to_vsyscall_nr(unsigned long addr)
@@ -118,53 +145,138 @@ static int addr_to_vsyscall_nr(unsigned long addr)
        return nr;
 }
 
-void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code)
+static bool write_ok_or_segv(unsigned long ptr, size_t size)
+{
+       /*
+        * XXX: if access_ok, get_user, and put_user handled
+        * sig_on_uaccess_error, this could go away.
+        */
+
+       if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
+               siginfo_t info;
+               struct thread_struct *thread = &current->thread;
+
+               thread->error_code      = 6;  /* user fault, no page, write */
+               thread->cr2             = ptr;
+               thread->trap_nr         = X86_TRAP_PF;
+
+               memset(&info, 0, sizeof(info));
+               info.si_signo           = SIGSEGV;
+               info.si_errno           = 0;
+               info.si_code            = SEGV_MAPERR;
+               info.si_addr            = (void __user *)ptr;
+
+               force_sig_info(SIGSEGV, &info, current);
+               return false;
+       } else {
+               return true;
+       }
+}
+
+bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 {
        struct task_struct *tsk;
        unsigned long caller;
-       int vsyscall_nr;
+       int vsyscall_nr, syscall_nr, tmp;
+       int prev_sig_on_uaccess_error;
        long ret;
 
-       local_irq_enable();
-
        /*
-        * Real 64-bit user mode code has cs == __USER_CS.  Anything else
-        * is bogus.
+        * No point in checking CS -- the only way to get here is a user mode
+        * trap to a high address, which means that we're in 64-bit user code.
         */
-       if (regs->cs != __USER_CS) {
-               /*
-                * If we trapped from kernel mode, we might as well OOPS now
-                * instead of returning to some random address and OOPSing
-                * then.
-                */
-               BUG_ON(!user_mode(regs));
 
-               /* Compat mode and non-compat 32-bit CS should both segfault. */
-               warn_bad_vsyscall(KERN_WARNING, regs,
-                                 "illegal int 0xcc from 32-bit mode");
-               goto sigsegv;
+       WARN_ON_ONCE(address != regs->ip);
+
+       if (vsyscall_mode == NONE) {
+               warn_bad_vsyscall(KERN_INFO, regs,
+                                 "vsyscall attempted with vsyscall=none");
+               return false;
        }
 
-       /*
-        * x86-ism here: regs->ip points to the instruction after the int 0xcc,
-        * and int 0xcc is two bytes long.
-        */
-       vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2);
+       vsyscall_nr = addr_to_vsyscall_nr(address);
+
+       trace_emulate_vsyscall(vsyscall_nr);
+
        if (vsyscall_nr < 0) {
                warn_bad_vsyscall(KERN_WARNING, regs,
-                                 "illegal int 0xcc (exploit attempt?)");
+                                 "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
                goto sigsegv;
        }
 
        if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
-               warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)");
+               warn_bad_vsyscall(KERN_WARNING, regs,
+                                 "vsyscall with bad stack (exploit attempt?)");
                goto sigsegv;
        }
 
        tsk = current;
-       if (seccomp_mode(&tsk->seccomp))
-               do_exit(SIGKILL);
 
+       /*
+        * Check for access_ok violations and find the syscall nr.
+        *
+        * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
+        * 64-bit, so we don't need to special-case it here.  For all the
+        * vsyscalls, NULL means "don't write anything" not "write it at
+        * address 0".
+        */
+       switch (vsyscall_nr) {
+       case 0:
+               if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
+                   !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
+                       ret = -EFAULT;
+                       goto check_fault;
+               }
+
+               syscall_nr = __NR_gettimeofday;
+               break;
+
+       case 1:
+               if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
+                       ret = -EFAULT;
+                       goto check_fault;
+               }
+
+               syscall_nr = __NR_time;
+               break;
+
+       case 2:
+               if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+                   !write_ok_or_segv(regs->si, sizeof(unsigned))) {
+                       ret = -EFAULT;
+                       goto check_fault;
+               }
+
+               syscall_nr = __NR_getcpu;
+               break;
+       }
+
+       /*
+        * Handle seccomp.  regs->ip must be the original value.
+        * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
+        *
+        * We could optimize the seccomp disabled case, but performance
+        * here doesn't matter.
+        */
+       regs->orig_ax = syscall_nr;
+       regs->ax = -ENOSYS;
+       tmp = secure_computing(syscall_nr);
+       if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
+               warn_bad_vsyscall(KERN_DEBUG, regs,
+                                 "seccomp tried to change syscall nr or ip");
+               do_exit(SIGSYS);
+       }
+       if (tmp)
+               goto do_ret;  /* skip requested */
+
+       /*
+        * With a real vsyscall, page faults cause SIGSEGV.  We want to
+        * preserve that behavior to make writing exploits harder.
+        */
+       prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
+       current_thread_info()->sig_on_uaccess_error = 1;
+
+       ret = -EFAULT;
        switch (vsyscall_nr) {
        case 0:
                ret = sys_gettimeofday(
@@ -179,36 +291,40 @@ void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code)
        case 2:
                ret = sys_getcpu((unsigned __user *)regs->di,
                                 (unsigned __user *)regs->si,
-                                0);
+                                NULL);
                break;
        }
 
+       current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
+
+check_fault:
        if (ret == -EFAULT) {
-               /*
-                * Bad news -- userspace fed a bad pointer to a vsyscall.
-                *
-                * With a real vsyscall, that would have caused SIGSEGV.
-                * To make writing reliable exploits using the emulated
-                * vsyscalls harder, generate SIGSEGV here as well.
-                */
+               /* Bad news -- userspace fed a bad pointer to a vsyscall. */
                warn_bad_vsyscall(KERN_INFO, regs,
                                  "vsyscall fault (exploit attempt?)");
-               goto sigsegv;
+
+               /*
+                * If we failed to generate a signal for any reason,
+                * generate one here.  (This should be impossible.)
+                */
+               if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
+                                !sigismember(&tsk->pending.signal, SIGSEGV)))
+                       goto sigsegv;
+
+               return true;  /* Don't emulate the ret. */
        }
 
        regs->ax = ret;
 
+do_ret:
        /* Emulate a ret instruction. */
        regs->ip = caller;
        regs->sp += 8;
-
-       local_irq_disable();
-       return;
+       return true;
 
 sigsegv:
-       regs->ip -= 2;  /* The faulting instruction should be the int 0xcc. */
        force_sig(SIGSEGV, current);
-       local_irq_disable();
+       return true;
 }
 
 /*
@@ -256,15 +372,21 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
 
 void __init map_vsyscall(void)
 {
-       extern char __vsyscall_0;
-       unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
+       extern char __vsyscall_page;
+       unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
        extern char __vvar_page;
        unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
 
-       /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
-       __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
+       __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall,
+                    vsyscall_mode == NATIVE
+                    ? PAGE_KERNEL_VSYSCALL
+                    : PAGE_KERNEL_VVAR);
+       BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) !=
+                    (unsigned long)VSYSCALL_START);
+
        __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
-       BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS);
+       BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
+                    (unsigned long)VVAR_ADDRESS);
 }
 
 static int __init vsyscall_init(void)