Merge branch 'linus' into timers/core
Thomas Gleixner [Mon, 10 May 2010 09:59:37 +0000 (11:59 +0200)]
Reason: Further posix_cpu_timer patches depend on mainline changes

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

1  2 
Documentation/feature-removal-schedule.txt
ipc/mqueue.c
kernel/posix-cpu-timers.c
kernel/time.c
kernel/time/timekeeping.c
kernel/timer.c

@@@ -564,16 -564,6 +564,16 @@@ Who:     Avi Kivity <avi@redhat.com
  
  ----------------------------
  
 +What: xtime, wall_to_monotonic
 +When: 2.6.36+
 +Files:        kernel/time/timekeeping.c include/linux/time.h
 +Why:  Cleaning up timekeeping internal values. Please use
 +      existing timekeeping accessor functions to access
 +      the equivalent functionality.
 +Who:  John Stultz <johnstul@us.ibm.com>
 +
 +----------------------------
 +
  What: KVM kernel-allocated memory slots
  When: July 2010
  Why:  Since 2.6.25, kvm supports user-allocated memory slots, which are
@@@ -599,3 -589,26 +599,26 @@@ Why:     Useful in 2003, implementation is 
        Generally invoked by accident today.
        Seen as doing more harm than good.
  Who:  Len Brown <len.brown@intel.com>
+ ----------------------------
+ What: video4linux /dev/vtx teletext API support
+ When: 2.6.35
+ Files:        drivers/media/video/saa5246a.c drivers/media/video/saa5249.c
+       include/linux/videotext.h
+ Why:  The vtx device nodes have been superseded by vbi device nodes
+       for many years. No applications exist that use the vtx support.
+       Of the two i2c drivers that actually support this API the saa5249
+       has been impossible to use for a year now and no known hardware
+       that supports this device exists. The saa5246a is theoretically
+       supported by the old mxb boards, but it never actually worked.
+       In summary: there is no hardware that can use this API and there
+       are no applications actually implementing this API.
+       The vtx support still reserves minors 192-223 and we would really
+       like to reuse those for upcoming new functionality. In the unlikely
+       event that new hardware appears that wants to use the functionality
+       provided by the vtx API, then that functionality should be built
+       around the sliced VBI API instead.
+ Who:  Hans Verkuil <hverkuil@xs4all.nl>
diff --combined ipc/mqueue.c
@@@ -32,6 -32,7 +32,7 @@@
  #include <linux/nsproxy.h>
  #include <linux/pid.h>
  #include <linux/ipc_namespace.h>
+ #include <linux/slab.h>
  
  #include <net/sock.h>
  #include "util.h"
@@@ -428,7 -429,7 +429,7 @@@ static void wq_add(struct mqueue_inode_
   * sr: SEND or RECV
   */
  static int wq_sleep(struct mqueue_inode_info *info, int sr,
 -                      long timeout, struct ext_wait_queue *ewp)
 +                  ktime_t *timeout, struct ext_wait_queue *ewp)
  {
        int retval;
        signed long time;
                set_current_state(TASK_INTERRUPTIBLE);
  
                spin_unlock(&info->lock);
 -              time = schedule_timeout(timeout);
 +              time = schedule_hrtimeout_range_clock(timeout,
 +                  HRTIMER_MODE_ABS, 0, CLOCK_REALTIME);
  
                while (ewp->state == STATE_PENDING)
                        cpu_relax();
@@@ -552,16 -552,31 +553,16 @@@ static void __do_notify(struct mqueue_i
        wake_up(&info->wait_q);
  }
  
 -static long prepare_timeout(struct timespec *p)
 +static int prepare_timeout(const struct timespec __user *u_abs_timeout,
 +                         ktime_t *expires, struct timespec *ts)
  {
 -      struct timespec nowts;
 -      long timeout;
 -
 -      if (p) {
 -              if (unlikely(p->tv_nsec < 0 || p->tv_sec < 0
 -                      || p->tv_nsec >= NSEC_PER_SEC))
 -                      return -EINVAL;
 -              nowts = CURRENT_TIME;
 -              /* first subtract as jiffies can't be too big */
 -              p->tv_sec -= nowts.tv_sec;
 -              if (p->tv_nsec < nowts.tv_nsec) {
 -                      p->tv_nsec += NSEC_PER_SEC;
 -                      p->tv_sec--;
 -              }
 -              p->tv_nsec -= nowts.tv_nsec;
 -              if (p->tv_sec < 0)
 -                      return 0;
 -
 -              timeout = timespec_to_jiffies(p) + 1;
 -      } else
 -              return MAX_SCHEDULE_TIMEOUT;
 +      if (copy_from_user(ts, u_abs_timeout, sizeof(struct timespec)))
 +              return -EFAULT;
 +      if (!timespec_valid(ts))
 +              return -EINVAL;
  
 -      return timeout;
 +      *expires = timespec_to_ktime(*ts);
 +      return 0;
  }
  
  static void remove_notification(struct mqueue_inode_info *info)
@@@ -847,21 -862,22 +848,21 @@@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mq
        struct ext_wait_queue *receiver;
        struct msg_msg *msg_ptr;
        struct mqueue_inode_info *info;
 -      struct timespec ts, *p = NULL;
 -      long timeout;
 +      ktime_t expires, *timeout = NULL;
 +      struct timespec ts;
        int ret;
  
        if (u_abs_timeout) {
 -              if (copy_from_user(&ts, u_abs_timeout, 
 -                                      sizeof(struct timespec)))
 -                      return -EFAULT;
 -              p = &ts;
 +              int res = prepare_timeout(u_abs_timeout, &expires, &ts);
 +              if (res)
 +                      return res;
 +              timeout = &expires;
        }
  
        if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
                return -EINVAL;
  
 -      audit_mq_sendrecv(mqdes, msg_len, msg_prio, p);
 -      timeout = prepare_timeout(p);
 +      audit_mq_sendrecv(mqdes, msg_len, msg_prio, timeout ? &ts : NULL);
  
        filp = fget(mqdes);
        if (unlikely(!filp)) {
                if (filp->f_flags & O_NONBLOCK) {
                        spin_unlock(&info->lock);
                        ret = -EAGAIN;
 -              } else if (unlikely(timeout < 0)) {
 -                      spin_unlock(&info->lock);
 -                      ret = timeout;
                } else {
                        wait.task = current;
                        wait.msg = (void *) msg_ptr;
@@@ -935,23 -954,24 +936,23 @@@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t
                size_t, msg_len, unsigned int __user *, u_msg_prio,
                const struct timespec __user *, u_abs_timeout)
  {
 -      long timeout;
        ssize_t ret;
        struct msg_msg *msg_ptr;
        struct file *filp;
        struct inode *inode;
        struct mqueue_inode_info *info;
        struct ext_wait_queue wait;
 -      struct timespec ts, *p = NULL;
 +      ktime_t expires, *timeout = NULL;
 +      struct timespec ts;
  
        if (u_abs_timeout) {
 -              if (copy_from_user(&ts, u_abs_timeout, 
 -                                      sizeof(struct timespec)))
 -                      return -EFAULT;
 -              p = &ts;
 +              int res = prepare_timeout(u_abs_timeout, &expires, &ts);
 +              if (res)
 +                      return res;
 +              timeout = &expires;
        }
  
 -      audit_mq_sendrecv(mqdes, msg_len, 0, p);
 -      timeout = prepare_timeout(p);
 +      audit_mq_sendrecv(mqdes, msg_len, 0, timeout ? &ts : NULL);
  
        filp = fget(mqdes);
        if (unlikely(!filp)) {
                if (filp->f_flags & O_NONBLOCK) {
                        spin_unlock(&info->lock);
                        ret = -EAGAIN;
 -                      msg_ptr = NULL;
 -              } else if (unlikely(timeout < 0)) {
 -                      spin_unlock(&info->lock);
 -                      ret = timeout;
 -                      msg_ptr = NULL;
                } else {
                        wait.task = current;
                        wait.state = STATE_NONE;
  #include <trace/events/timer.h>
  
  /*
 - * Called after updating RLIMIT_CPU to set timer expiration if necessary.
 + * Called after updating RLIMIT_CPU to run cpu timer and update
 + * tsk->signal->cputime_expires expiration cache if necessary. Needs
 + * siglock protection since other code may update expiration cache as
 + * well.
   */
  void update_rlimit_cpu(unsigned long rlim_new)
  {
        cputime_t cputime = secs_to_cputime(rlim_new);
 -      struct signal_struct *const sig = current->signal;
  
 -      if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) ||
 -          cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) {
 -              spin_lock_irq(&current->sighand->siglock);
 -              set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
 -              spin_unlock_irq(&current->sighand->siglock);
 -      }
 +      spin_lock_irq(&current->sighand->siglock);
 +      set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
 +      spin_unlock_irq(&current->sighand->siglock);
  }
  
  static int check_clock(const clockid_t which_clock)
@@@ -547,62 -548,111 +547,62 @@@ static inline int expires_gt(cputime_t 
               cputime_gt(expires, new_exp);
  }
  
  /*
   * Insert the timer on the appropriate list before any timers that
   * expire later.  This must be called with the tasklist_lock held
 - * for reading, and interrupts disabled.
 + * for reading, interrupts disabled and p->sighand->siglock taken.
   */
 -static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 +static void arm_timer(struct k_itimer *timer)
  {
        struct task_struct *p = timer->it.cpu.task;
        struct list_head *head, *listpos;
 +      struct task_cputime *cputime_expires;
        struct cpu_timer_list *const nt = &timer->it.cpu;
        struct cpu_timer_list *next;
 -      unsigned long i;
  
 -      head = (CPUCLOCK_PERTHREAD(timer->it_clock) ?
 -              p->cpu_timers : p->signal->cpu_timers);
 +      if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
 +              head = p->cpu_timers;
 +              cputime_expires = &p->cputime_expires;
 +      } else {
 +              head = p->signal->cpu_timers;
 +              cputime_expires = &p->signal->cputime_expires;
 +      }
        head += CPUCLOCK_WHICH(timer->it_clock);
  
 -      BUG_ON(!irqs_disabled());
 -      spin_lock(&p->sighand->siglock);
 -
        listpos = head;
 -      if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
 -              list_for_each_entry(next, head, entry) {
 -                      if (next->expires.sched > nt->expires.sched)
 -                              break;
 -                      listpos = &next->entry;
 -              }
 -      } else {
 -              list_for_each_entry(next, head, entry) {
 -                      if (cputime_gt(next->expires.cpu, nt->expires.cpu))
 -                              break;
 -                      listpos = &next->entry;
 -              }
 +      list_for_each_entry(next, head, entry) {
 +              if (cpu_time_before(timer->it_clock, nt->expires, next->expires))
 +                      break;
 +              listpos = &next->entry;
        }
        list_add(&nt->entry, listpos);
  
        if (listpos == head) {
 +              union cpu_time_count *exp = &nt->expires;
 +
                /*
 -               * We are the new earliest-expiring timer.
 -               * If we are a thread timer, there can always
 -               * be a process timer telling us to stop earlier.
 +               * We are the new earliest-expiring POSIX 1.b timer, hence
 +               * need to update expiration cache. Take into account that
 +               * for process timers we share expiration cache with itimers
 +               * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
                 */
  
 -              if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
 -                      union cpu_time_count *exp = &nt->expires;
 -
 -                      switch (CPUCLOCK_WHICH(timer->it_clock)) {
 -                      default:
 -                              BUG();
 -                      case CPUCLOCK_PROF:
 -                              if (expires_gt(p->cputime_expires.prof_exp,
 -                                             exp->cpu))
 -                                      p->cputime_expires.prof_exp = exp->cpu;
 -                              break;
 -                      case CPUCLOCK_VIRT:
 -                              if (expires_gt(p->cputime_expires.virt_exp,
 -                                             exp->cpu))
 -                                      p->cputime_expires.virt_exp = exp->cpu;
 -                              break;
 -                      case CPUCLOCK_SCHED:
 -                              if (p->cputime_expires.sched_exp == 0 ||
 -                                  p->cputime_expires.sched_exp > exp->sched)
 -                                      p->cputime_expires.sched_exp =
 -                                                              exp->sched;
 -                              break;
 -                      }
 -              } else {
 -                      struct signal_struct *const sig = p->signal;
 -                      union cpu_time_count *exp = &timer->it.cpu.expires;
 -
 -                      /*
 -                       * For a process timer, set the cached expiration time.
 -                       */
 -                      switch (CPUCLOCK_WHICH(timer->it_clock)) {
 -                      default:
 -                              BUG();
 -                      case CPUCLOCK_VIRT:
 -                              if (expires_le(sig->it[CPUCLOCK_VIRT].expires,
 -                                             exp->cpu))
 -                                      break;
 -                              sig->cputime_expires.virt_exp = exp->cpu;
 -                              break;
 -                      case CPUCLOCK_PROF:
 -                              if (expires_le(sig->it[CPUCLOCK_PROF].expires,
 -                                             exp->cpu))
 -                                      break;
 -                              i = sig->rlim[RLIMIT_CPU].rlim_cur;
 -                              if (i != RLIM_INFINITY &&
 -                                  i <= cputime_to_secs(exp->cpu))
 -                                      break;
 -                              sig->cputime_expires.prof_exp = exp->cpu;
 -                              break;
 -                      case CPUCLOCK_SCHED:
 -                              sig->cputime_expires.sched_exp = exp->sched;
 -                              break;
 -                      }
 +              switch (CPUCLOCK_WHICH(timer->it_clock)) {
 +              case CPUCLOCK_PROF:
 +                      if (expires_gt(cputime_expires->prof_exp, exp->cpu))
 +                              cputime_expires->prof_exp = exp->cpu;
 +                      break;
 +              case CPUCLOCK_VIRT:
 +                      if (expires_gt(cputime_expires->virt_exp, exp->cpu))
 +                              cputime_expires->virt_exp = exp->cpu;
 +                      break;
 +              case CPUCLOCK_SCHED:
 +                      if (cputime_expires->sched_exp == 0 ||
 +                          cputime_expires->sched_exp > exp->sched)
 +                              cputime_expires->sched_exp = exp->sched;
 +                      break;
                }
        }
 -
 -      spin_unlock(&p->sighand->siglock);
  }
  
  /*
   */
  static void cpu_timer_fire(struct k_itimer *timer)
  {
 -      if (unlikely(timer->sigq == NULL)) {
 +      if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
 +              /*
 +               * The user doesn't want any signal.
 +               */
 +              timer->it.cpu.expires.sched = 0;
 +      } else if (unlikely(timer->sigq == NULL)) {
                /*
                 * This a special case for clock_nanosleep,
                 * not a normal timer from sys_timer_create.
@@@ -676,7 -721,7 +676,7 @@@ int posix_cpu_timer_set(struct k_itime
                        struct itimerspec *new, struct itimerspec *old)
  {
        struct task_struct *p = timer->it.cpu.task;
 -      union cpu_time_count old_expires, new_expires, val;
 +      union cpu_time_count old_expires, new_expires, old_incr, val;
        int ret;
  
        if (unlikely(p == NULL)) {
        BUG_ON(!irqs_disabled());
  
        ret = 0;
 +      old_incr = timer->it.cpu.incr;
        spin_lock(&p->sighand->siglock);
        old_expires = timer->it.cpu.expires;
        if (unlikely(timer->it.cpu.firing)) {
                ret = TIMER_RETRY;
        } else
                list_del_init(&timer->it.cpu.entry);
 -      spin_unlock(&p->sighand->siglock);
  
        /*
         * We need to sample the current value to convert the new
                 * disable this firing since we are already reporting
                 * it as an overrun (thanks to bump_cpu_timer above).
                 */
 +              spin_unlock(&p->sighand->siglock);
                read_unlock(&tasklist_lock);
                goto out;
        }
         */
        timer->it.cpu.expires = new_expires;
        if (new_expires.sched != 0 &&
 -          (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
            cpu_time_before(timer->it_clock, val, new_expires)) {
 -              arm_timer(timer, val);
 +              arm_timer(timer);
        }
  
 +      spin_unlock(&p->sighand->siglock);
        read_unlock(&tasklist_lock);
  
        /*
        timer->it_overrun = -1;
  
        if (new_expires.sched != 0 &&
 -          (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
            !cpu_time_before(timer->it_clock, val, new_expires)) {
                /*
                 * The designated time already passed, so we notify
   out:
        if (old) {
                sample_to_timespec(timer->it_clock,
 -                                 timer->it.cpu.incr, &old->it_interval);
 +                                 old_incr, &old->it_interval);
        }
        return ret;
  }
@@@ -882,6 -927,25 +882,6 @@@ void posix_cpu_timer_get(struct k_itime
                read_unlock(&tasklist_lock);
        }
  
 -      if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
 -              if (timer->it.cpu.incr.sched == 0 &&
 -                  cpu_time_before(timer->it_clock,
 -                                  timer->it.cpu.expires, now)) {
 -                      /*
 -                       * Do-nothing timer expired and has no reload,
 -                       * so it's as if it was never set.
 -                       */
 -                      timer->it.cpu.expires.sched = 0;
 -                      itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
 -                      return;
 -              }
 -              /*
 -               * Account for any expirations and reloads that should
 -               * have happened.
 -               */
 -              bump_cpu_timer(timer, now);
 -      }
 -
        if (unlikely(clear_dead)) {
                /*
                 * We've noticed that the thread is dead, but
@@@ -997,9 -1061,9 +997,9 @@@ static void check_thread_timers(struct 
        }
  }
  
- static void stop_process_timers(struct task_struct *tsk)
+ static void stop_process_timers(struct signal_struct *sig)
  {
-       struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+       struct thread_group_cputimer *cputimer = &sig->cputimer;
        unsigned long flags;
  
        if (!cputimer->running)
        spin_lock_irqsave(&cputimer->lock, flags);
        cputimer->running = 0;
        spin_unlock_irqrestore(&cputimer->lock, flags);
+       sig->cputime_expires.prof_exp = cputime_zero;
+       sig->cputime_expires.virt_exp = cputime_zero;
+       sig->cputime_expires.sched_exp = 0;
  }
  
  static u32 onecputick;
@@@ -1069,7 -1137,7 +1073,7 @@@ static void check_process_timers(struc
            list_empty(&timers[CPUCLOCK_VIRT]) &&
            cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
            list_empty(&timers[CPUCLOCK_SCHED])) {
-               stop_process_timers(tsk);
+               stop_process_timers(sig);
                return;
        }
  
@@@ -1202,7 -1270,6 +1206,7 @@@ void posix_cpu_timer_schedule(struct k_
                        goto out;
                }
                read_lock(&tasklist_lock); /* arm_timer needs it.  */
 +              spin_lock(&p->sighand->siglock);
        } else {
                read_lock(&tasklist_lock);
                if (unlikely(p->signal == NULL)) {
                        clear_dead_task(timer, now);
                        goto out_unlock;
                }
 +              spin_lock(&p->sighand->siglock);
                cpu_timer_sample_group(timer->it_clock, p, &now);
                bump_cpu_timer(timer, now);
                /* Leave the tasklist_lock locked for the call below.  */
        /*
         * Now re-arm for the new expiry time.
         */
 -      arm_timer(timer, now);
 +      BUG_ON(!irqs_disabled());
 +      arm_timer(timer);
 +      spin_unlock(&p->sighand->siglock);
  
  out_unlock:
        read_unlock(&tasklist_lock);
@@@ -1326,7 -1390,7 +1330,7 @@@ static inline int fastpath_timer_check(
                        return 1;
        }
  
 -      return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY;
 +      return 0;
  }
  
  /*
@@@ -1392,23 -1456,21 +1396,23 @@@ void run_posix_cpu_timers(struct task_s
  }
  
  /*
 - * Set one of the process-wide special case CPU timers.
 + * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
   * The tsk->sighand->siglock must be held by the caller.
 - * The *newval argument is relative and we update it to be absolute, *oldval
 - * is absolute and we update it to be relative.
   */
  void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
                           cputime_t *newval, cputime_t *oldval)
  {
        union cpu_time_count now;
 -      struct list_head *head;
  
        BUG_ON(clock_idx == CPUCLOCK_SCHED);
        cpu_timer_sample_group(clock_idx, tsk, &now);
  
        if (oldval) {
 +              /*
 +               * We are setting itimer. The *oldval is absolute and we update
 +               * it to be relative, *newval argument is relative and we update
 +               * it to be absolute.
 +               */
                if (!cputime_eq(*oldval, cputime_zero)) {
                        if (cputime_le(*oldval, now.cpu)) {
                                /* Just about to fire. */
                if (cputime_eq(*newval, cputime_zero))
                        return;
                *newval = cputime_add(*newval, now.cpu);
        }
  
        /*
 -       * Check whether there are any process timers already set to fire
 -       * before this one.  If so, we don't have anything more to do.
 +       * Update expiration cache if we are the earliest timer, or if the
 +       * RLIMIT_CPU limit is earlier than the prof_exp cpu timer expiry.
         */
 -      head = &tsk->signal->cpu_timers[clock_idx];
 -      if (list_empty(head) ||
 -          cputime_ge(list_first_entry(head,
 -                                struct cpu_timer_list, entry)->expires.cpu,
 -                     *newval)) {
 -              switch (clock_idx) {
 -              case CPUCLOCK_PROF:
 +      switch (clock_idx) {
 +      case CPUCLOCK_PROF:
 +              if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
                        tsk->signal->cputime_expires.prof_exp = *newval;
 -                      break;
 -              case CPUCLOCK_VIRT:
 +              break;
 +      case CPUCLOCK_VIRT:
 +              if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
                        tsk->signal->cputime_expires.virt_exp = *newval;
 -                      break;
 -              }
 +              break;
        }
  }
  
diff --combined kernel/time.c
@@@ -35,7 -35,6 +35,6 @@@
  #include <linux/syscalls.h>
  #include <linux/security.h>
  #include <linux/fs.h>
- #include <linux/slab.h>
  #include <linux/math64.h>
  #include <linux/ptrace.h>
  
@@@ -133,11 -132,12 +132,11 @@@ SYSCALL_DEFINE2(gettimeofday, struct ti
   */
  static inline void warp_clock(void)
  {
 -      write_seqlock_irq(&xtime_lock);
 -      wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
 -      xtime.tv_sec += sys_tz.tz_minuteswest * 60;
 -      update_xtime_cache(0);
 -      write_sequnlock_irq(&xtime_lock);
 -      clock_was_set();
 +      struct timespec delta, adjust;
 +      delta.tv_sec = sys_tz.tz_minuteswest * 60;
 +      delta.tv_nsec = 0;
 +      adjust = timespec_add_safe(current_kernel_time(), delta);
 +      do_settimeofday(&adjust);
  }
  
  /*
@@@ -165,6 -165,13 +165,6 @@@ struct timespec raw_time
  /* flag for if timekeeping is suspended */
  int __read_mostly timekeeping_suspended;
  
 -static struct timespec xtime_cache __attribute__ ((aligned (16)));
 -void update_xtime_cache(u64 nsec)
 -{
 -      xtime_cache = xtime;
 -      timespec_add_ns(&xtime_cache, nsec);
 -}
 -
  /* must hold xtime_lock */
  void timekeeping_leap_insert(int leapsecond)
  {
@@@ -325,6 -332,8 +325,6 @@@ int do_settimeofday(struct timespec *tv
  
        xtime = *tv;
  
 -      update_xtime_cache(0);
 -
        timekeeper.ntp_error = 0;
        ntp_clear();
  
@@@ -550,6 -559,7 +550,6 @@@ void __init timekeeping_init(void
        }
        set_normalized_timespec(&wall_to_monotonic,
                                -boot.tv_sec, -boot.tv_nsec);
 -      update_xtime_cache(0);
        total_sleep_time.tv_sec = 0;
        total_sleep_time.tv_nsec = 0;
        write_sequnlock_irqrestore(&xtime_lock, flags);
@@@ -583,6 -593,7 +583,6 @@@ static int timekeeping_resume(struct sy
                wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
                total_sleep_time = timespec_add_safe(total_sleep_time, ts);
        }
 -      update_xtime_cache(0);
        /* re-base the last cycle value */
        timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
        timekeeper.ntp_error = 0;
@@@ -777,6 -788,7 +777,6 @@@ void update_wall_time(void
  {
        struct clocksource *clock;
        cycle_t offset;
 -      u64 nsecs;
        int shift = 0, maxshift;
  
        /* Make sure we're fully resumed: */
        shift = min(shift, maxshift);
        while (offset >= timekeeper.cycle_interval) {
                offset = logarithmic_accumulation(offset, shift);
-               shift--;
+               if(offset < timekeeper.cycle_interval<<shift)
+                       shift--;
        }
  
        /* correct the clock when NTP error is too big */
                timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
        }
  
 -      /* store full nanoseconds into xtime after rounding it up and
 +
 +      /*
 +       * Store full nanoseconds into xtime after rounding it up and
         * add the remainder to the error difference.
         */
        xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
        timekeeper.ntp_error += timekeeper.xtime_nsec <<
                                timekeeper.ntp_error_shift;
  
 -      nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
 -      update_xtime_cache(nsecs);
 +      /*
 +       * Finally, make sure that after the rounding
 +       * xtime.tv_nsec isn't larger than NSEC_PER_SEC
 +       */
 +      if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
 +              xtime.tv_nsec -= NSEC_PER_SEC;
 +              xtime.tv_sec++;
 +              second_overflow();
 +      }
  
        /* check to see if there is a new clocksource to use */
        update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
@@@ -892,13 -896,13 +893,13 @@@ EXPORT_SYMBOL_GPL(monotonic_to_bootbase
  
  unsigned long get_seconds(void)
  {
 -      return xtime_cache.tv_sec;
 +      return xtime.tv_sec;
  }
  EXPORT_SYMBOL(get_seconds);
  
  struct timespec __current_kernel_time(void)
  {
 -      return xtime_cache;
 +      return xtime;
  }
  
  struct timespec current_kernel_time(void)
        do {
                seq = read_seqbegin(&xtime_lock);
  
 -              now = xtime_cache;
 +              now = xtime;
        } while (read_seqretry(&xtime_lock, seq));
  
        return now;
@@@ -924,7 -928,7 +925,7 @@@ struct timespec get_monotonic_coarse(vo
        do {
                seq = read_seqbegin(&xtime_lock);
  
 -              now = xtime_cache;
 +              now = xtime;
                mono = wall_to_monotonic;
        } while (read_seqretry(&xtime_lock, seq));
  
diff --combined kernel/timer.c
@@@ -39,6 -39,7 +39,7 @@@
  #include <linux/kallsyms.h>
  #include <linux/perf_event.h>
  #include <linux/sched.h>
+ #include <linux/slab.h>
  
  #include <asm/uaccess.h>
  #include <asm/unistd.h>
@@@ -318,24 -319,6 +319,24 @@@ unsigned long round_jiffies_up_relative
  }
  EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
  
 +/**
 + * set_timer_slack - set the allowed slack for a timer
 + * @slack_hz: the amount of time (in jiffies) allowed for rounding
 + *
 + * Set the amount of time, in jiffies, that a certain timer has
 + * in terms of slack. By setting this value, the timer subsystem
 + * will schedule the actual timer somewhere between
 + * the time mod_timer() asks for, and that time plus the slack.
 + *
 + * By setting the slack to -1, a percentage of the delay is used
 + * instead.
 + */
 +void set_timer_slack(struct timer_list *timer, int slack_hz)
 +{
 +      timer->slack = slack_hz;
 +}
 +EXPORT_SYMBOL_GPL(set_timer_slack);
 +
  
  static inline void set_running_timer(struct tvec_base *base,
                                        struct timer_list *timer)
@@@ -567,7 -550,6 +568,7 @@@ static void __init_timer(struct timer_l
  {
        timer->entry.next = NULL;
        timer->base = __raw_get_cpu_var(tvec_bases);
 +      timer->slack = -1;
  #ifdef CONFIG_TIMER_STATS
        timer->start_site = NULL;
        timer->start_pid = -1;
@@@ -733,41 -715,6 +734,41 @@@ int mod_timer_pending(struct timer_lis
  }
  EXPORT_SYMBOL(mod_timer_pending);
  
 +/*
 + * Decide where to put the timer while taking the slack into account
 + *
 + * Algorithm:
 + *   1) calculate the maximum (absolute) time
 + *   2) calculate the highest bit where the expires and new max are different
 + *   3) use this bit to make a mask
 + *   4) use the bitmask to round down the maximum time, so that all last
 + *      bits are zeros
 + */
 +static inline
 +unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
 +{
 +      unsigned long expires_limit, mask;
 +      int bit;
 +
 +      expires_limit = expires + timer->slack;
 +
 +      if (timer->slack < 0) /* auto slack: use 0.4% */
 +              expires_limit = expires + (expires - jiffies)/256;
 +
 +      mask = expires ^ expires_limit;
 +
 +      if (mask == 0)
 +              return expires;
 +
 +      bit = find_last_bit(&mask, BITS_PER_LONG);
 +
 +      mask = (1 << bit) - 1;
 +
 +      expires_limit = expires_limit & ~(mask);
 +
 +      return expires_limit;
 +}
 +
  /**
   * mod_timer - modify a timer's timeout
   * @timer: the timer to be modified
@@@ -798,8 -745,6 +799,8 @@@ int mod_timer(struct timer_list *timer
        if (timer_pending(timer) && timer->expires == expires)
                return 1;
  
 +      expires = apply_slack(timer, expires);
 +
        return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
  }
  EXPORT_SYMBOL(mod_timer);
@@@ -936,6 -881,7 +937,7 @@@ int try_to_del_timer_sync(struct timer_
        if (base->running_timer == timer)
                goto out;
  
+       timer_stats_timer_clear_start_info(timer);
        ret = 0;
        if (timer_pending(timer)) {
                detach_timer(timer, 1);
@@@ -1009,47 -955,6 +1011,47 @@@ static int cascade(struct tvec_base *ba
        return index;
  }
  
 +static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 +                        unsigned long data)
 +{
 +      int preempt_count = preempt_count();
 +
 +#ifdef CONFIG_LOCKDEP
 +      /*
 +       * It is permissible to free the timer from inside the
 +       * function that is called from it, this we need to take into
 +       * account for lockdep too. To avoid bogus "held lock freed"
 +       * warnings as well as problems when looking into
 +       * timer->lockdep_map, make a copy and use that here.
 +       */
 +      struct lockdep_map lockdep_map = timer->lockdep_map;
 +#endif
 +      /*
 +       * Couple the lock chain with the lock chain at
 +       * del_timer_sync() by acquiring the lock_map around the fn()
 +       * call here and in del_timer_sync().
 +       */
 +      lock_map_acquire(&lockdep_map);
 +
 +      trace_timer_expire_entry(timer);
 +      fn(data);
 +      trace_timer_expire_exit(timer);
 +
 +      lock_map_release(&lockdep_map);
 +
 +      if (preempt_count != preempt_count()) {
 +              WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
 +                        fn, preempt_count, preempt_count());
 +              /*
 +               * Restore the preempt count. That gives us a decent
 +               * chance to survive and extract information. If the
 +               * callback kept a lock held, bad luck, but not worse
 +               * than the BUG() we had.
 +               */
 +              preempt_count() = preempt_count;
 +      }
 +}
 +
  #define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
  
  /**
@@@ -1093,7 -998,45 +1095,7 @@@ static inline void __run_timers(struct 
                        detach_timer(timer, 1);
  
                        spin_unlock_irq(&base->lock);
 -                      {
 -                              int preempt_count = preempt_count();
 -
 -#ifdef CONFIG_LOCKDEP
 -                              /*
 -                               * It is permissible to free the timer from
 -                               * inside the function that is called from
 -                               * it, this we need to take into account for
 -                               * lockdep too. To avoid bogus "held lock
 -                               * freed" warnings as well as problems when
 -                               * looking into timer->lockdep_map, make a
 -                               * copy and use that here.
 -                               */
 -                              struct lockdep_map lockdep_map =
 -                                      timer->lockdep_map;
 -#endif
 -                              /*
 -                               * Couple the lock chain with the lock chain at
 -                               * del_timer_sync() by acquiring the lock_map
 -                               * around the fn() call here and in
 -                               * del_timer_sync().
 -                               */
 -                              lock_map_acquire(&lockdep_map);
 -
 -                              trace_timer_expire_entry(timer);
 -                              fn(data);
 -                              trace_timer_expire_exit(timer);
 -
 -                              lock_map_release(&lockdep_map);
 -
 -                              if (preempt_count != preempt_count()) {
 -                                      printk(KERN_ERR "huh, entered %p "
 -                                             "with preempt_count %08x, exited"
 -                                             " with %08x?\n",
 -                                             fn, preempt_count,
 -                                             preempt_count());
 -                                      BUG();
 -                              }
 -                      }
 +                      call_timer_fn(timer, fn, data);
                        spin_lock_irq(&base->lock);
                }
        }