timerfd: Manage cancelable timers in timerfd
Thomas Gleixner [Fri, 20 May 2011 14:18:50 +0000 (16:18 +0200)]
Peter is concerned about the extra scan of CLOCK_REALTIME_COS in the
timer interrupt. Yes, I did not think about it, because the solution
was so elegant. I didn't like the extra list in timerfd when it was
proposed some time ago, but with a rcu based list the list walk it's
less horrible than the original global lock, which was held over the
list iteration.

Requested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>

fs/timerfd.c
include/linux/hrtimer.h
include/linux/time.h
include/linux/timerfd.h
kernel/hrtimer.c

index 7e14c9e..f67acbd 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/timerfd.h>
 #include <linux/syscalls.h>
+#include <linux/rcupdate.h>
 
 struct timerfd_ctx {
        struct hrtimer tmr;
@@ -31,9 +32,14 @@ struct timerfd_ctx {
        u64 ticks;
        int expired;
        int clockid;
+       struct rcu_head rcu;
+       struct list_head clist;
        bool might_cancel;
 };
 
+static LIST_HEAD(cancel_list);
+static DEFINE_SPINLOCK(cancel_lock);
+
 /*
  * This gets called when the timer event triggers. We set the "expired"
  * flag, but we do not re-arm the timer (in case it's necessary,
@@ -53,28 +59,69 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
        return HRTIMER_NORESTART;
 }
 
-static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
+/*
+ * Called when the clock was set to cancel the timers in the cancel
+ * list.
+ */
+void timerfd_clock_was_set(void)
 {
-       ktime_t remaining;
+       ktime_t moffs = ktime_get_monotonic_offset();
+       struct timerfd_ctx *ctx;
+       unsigned long flags;
 
-       remaining = hrtimer_expires_remaining(&ctx->tmr);
-       return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
+       rcu_read_lock();
+       list_for_each_entry_rcu(ctx, &cancel_list, clist) {
+               if (!ctx->might_cancel)
+                       continue;
+               spin_lock_irqsave(&ctx->wqh.lock, flags);
+               if (ctx->moffs.tv64 != moffs.tv64) {
+                       ctx->moffs.tv64 = KTIME_MAX;
+                       wake_up_locked(&ctx->wqh);
+               }
+               spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+       }
+       rcu_read_unlock();
 }
 
-static bool timerfd_canceled(struct timerfd_ctx *ctx)
+static void timerfd_remove_cancel(struct timerfd_ctx *ctx)
 {
-       ktime_t moffs;
+       if (ctx->might_cancel) {
+               ctx->might_cancel = false;
+               spin_lock(&cancel_lock);
+               list_del_rcu(&ctx->clist);
+               spin_unlock(&cancel_lock);
+       }
+}
 
-       if (!ctx->might_cancel)
+static bool timerfd_canceled(struct timerfd_ctx *ctx)
+{
+       if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX)
                return false;
+       ctx->moffs = ktime_get_monotonic_offset();
+       return true;
+}
 
-       moffs = ktime_get_monotonic_offset();
+static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
+{
+       if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) &&
+           (flags & TFD_TIMER_CANCEL_ON_SET)) {
+               if (!ctx->might_cancel) {
+                       ctx->might_cancel = true;
+                       spin_lock(&cancel_lock);
+                       list_add_rcu(&ctx->clist, &cancel_list);
+                       spin_unlock(&cancel_lock);
+               }
+       } else if (ctx->might_cancel) {
+               timerfd_remove_cancel(ctx);
+       }
+}
 
-       if (moffs.tv64 == ctx->moffs.tv64)
-               return false;
+static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
+{
+       ktime_t remaining;
 
-       ctx->moffs = moffs;
-       return true;
+       remaining = hrtimer_expires_remaining(&ctx->tmr);
+       return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
 }
 
 static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
@@ -87,13 +134,6 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
        htmode = (flags & TFD_TIMER_ABSTIME) ?
                HRTIMER_MODE_ABS: HRTIMER_MODE_REL;
 
-       ctx->might_cancel = false;
-       if (htmode == HRTIMER_MODE_ABS && ctx->clockid == CLOCK_REALTIME &&
-           (flags & TFD_TIMER_CANCELON_SET)) {
-               clockid = CLOCK_REALTIME_COS;
-               ctx->might_cancel = true;
-       }
-
        texp = timespec_to_ktime(ktmr->it_value);
        ctx->expired = 0;
        ctx->ticks = 0;
@@ -113,8 +153,9 @@ static int timerfd_release(struct inode *inode, struct file *file)
 {
        struct timerfd_ctx *ctx = file->private_data;
 
+       timerfd_remove_cancel(ctx);
        hrtimer_cancel(&ctx->tmr);
-       kfree(ctx);
+       kfree_rcu(ctx, rcu);
        return 0;
 }
 
@@ -149,20 +190,20 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
        else
                res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
 
+       /*
+        * If clock has changed, we do not care about the
+        * ticks and we do not rearm the timer. Userspace must
+        * reevaluate anyway.
+        */
+       if (timerfd_canceled(ctx)) {
+               ctx->ticks = 0;
+               ctx->expired = 0;
+               res = -ECANCELED;
+       }
+
        if (ctx->ticks) {
                ticks = ctx->ticks;
 
-               /*
-                * If clock has changed, we do not care about the
-                * ticks and we do not rearm the timer. Userspace must
-                * reevaluate anyway.
-                */
-               if (timerfd_canceled(ctx)) {
-                       ticks = 0;
-                       ctx->expired = 0;
-                       res = -ECANCELED;
-               }
-
                if (ctx->expired && ctx->tintv.tv64) {
                        /*
                         * If tintv.tv64 != 0, this is a periodic timer that
@@ -258,6 +299,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
                return PTR_ERR(file);
        ctx = file->private_data;
 
+       timerfd_setup_cancel(ctx, flags);
+
        /*
         * We need to stop the existing timer before reprogramming
         * it to the new values.
index eda4ccd..925c8c0 100644 (file)
@@ -155,7 +155,6 @@ enum  hrtimer_base_type {
        HRTIMER_BASE_REALTIME,
        HRTIMER_BASE_MONOTONIC,
        HRTIMER_BASE_BOOTTIME,
-       HRTIMER_BASE_REALTIME_COS,
        HRTIMER_MAX_CLOCK_BASES,
 };
 
@@ -306,6 +305,11 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 #endif
 
 extern void clock_was_set(void);
+#ifdef CONFIG_TIMERFD
+extern void timerfd_clock_was_set(void);
+#else
+static inline void timerfd_clock_was_set(void) { }
+#endif
 extern void hrtimers_resume(void);
 
 extern ktime_t ktime_get(void);
index a924277..b306178 100644 (file)
@@ -302,12 +302,6 @@ struct itimerval {
  * The IDs of various hardware clocks:
  */
 #define CLOCK_SGI_CYCLE                        10
-
-#ifdef __KERNEL__
-/* This clock is not exposed to user space */
-#define CLOCK_REALTIME_COS             15
-#endif
-
 #define MAX_CLOCKS                     16
 #define CLOCKS_MASK                    (CLOCK_REALTIME | CLOCK_MONOTONIC)
 #define CLOCKS_MONO                    CLOCK_MONOTONIC
index e9571fc..d3b57fa 100644 (file)
@@ -19,7 +19,7 @@
  * shared O_* flags.
  */
 #define TFD_TIMER_ABSTIME (1 << 0)
-#define TFD_TIMER_CANCELON_SET (1 << 1)
+#define TFD_TIMER_CANCEL_ON_SET (1 << 1)
 #define TFD_CLOEXEC O_CLOEXEC
 #define TFD_NONBLOCK O_NONBLOCK
 
@@ -27,6 +27,6 @@
 /* Flags for timerfd_create.  */
 #define TFD_CREATE_FLAGS TFD_SHARED_FCNTL_FLAGS
 /* Flags for timerfd_settime.  */
-#define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCELON_SET)
+#define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET)
 
 #endif /* _LINUX_TIMERFD_H */
index eabcbd7..26dd32f 100644 (file)
@@ -78,11 +78,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
                        .get_time = &ktime_get_boottime,
                        .resolution = KTIME_LOW_RES,
                },
-               {
-                       .index = CLOCK_REALTIME_COS,
-                       .get_time = &ktime_get_real,
-                       .resolution = KTIME_LOW_RES,
-               },
        }
 };
 
@@ -90,7 +85,6 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
        [CLOCK_REALTIME]        = HRTIMER_BASE_REALTIME,
        [CLOCK_MONOTONIC]       = HRTIMER_BASE_MONOTONIC,
        [CLOCK_BOOTTIME]        = HRTIMER_BASE_BOOTTIME,
-       [CLOCK_REALTIME_COS]    = HRTIMER_BASE_REALTIME_COS,
 };
 
 static inline int hrtimer_clockid_to_base(clockid_t clock_id)
@@ -116,7 +110,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
        base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
        base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
        base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
-       base->clock_base[HRTIMER_BASE_REALTIME_COS].softirq_time = xtim;
 }
 
 /*
@@ -486,8 +479,6 @@ static inline void debug_deactivate(struct hrtimer *timer)
        trace_hrtimer_cancel(timer);
 }
 
-static void hrtimer_expire_cancelable(struct hrtimer_cpu_base *cpu_base);
-
 /* High resolution timer related functions */
 #ifdef CONFIG_HIGH_RES_TIMERS
 
@@ -663,7 +654,33 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
        return 0;
 }
 
-static void retrigger_next_event(void *arg);
+/*
+ * Retrigger next event is called after clock was set
+ *
+ * Called with interrupts disabled via on_each_cpu()
+ */
+static void retrigger_next_event(void *arg)
+{
+       struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
+       struct timespec realtime_offset, xtim, wtm, sleep;
+
+       if (!hrtimer_hres_active())
+               return;
+
+       /* Optimized out for !HIGH_RES */
+       get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
+       set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
+
+       /* Adjust CLOCK_REALTIME offset */
+       raw_spin_lock(&base->lock);
+       base->clock_base[HRTIMER_BASE_REALTIME].offset =
+               timespec_to_ktime(realtime_offset);
+       base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
+               timespec_to_ktime(sleep);
+
+       hrtimer_force_reprogram(base, 0);
+       raw_spin_unlock(&base->lock);
+}
 
 /*
  * Switch to high resolution mode
@@ -711,46 +728,11 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
        return 0;
 }
 static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
+static inline void retrigger_next_event(void *arg) { }
 
 #endif /* CONFIG_HIGH_RES_TIMERS */
 
 /*
- * Retrigger next event is called after clock was set
- *
- * Called with interrupts disabled via on_each_cpu()
- */
-static void retrigger_next_event(void *arg)
-{
-       struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
-       struct timespec realtime_offset, xtim, wtm, sleep;
-
-       if (!hrtimer_hres_active()) {
-               raw_spin_lock(&base->lock);
-               hrtimer_expire_cancelable(base);
-               raw_spin_unlock(&base->lock);
-               return;
-       }
-
-       /* Optimized out for !HIGH_RES */
-       get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
-       set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
-
-       /* Adjust CLOCK_REALTIME offset */
-       raw_spin_lock(&base->lock);
-       base->clock_base[HRTIMER_BASE_REALTIME].offset =
-               timespec_to_ktime(realtime_offset);
-       base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
-               timespec_to_ktime(sleep);
-       base->clock_base[HRTIMER_BASE_REALTIME_COS].offset =
-               timespec_to_ktime(realtime_offset);
-
-       hrtimer_expire_cancelable(base);
-
-       hrtimer_force_reprogram(base, 0);
-       raw_spin_unlock(&base->lock);
-}
-
-/*
  * Clock realtime was set
  *
  * Change the offset of the realtime clock vs. the monotonic
@@ -763,8 +745,11 @@ static void retrigger_next_event(void *arg)
  */
 void clock_was_set(void)
 {
+#ifdef CONFIG_HIGHRES_TIMERS
        /* Retrigger the CPU local events everywhere */
        on_each_cpu(retrigger_next_event, NULL, 1);
+#endif
+       timerfd_clock_was_set();
 }
 
 /*
@@ -777,6 +762,7 @@ void hrtimers_resume(void)
                  KERN_INFO "hrtimers_resume() called with IRQs enabled!");
 
        retrigger_next_event(NULL);
+       timerfd_clock_was_set();
 }
 
 static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
@@ -1240,22 +1226,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
        timer->state &= ~HRTIMER_STATE_CALLBACK;
 }
 
-static void hrtimer_expire_cancelable(struct hrtimer_cpu_base *cpu_base)
-{
-       struct timerqueue_node *node;
-       struct hrtimer_clock_base *base;
-       ktime_t now = ktime_get_real();
-
-       base = &cpu_base->clock_base[HRTIMER_BASE_REALTIME_COS];
-
-       while ((node = timerqueue_getnext(&base->active))) {
-                       struct hrtimer *timer;
-
-                       timer = container_of(node, struct hrtimer, node);
-                       __run_hrtimer(timer, &now);
-       }
-}
-
 #ifdef CONFIG_HIGH_RES_TIMERS
 
 /*