posix-timers: RCU conversion
Eric Dumazet [Tue, 24 May 2011 09:12:58 +0000 (11:12 +0200)]
Ben Nagy reported a scalability problem with KVM/QEMU that hit very hard
a single spinlock (idr_lock) in posix-timers code, on its 48 core
machine.

Even on a 16 cpu machine (2x4x2), a single test can show 98% of cpu time
used in ticket_spin_lock, from lock_timer

Ref: http://www.spinics.net/lists/kvm/msg51526.html

Switching to RCU is quite easy, IDR being already RCU ready. idr_lock
should be locked only for an insert/delete, not a lookup.

Benchmark on a 2x4x2 machine, 16 processes calling timer_gettime().

Before :

real    1m18.669s
user    0m1.346s
sys     1m17.180s

After :

real    0m3.296s
user    0m1.366s
sys     0m1.926s

Reported-by: Ben Nagy <ben@iagu.net>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Tested-by: Ben Nagy <ben@iagu.net>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: John Stultz <johnstul@us.ibm.com>
Cc: Richard Cochran <richard.cochran@omicron.at>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

include/linux/posix-timers.h
kernel/posix-timers.c

index 808227d..959c141 100644 (file)
@@ -82,6 +82,7 @@ struct k_itimer {
                        unsigned long expires;
                } mmtimer;
                struct alarm alarmtimer;
+               struct rcu_head rcu;
        } it;
 };
 
index a1b5edf..4556182 100644 (file)
@@ -491,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void)
        return tmr;
 }
 
+static void k_itimer_rcu_free(struct rcu_head *head)
+{
+       struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
+
+       kmem_cache_free(posix_timers_cache, tmr);
+}
+
 #define IT_ID_SET      1
 #define IT_ID_NOT_SET  0
 static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
@@ -503,7 +510,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
        }
        put_pid(tmr->it_pid);
        sigqueue_free(tmr->sigq);
-       kmem_cache_free(posix_timers_cache, tmr);
+       call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
 }
 
 static struct k_clock *clockid_to_kclock(const clockid_t id)
@@ -631,22 +638,18 @@ out:
 static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
 {
        struct k_itimer *timr;
-       /*
-        * Watch out here.  We do a irqsave on the idr_lock and pass the
-        * flags part over to the timer lock.  Must not let interrupts in
-        * while we are moving the lock.
-        */
-       spin_lock_irqsave(&idr_lock, *flags);
+
+       rcu_read_lock();
        timr = idr_find(&posix_timers_id, (int)timer_id);
        if (timr) {
-               spin_lock(&timr->it_lock);
+               spin_lock_irqsave(&timr->it_lock, *flags);
                if (timr->it_signal == current->signal) {
-                       spin_unlock(&idr_lock);
+                       rcu_read_unlock();
                        return timr;
                }
-               spin_unlock(&timr->it_lock);
+               spin_unlock_irqrestore(&timr->it_lock, *flags);
        }
-       spin_unlock_irqrestore(&idr_lock, *flags);
+       rcu_read_unlock();
 
        return NULL;
 }