xen: implement Xen-specific spinlocks
Jeremy Fitzhardinge [Mon, 7 Jul 2008 19:07:53 +0000 (12:07 -0700)]
The standard ticket spinlocks are very expensive in a virtual
environment, because their performance depends on Xen's scheduler
giving vcpus time in the order that they're supposed to take the
spinlock.

This implements a Xen-specific spinlock, which should be much more
efficient.

The fast-path is essentially the old Linux-x86 locks, using a single
lock byte.  The locker decrements the byte; if the result is 0, then
they have the lock.  If the lock is negative, then locker must spin
until the lock is positive again.

When there's contention, the locker spin for 2^16[*] iterations waiting
to get the lock.  If it fails to get the lock in that time, it adds
itself to the contention count in the lock and blocks on a per-cpu
event channel.

When unlocking the spinlock, the locker looks to see if there's anyone
blocked waiting for the lock by checking for a non-zero waiter count.
If there's a waiter, it traverses the per-cpu "lock_spinners"
variable, which contains which lock each CPU is waiting on.  It picks
one CPU waiting on the lock and sends it an event to wake it up.

This allows efficient fast-path spinlock operation, while allowing
spinning vcpus to give up their processor time while waiting for a
contended lock.

[*] 2^16 iterations is threshold at which 98% locks have been taken
according to Thomas Friebel's Xen Summit talk "Preventing Guests from
Spinning Around".  Therefore, we'd expect the lock and unlock slow
paths will only be entered 2% of the time.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <clameter@linux-foundation.org>
Cc: Petr Tesarik <ptesarik@suse.cz>
Cc: Virtualization <virtualization@lists.linux-foundation.org>
Cc: Xen devel <xen-devel@lists.xensource.com>
Cc: Thomas Friebel <thomas.friebel@amd.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

arch/x86/xen/smp.c
drivers/xen/events.c
include/asm-x86/xen/events.h
include/xen/events.h

index a8ebafc..e693812 100644 (file)
@@ -15,6 +15,7 @@
  * This does not handle HOTPLUG_CPU yet.
  */
 #include <linux/sched.h>
+#include <linux/kernel_stat.h>
 #include <linux/err.h>
 #include <linux/smp.h>
 
@@ -35,6 +36,8 @@
 #include "xen-ops.h"
 #include "mmu.h"
 
+static void __cpuinit xen_init_lock_cpu(int cpu);
+
 cpumask_t xen_cpu_initialized_map;
 
 static DEFINE_PER_CPU(int, resched_irq);
@@ -179,6 +182,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
 {
        unsigned cpu;
 
+       xen_init_lock_cpu(0);
+
        smp_store_cpu_info(0);
        cpu_data(0).x86_max_cores = 1;
        set_cpu_sibling_map(0);
@@ -301,6 +306,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
        clear_tsk_thread_flag(idle, TIF_FORK);
 #endif
        xen_setup_timer(cpu);
+       xen_init_lock_cpu(cpu);
 
        per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 
@@ -413,6 +419,170 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
 
+struct xen_spinlock {
+       unsigned char lock;             /* 0 -> free; 1 -> locked */
+       unsigned short spinners;        /* count of waiting cpus */
+};
+
+static int xen_spin_is_locked(struct raw_spinlock *lock)
+{
+       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+       return xl->lock != 0;
+}
+
+static int xen_spin_is_contended(struct raw_spinlock *lock)
+{
+       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+       /* Not strictly true; this is only the count of contended
+          lock-takers entering the slow path. */
+       return xl->spinners != 0;
+}
+
+static int xen_spin_trylock(struct raw_spinlock *lock)
+{
+       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+       u8 old = 1;
+
+       asm("xchgb %b0,%1"
+           : "+q" (old), "+m" (xl->lock) : : "memory");
+
+       return old == 0;
+}
+
+static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
+static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
+
+static inline void spinning_lock(struct xen_spinlock *xl)
+{
+       __get_cpu_var(lock_spinners) = xl;
+       wmb();                  /* set lock of interest before count */
+       asm(LOCK_PREFIX " incw %0"
+           : "+m" (xl->spinners) : : "memory");
+}
+
+static inline void unspinning_lock(struct xen_spinlock *xl)
+{
+       asm(LOCK_PREFIX " decw %0"
+           : "+m" (xl->spinners) : : "memory");
+       wmb();                  /* decrement count before clearing lock */
+       __get_cpu_var(lock_spinners) = NULL;
+}
+
+static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
+{
+       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+       int irq = __get_cpu_var(lock_kicker_irq);
+       int ret;
+
+       /* If kicker interrupts not initialized yet, just spin */
+       if (irq == -1)
+               return 0;
+
+       /* announce we're spinning */
+       spinning_lock(xl);
+
+       /* clear pending */
+       xen_clear_irq_pending(irq);
+
+       /* check again make sure it didn't become free while
+          we weren't looking  */
+       ret = xen_spin_trylock(lock);
+       if (ret)
+               goto out;
+
+       /* block until irq becomes pending */
+       xen_poll_irq(irq);
+       kstat_this_cpu.irqs[irq]++;
+
+out:
+       unspinning_lock(xl);
+       return ret;
+}
+
+static void xen_spin_lock(struct raw_spinlock *lock)
+{
+       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+       int timeout;
+       u8 oldval;
+
+       do {
+               timeout = 1 << 10;
+
+               asm("1: xchgb %1,%0\n"
+                   "   testb %1,%1\n"
+                   "   jz 3f\n"
+                   "2: rep;nop\n"
+                   "   cmpb $0,%0\n"
+                   "   je 1b\n"
+                   "   dec %2\n"
+                   "   jnz 2b\n"
+                   "3:\n"
+                   : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
+                   : "1" (1)
+                   : "memory");
+
+       } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
+}
+
+static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
+{
+       int cpu;
+
+       for_each_online_cpu(cpu) {
+               /* XXX should mix up next cpu selection */
+               if (per_cpu(lock_spinners, cpu) == xl) {
+                       xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+                       break;
+               }
+       }
+}
+
+static void xen_spin_unlock(struct raw_spinlock *lock)
+{
+       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+       smp_wmb();              /* make sure no writes get moved after unlock */
+       xl->lock = 0;           /* release lock */
+
+       /* make sure unlock happens before kick */
+       barrier();
+
+       if (unlikely(xl->spinners))
+               xen_spin_unlock_slow(xl);
+}
+
+static __cpuinit void xen_init_lock_cpu(int cpu)
+{
+       int irq;
+       const char *name;
+
+       name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
+       irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
+                                    cpu,
+                                    xen_reschedule_interrupt,
+                                    IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+                                    name,
+                                    NULL);
+
+       if (irq >= 0) {
+               disable_irq(irq); /* make sure it's never delivered */
+               per_cpu(lock_kicker_irq, cpu) = irq;
+       }
+
+       printk("cpu %d spinlock event irq %d\n", cpu, irq);
+}
+
+static void __init xen_init_spinlocks(void)
+{
+       pv_lock_ops.spin_is_locked = xen_spin_is_locked;
+       pv_lock_ops.spin_is_contended = xen_spin_is_contended;
+       pv_lock_ops.spin_lock = xen_spin_lock;
+       pv_lock_ops.spin_trylock = xen_spin_trylock;
+       pv_lock_ops.spin_unlock = xen_spin_unlock;
+}
+
 static const struct smp_ops xen_smp_ops __initdata = {
        .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
        .smp_prepare_cpus = xen_smp_prepare_cpus,
@@ -430,5 +600,5 @@ void __init xen_smp_init(void)
 {
        smp_ops = xen_smp_ops;
        xen_fill_possible_map();
-       paravirt_use_bytelocks();
+       xen_init_spinlocks();
 }
index 332dd63..0e0c285 100644 (file)
@@ -734,6 +734,33 @@ static void restore_cpu_ipis(unsigned int cpu)
        }
 }
 
+/* Clear an irq's pending state, in preparation for polling on it */
+void xen_clear_irq_pending(int irq)
+{
+       int evtchn = evtchn_from_irq(irq);
+
+       if (VALID_EVTCHN(evtchn))
+               clear_evtchn(evtchn);
+}
+
+/* Poll waiting for an irq to become pending.  In the usual case, the
+   irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq(int irq)
+{
+       evtchn_port_t evtchn = evtchn_from_irq(irq);
+
+       if (VALID_EVTCHN(evtchn)) {
+               struct sched_poll poll;
+
+               poll.nr_ports = 1;
+               poll.timeout = 0;
+               poll.ports = &evtchn;
+
+               if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
+                       BUG();
+       }
+}
+
 void xen_irq_resume(void)
 {
        unsigned int cpu, irq, evtchn;
index f8d57ea..8ded747 100644 (file)
@@ -5,6 +5,7 @@ enum ipi_vector {
        XEN_RESCHEDULE_VECTOR,
        XEN_CALL_FUNCTION_VECTOR,
        XEN_CALL_FUNCTION_SINGLE_VECTOR,
+       XEN_SPIN_UNLOCK_VECTOR,
 
        XEN_NR_IPIS,
 };
index 67c4436..4680ff3 100644 (file)
@@ -44,4 +44,11 @@ extern void notify_remote_via_irq(int irq);
 
 extern void xen_irq_resume(void);
 
+/* Clear an irq's pending state, in preparation for polling on it */
+void xen_clear_irq_pending(int irq);
+
+/* Poll waiting for an irq to become pending.  In the usual case, the
+   irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq(int irq);
+
 #endif /* _XEN_EVENTS_H */