mutex: speed up generic mutex implementations
Nick Piggin [Tue, 21 Oct 2008 08:59:15 +0000 (10:59 +0200)]
- atomic operations which both modify the variable and return something imply
  full smp memory barriers before and after the memory operations involved
  (failing atomic_cmpxchg, atomic_add_unless, etc don't imply a barrier because
  they don't modify the target). See Documentation/atomic_ops.txt.
  So remove extra barriers and branches.

- All architectures support atomic_cmpxchg. This has no relation to
  __HAVE_ARCH_CMPXCHG. We can just take the atomic_cmpxchg path unconditionally

This reduces a simple single threaded fastpath lock+unlock test from 590 cycles
to 203 cycles on a ppc970 system.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

include/asm-generic/mutex-dec.h
include/asm-generic/mutex-xchg.h

index ed108be..f104af7 100644 (file)
@@ -22,8 +22,6 @@ __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
 {
        if (unlikely(atomic_dec_return(count) < 0))
                fail_fn(count);
-       else
-               smp_mb();
 }
 
 /**
@@ -41,10 +39,7 @@ __mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *))
 {
        if (unlikely(atomic_dec_return(count) < 0))
                return fail_fn(count);
-       else {
-               smp_mb();
-               return 0;
-       }
+       return 0;
 }
 
 /**
@@ -63,7 +58,6 @@ __mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *))
 static inline void
 __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 {
-       smp_mb();
        if (unlikely(atomic_inc_return(count) <= 0))
                fail_fn(count);
 }
@@ -88,25 +82,9 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 static inline int
 __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
 {
-       /*
-        * We have two variants here. The cmpxchg based one is the best one
-        * because it never induce a false contention state.  It is included
-        * here because architectures using the inc/dec algorithms over the
-        * xchg ones are much more likely to support cmpxchg natively.
-        *
-        * If not we fall back to the spinlock based variant - that is
-        * just as efficient (and simpler) as a 'destructive' probing of
-        * the mutex state would be.
-        */
-#ifdef __HAVE_ARCH_CMPXCHG
-       if (likely(atomic_cmpxchg(count, 1, 0) == 1)) {
-               smp_mb();
+       if (likely(atomic_cmpxchg(count, 1, 0) == 1))
                return 1;
-       }
        return 0;
-#else
-       return fail_fn(count);
-#endif
 }
 
 #endif
index 7b9cd2c..580a6d3 100644 (file)
@@ -27,8 +27,6 @@ __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
 {
        if (unlikely(atomic_xchg(count, 0) != 1))
                fail_fn(count);
-       else
-               smp_mb();
 }
 
 /**
@@ -46,10 +44,7 @@ __mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *))
 {
        if (unlikely(atomic_xchg(count, 0) != 1))
                return fail_fn(count);
-       else {
-               smp_mb();
-               return 0;
-       }
+       return 0;
 }
 
 /**
@@ -67,7 +62,6 @@ __mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *))
 static inline void
 __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 {
-       smp_mb();
        if (unlikely(atomic_xchg(count, 1) != 0))
                fail_fn(count);
 }
@@ -110,7 +104,6 @@ __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
                if (prev < 0)
                        prev = 0;
        }
-       smp_mb();
 
        return prev;
 }