cpuops: Use cmpxchg for xchg to avoid lock semantics
author Christoph Lameter <cl@linux.com>
Tue, 14 Dec 2010 16:28:47 +0000 (10:28 -0600)
committer Tejun Heo <tj@kernel.org>
Sat, 18 Dec 2010 14:54:04 +0000 (15:54 +0100)
Use cmpxchg instead of xchg to realize this_cpu_xchg.

xchg always incurs LOCK overhead because the LOCK prefix is implied,
whereas cmpxchg without an explicit LOCK prefix does not.

Baselines:

xchg() = 18 cycles (no segment prefix, LOCK semantics)
__this_cpu_xchg = 1 cycle

(simulated using this_cpu_read/write, two prefixes. Looks like the
cpu can use loop optimization to get rid of most of the overhead)

Cycles before:

this_cpu_xchg  = 37 cycles (segment prefix and LOCK (implied by xchg))

After:

this_cpu_xchg = 11 cycles (using cmpxchg without lock semantics)

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
arch/x86/include/asm/percpu.h

index b85ade5..8ee4516 100644 (file)
@@ -263,8 +263,9 @@ do {                                                                        \
 })
 
 /*
- * Beware: xchg on x86 has an implied lock prefix. There will be the cost of
- * full lock semantics even though they are not needed.
+ * xchg is implemented using cmpxchg without a lock prefix. xchg is
+ * expensive due to the implied lock prefix.  The processor cannot prefetch
+ * cachelines if xchg is used.
  */
 #define percpu_xchg_op(var, nval)                                      \
 ({                                                                     \
@@ -272,25 +273,33 @@ do {                                                                      \
        typeof(var) pxo_new__ = (nval);                                 \
        switch (sizeof(var)) {                                          \
        case 1:                                                         \
-               asm("xchgb %2, "__percpu_arg(1)                         \
+               asm("\n1:mov "__percpu_arg(1)",%%al"                    \
+                   "\n\tcmpxchgb %2, "__percpu_arg(1)                  \
+                   "\n\tjnz 1b"                                        \
                            : "=a" (pxo_ret__), "+m" (var)              \
                            : "q" (pxo_new__)                           \
                            : "memory");                                \
                break;                                                  \
        case 2:                                                         \
-               asm("xchgw %2, "__percpu_arg(1)                         \
+               asm("\n1:mov "__percpu_arg(1)",%%ax"                    \
+                   "\n\tcmpxchgw %2, "__percpu_arg(1)                  \
+                   "\n\tjnz 1b"                                        \
                            : "=a" (pxo_ret__), "+m" (var)              \
                            : "r" (pxo_new__)                           \
                            : "memory");                                \
                break;                                                  \
        case 4:                                                         \
-               asm("xchgl %2, "__percpu_arg(1)                         \
+               asm("\n1:mov "__percpu_arg(1)",%%eax"                   \
+                   "\n\tcmpxchgl %2, "__percpu_arg(1)                  \
+                   "\n\tjnz 1b"                                        \
                            : "=a" (pxo_ret__), "+m" (var)              \
                            : "r" (pxo_new__)                           \
                            : "memory");                                \
                break;                                                  \
        case 8:                                                         \
-               asm("xchgq %2, "__percpu_arg(1)                         \
+               asm("\n1:mov "__percpu_arg(1)",%%rax"                   \
+                   "\n\tcmpxchgq %2, "__percpu_arg(1)                  \
+                   "\n\tjnz 1b"                                        \
                            : "=a" (pxo_ret__), "+m" (var)              \
                            : "r" (pxo_new__)                           \
                            : "memory");                                \