s390/percpu: make use of interlocked-access facility 1 instructions
authorHeiko Carstens <heiko.carstens@de.ibm.com>
Mon, 21 Oct 2013 05:57:41 +0000 (07:57 +0200)
committerMartin Schwidefsky <schwidefsky@de.ibm.com>
Thu, 24 Oct 2013 15:17:13 +0000 (17:17 +0200)
Optimize this_cpu_* functions for 64 bit by making use of new instructions
that came with the interlocked-access facility 1 (load-and-*) and the
general-instructions-extension facility (asi, agsi).
That way we get rid of the compare-and-swap loop in most cases.
Code size reduction (defconfig, -march=z196): 11,555 bytes.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
arch/s390/include/asm/percpu.h

index 41baca8..061ab45 100644 (file)
 #define ARCH_NEEDS_WEAK_PER_CPU
 #endif
 
-#define arch_this_cpu_to_op(pcp, val, op)                              \
+/*
+ * We use a compare-and-swap loop since that uses less cpu cycles than
+ * disabling and enabling interrupts like the generic variant would do.
+ */
+#define arch_this_cpu_to_op_simple(pcp, val, op)                       \
 ({                                                                     \
        typedef typeof(pcp) pcp_op_T__;                                 \
        pcp_op_T__ old__, new__, prev__;                                \
        new__;                                                          \
 })
 
-#define this_cpu_add_1(pcp, val) arch_this_cpu_to_op(pcp, val, +)
-#define this_cpu_add_2(pcp, val) arch_this_cpu_to_op(pcp, val, +)
-#define this_cpu_add_4(pcp, val) arch_this_cpu_to_op(pcp, val, +)
-#define this_cpu_add_8(pcp, val) arch_this_cpu_to_op(pcp, val, +)
-
-#define this_cpu_add_return_1(pcp, val) arch_this_cpu_to_op(pcp, val, +)
-#define this_cpu_add_return_2(pcp, val) arch_this_cpu_to_op(pcp, val, +)
-#define this_cpu_add_return_4(pcp, val) arch_this_cpu_to_op(pcp, val, +)
-#define this_cpu_add_return_8(pcp, val) arch_this_cpu_to_op(pcp, val, +)
-
-#define this_cpu_and_1(pcp, val) arch_this_cpu_to_op(pcp, val, &)
-#define this_cpu_and_2(pcp, val) arch_this_cpu_to_op(pcp, val, &)
-#define this_cpu_and_4(pcp, val) arch_this_cpu_to_op(pcp, val, &)
-#define this_cpu_and_8(pcp, val) arch_this_cpu_to_op(pcp, val, &)
-
-#define this_cpu_or_1(pcp, val) arch_this_cpu_to_op(pcp, val, |)
-#define this_cpu_or_2(pcp, val) arch_this_cpu_to_op(pcp, val, |)
-#define this_cpu_or_4(pcp, val) arch_this_cpu_to_op(pcp, val, |)
-#define this_cpu_or_8(pcp, val) arch_this_cpu_to_op(pcp, val, |)
-
-#define this_cpu_xor_1(pcp, val) arch_this_cpu_to_op(pcp, val, ^)
-#define this_cpu_xor_2(pcp, val) arch_this_cpu_to_op(pcp, val, ^)
-#define this_cpu_xor_4(pcp, val) arch_this_cpu_to_op(pcp, val, ^)
-#define this_cpu_xor_8(pcp, val) arch_this_cpu_to_op(pcp, val, ^)
+#define this_cpu_add_1(pcp, val)       arch_this_cpu_to_op_simple(pcp, val, +)
+#define this_cpu_add_2(pcp, val)       arch_this_cpu_to_op_simple(pcp, val, +)
+#define this_cpu_add_return_1(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +)
+#define this_cpu_add_return_2(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +)
+#define this_cpu_and_1(pcp, val)       arch_this_cpu_to_op_simple(pcp, val, &)
+#define this_cpu_and_2(pcp, val)       arch_this_cpu_to_op_simple(pcp, val, &)
+#define this_cpu_or_1(pcp, val)                arch_this_cpu_to_op_simple(pcp, val, |)
+#define this_cpu_or_2(pcp, val)                arch_this_cpu_to_op_simple(pcp, val, |)
+#define this_cpu_xor_1(pcp, val)       arch_this_cpu_to_op_simple(pcp, val, ^)
+#define this_cpu_xor_2(pcp, val)       arch_this_cpu_to_op_simple(pcp, val, ^)
+
+#ifndef CONFIG_HAVE_MARCH_Z196_FEATURES
+
+#define this_cpu_add_4(pcp, val)       arch_this_cpu_to_op_simple(pcp, val, +)
+#define this_cpu_add_8(pcp, val)       arch_this_cpu_to_op_simple(pcp, val, +)
+#define this_cpu_add_return_4(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +)
+#define this_cpu_add_return_8(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +)
+#define this_cpu_and_4(pcp, val)       arch_this_cpu_to_op_simple(pcp, val, &)
+#define this_cpu_and_8(pcp, val)       arch_this_cpu_to_op_simple(pcp, val, &)
+#define this_cpu_or_4(pcp, val)                arch_this_cpu_to_op_simple(pcp, val, |)
+#define this_cpu_or_8(pcp, val)                arch_this_cpu_to_op_simple(pcp, val, |)
+#define this_cpu_xor_4(pcp, val)       arch_this_cpu_to_op_simple(pcp, val, ^)
+#define this_cpu_xor_8(pcp, val)       arch_this_cpu_to_op_simple(pcp, val, ^)
+
+#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */
+
+#define arch_this_cpu_add(pcp, val, op1, op2, szcast)                  \
+{                                                                      \
+       typedef typeof(pcp) pcp_op_T__;                                 \
+       pcp_op_T__ val__ = (val);                                       \
+       pcp_op_T__ old__, *ptr__;                                       \
+       preempt_disable();                                              \
+       ptr__ = __this_cpu_ptr(&(pcp));                                 \
+       if (__builtin_constant_p(val__) &&                              \
+           ((szcast)val__ > -129) && ((szcast)val__ < 128)) {          \
+               asm volatile(                                           \
+                       op2 "   %[ptr__],%[val__]\n"                    \
+                       : [ptr__] "+Q" (*ptr__)                         \
+                       : [val__] "i" ((szcast)val__)                   \
+                       : "cc");                                        \
+       } else {                                                        \
+               asm volatile(                                           \
+                       op1 "   %[old__],%[val__],%[ptr__]\n"           \
+                       : [old__] "=d" (old__), [ptr__] "+Q" (*ptr__)   \
+                       : [val__] "d" (val__)                           \
+                       : "cc");                                        \
+       }                                                               \
+       preempt_enable();                                               \
+}
+
+#define this_cpu_add_4(pcp, val) arch_this_cpu_add(pcp, val, "laa", "asi", int)
+#define this_cpu_add_8(pcp, val) arch_this_cpu_add(pcp, val, "laag", "agsi", long)
+
+#define arch_this_cpu_add_return(pcp, val, op)                         \
+({                                                                     \
+       typedef typeof(pcp) pcp_op_T__;                                 \
+       pcp_op_T__ val__ = (val);                                       \
+       pcp_op_T__ old__, *ptr__;                                       \
+       preempt_disable();                                              \
+       ptr__ = __this_cpu_ptr(&(pcp));                                 \
+       asm volatile(                                                   \
+               op "    %[old__],%[val__],%[ptr__]\n"                   \
+               : [old__] "=d" (old__), [ptr__] "+Q" (*ptr__)           \
+               : [val__] "d" (val__)                                   \
+               : "cc");                                                \
+       preempt_enable();                                               \
+       old__ + val__;                                                  \
+})
+
+#define this_cpu_add_return_4(pcp, val) arch_this_cpu_add_return(pcp, val, "laa")
+#define this_cpu_add_return_8(pcp, val) arch_this_cpu_add_return(pcp, val, "laag")
+
+#define arch_this_cpu_to_op(pcp, val, op)                              \
+{                                                                      \
+       typedef typeof(pcp) pcp_op_T__;                                 \
+       pcp_op_T__ val__ = (val);                                       \
+       pcp_op_T__ old__, *ptr__;                                       \
+       preempt_disable();                                              \
+       ptr__ = __this_cpu_ptr(&(pcp));                                 \
+       asm volatile(                                                   \
+               op "    %[old__],%[val__],%[ptr__]\n"                   \
+               : [old__] "=d" (old__), [ptr__] "+Q" (*ptr__)           \
+               : [val__] "d" (val__)                                   \
+               : "cc");                                                \
+       preempt_enable();                                               \
+}
+
+#define this_cpu_and_4(pcp, val)       arch_this_cpu_to_op(pcp, val, "lan")
+#define this_cpu_and_8(pcp, val)       arch_this_cpu_to_op(pcp, val, "lang")
+#define this_cpu_or_4(pcp, val)                arch_this_cpu_to_op(pcp, val, "lao")
+#define this_cpu_or_8(pcp, val)                arch_this_cpu_to_op(pcp, val, "laog")
+#define this_cpu_xor_4(pcp, val)       arch_this_cpu_to_op(pcp, val, "lax")
+#define this_cpu_xor_8(pcp, val)       arch_this_cpu_to_op(pcp, val, "laxg")
+
+#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */
 
 #define arch_this_cpu_cmpxchg(pcp, oval, nval)                         \
 ({                                                                     \