Benchmarks for local_t variants

(This patch also fixes the x86 cpu_local_* macros, which are obviously
unused).

I chose a large array (1M longs) for the inc/add/add_return tests so
the trivalue case would show some cache pressure.

The cpu_local_inc case is always cache-hot, so it's not comparable to
the others.

Time in ns per iteration (brackets is with CONFIG_PREEMPT=y):

		inc	add	add_return	cpu_local_inc	read
x86-32: 2.13 Ghz Core Duo 2
atomic_long	118	118	115		17		17
irqsave/rest	77	78	77		23		16
trivalue	45	45	127		3(6)		21
local_t		36	36	36		1(5)		17

x86-64: 2.6 GHz Dual-Core AMD Opteron 2218
atomic_long	55	60	-		6		19
irqsave/rest	54	54	-		11		19
trivalue	47	47	-		5		28
local_t		47	46	-		1		19

PPC64: 2.7 GHz PPC970MP [normalized]
atomic_long	18	18	20		3(4)		8
irqsave/rest	10(4)	5(4)	4		8(9)		10(9)
trivalue	9	9	2		1(3)		10
local_t		18	18	18		3(4)		8

Sparc64: UltraSPARC-IIIi
atomic_long	243	222	-		37		169
irqsave/rest	205	205	-		25		169
trivalue	193	193	-		11		221
local_t		221	221	-		37		169

Sparc64: Niagara-2
atomic_long	207	206	-		72		160
irqsave/rest	228	228	-		78		160
trivalue:	172	172	-		20		222
local_t		206	207	-		73		160

S/390: [normalized]
atomic_long     19	 18	-		 3		 17
irqsave/rest    57	 58	-		39		 22
trivalue        43	 43	-		 4		 45
local_t         18	 20	-		 2		 16

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/include/asm/local.h |   20 +--
 lib/Kconfig.debug            |    6 +
 lib/Makefile                 |    1 
 lib/test_local_t.c           |  224 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 241 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -220,16 +220,16 @@ static inline long local_sub_return(long
 	preempt_enable();		\
 })					\
 
-#define cpu_local_read(l)    cpu_local_wrap_v(local_read(&__get_cpu_var((l))))
-#define cpu_local_set(l, i)  cpu_local_wrap(local_set(&__get_cpu_var((l)), (i)))
-#define cpu_local_inc(l)     cpu_local_wrap(local_inc(&__get_cpu_var((l))))
-#define cpu_local_dec(l)     cpu_local_wrap(local_dec(&__get_cpu_var((l))))
-#define cpu_local_add(i, l)  cpu_local_wrap(local_add((i), &__get_cpu_var((l))))
-#define cpu_local_sub(i, l)  cpu_local_wrap(local_sub((i), &__get_cpu_var((l))))
+#define cpu_local_read(l)    cpu_local_wrap_v(local_read(&__get_cpu_var(l)))
+#define cpu_local_set(l, i)  cpu_local_wrap(local_set(&__get_cpu_var(l), (i)))
+#define cpu_local_inc(l)     cpu_local_wrap(local_inc(&__get_cpu_var(l)))
+#define cpu_local_dec(l)     cpu_local_wrap(local_dec(&__get_cpu_var(l)))
+#define cpu_local_add(i, l)  cpu_local_wrap(local_add((i), &__get_cpu_var(l)))
+#define cpu_local_sub(i, l)  cpu_local_wrap(local_sub((i), &__get_cpu_var(l)))
 
-#define __cpu_local_inc(l)	cpu_local_inc((l))
-#define __cpu_local_dec(l)	cpu_local_dec((l))
-#define __cpu_local_add(i, l)	cpu_local_add((i), (l))
-#define __cpu_local_sub(i, l)	cpu_local_sub((i), (l))
+#define __cpu_local_inc(l)	cpu_local_inc(l)
+#define __cpu_local_dec(l)	cpu_local_dec(l)
+#define __cpu_local_add(i, l)	cpu_local_add((i), l)
+#define __cpu_local_sub(i, l)	cpu_local_sub((i), l)
 
 #endif /* _ASM_X86_LOCAL_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -133,6 +133,12 @@ config DEBUG_KERNEL
 	help
 	  Say Y here if you are developing drivers or trying to debug and
 	  identify kernel problems.
+
+config BENCHMARK_LOCAL_T
+       bool "Benchmark local_t variants"
+       help
+          Bloat the kernel with pseudo-scientific benchmarks of
+	  different local_t implementations, run at boot.  Say N.
 
 config DEBUG_SHIRQ
 	bool "Debug shared IRQ handlers"
diff --git a/lib/Makefile b/lib/Makefile
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -27,6 +27,7 @@ CFLAGS_kobject_uevent.o += -DDEBUG
 CFLAGS_kobject_uevent.o += -DDEBUG
 endif
 
+obj-$(CONFIG_BENCHMARK_LOCAL_T) += test_local_t.o
 lib-$(CONFIG_HOTPLUG) += kobject_uevent.o
 obj-$(CONFIG_GENERIC_IOMAP) += iomap.o
 obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
diff --git a/lib/test_local_t.c b/lib/test_local_t.c
new file mode 100644
--- /dev/null
+++ b/lib/test_local_t.c
@@ -0,0 +1,224 @@
+/* There are three obvious ways to implement local_t on an arch which
+ * can't do single-instruction inc/dec etc.
+ * 1) atomic_long
+ * 2) irq_save/irq_restore
+ * 3) multiple counters.
+ *
+ * This does a very rough benchmark on each one.
+ */
+#include <asm/local.h>
+#include <asm/atomic.h>
+#include <linux/hardirq.h>
+
+struct local1 {
+	atomic_long_t v;
+};
+struct local2 {
+	unsigned long v;
+};
+struct local3 {
+	unsigned long v[3];
+};
+
+/* Enough to put some pressure on the caches. */
+#define NUM_LOCAL_TEST (1024*1024)
+#define NUM_LOCAL_RUNS (NUM_LOCAL_TEST*32)
+/* This will make it jump around looking random */
+#define STRIDE 514001
+
+static __initdata union {
+	struct local1 l1[NUM_LOCAL_TEST];
+	struct local2 l2[NUM_LOCAL_TEST];
+	struct local3 l3[NUM_LOCAL_TEST];
+	local_t l4[NUM_LOCAL_TEST];
+} test_data;
+
+/* This doesn't test cache effects at all */
+#define NUM_PERCPU_VARS		16
+DEFINE_PER_CPU(struct local1[NUM_PERCPU_VARS], local1_test);
+DEFINE_PER_CPU(struct local2[NUM_PERCPU_VARS], local2_test);
+DEFINE_PER_CPU(struct local3[NUM_PERCPU_VARS], local3_test);
+DEFINE_PER_CPU(local_t[NUM_PERCPU_VARS], local4_test);
+
+static __init void print_result(const char *str,
+				struct timespec start, struct timespec end)
+{
+	s64 diff;
+
+	diff = ktime_to_ns(ktime_sub(timespec_to_ktime(end),
+				     timespec_to_ktime(start)));
+	printk("   %s=%lli (total = %lli)\n",
+	       str, diff/NUM_LOCAL_RUNS, diff);
+}
+
+static __init unsigned int warm_local_test_cache(const void *mem, size_t len)
+{
+	unsigned int i, total = 0;
+	for (i = 0; i < len; i++)
+		total += ((char *)mem)[i];
+	return total;
+}
+
+#define TEST_LOOP(expr)				\
+	n = 0;					\
+	getnstimeofday(&start);			\
+	for (i = 0; i < NUM_LOCAL_RUNS; i++) {	\
+		expr;				\
+		n += STRIDE;			\
+		n %= NUM_LOCAL_TEST;		\
+	}					\
+	getnstimeofday(&end);
+
+static __init int test_local_variants(void)
+{
+	struct timespec start, end;
+	unsigned int i, n;
+	unsigned long total, warm_total = 0;
+	struct local1 *l1;
+	struct local2 *l2;
+	struct local3 *l3;
+	local_t *l4;
+
+	printk("Running local_t variant benchmarks\n");
+	l1 = test_data.l1;
+	l2 = test_data.l2;
+	l3 = test_data.l3;
+	l4 = test_data.l4;
+
+	printk("atomic_long:\n");
+	memset(l1, 0, sizeof(*l1)*NUM_LOCAL_TEST);
+	TEST_LOOP(atomic_long_inc(&l1[n].v));
+	print_result("local_inc", start, end);
+
+	warm_total += warm_local_test_cache(l1, sizeof(*l1)*NUM_LOCAL_TEST);
+	TEST_LOOP(atomic_long_add(1234, &l1[n].v));
+	print_result("local_add", start, end);
+
+	warm_total += warm_local_test_cache(l1, sizeof(*l1)*NUM_LOCAL_TEST);
+	TEST_LOOP(atomic_long_inc(&__get_cpu_var(local1_test)[n%NUM_PERCPU_VARS].v));
+	print_result("cpu_local_inc", start, end);
+
+	warm_total += warm_local_test_cache(l1, sizeof(*l1)*NUM_LOCAL_TEST);
+	total = 0;
+	TEST_LOOP(total += atomic_long_read(&l1[n].v));
+	print_result("local_read", start, end);
+
+	warm_total += warm_local_test_cache(l1, sizeof(*l1)*NUM_LOCAL_TEST);
+	TEST_LOOP(total += atomic_long_add_return(7, &l1[n].v));
+	print_result("local_add_return", start, end);
+
+	/* Correctness test, and also ensure gcc doesn't cheat. */
+	BUG_ON(total != 1728053248);
+
+	printk("irqsave/restore:\n");
+	memset(l2, 0, sizeof(*l2)*NUM_LOCAL_TEST);
+	TEST_LOOP(unsigned long flags;
+		  local_irq_save(flags);
+		  l2[n].v++;
+		  local_irq_restore(flags));
+	print_result("local_inc", start, end);
+
+	warm_total += warm_local_test_cache(l2, sizeof(*l2)*NUM_LOCAL_TEST);
+	TEST_LOOP(unsigned long flags;
+		  local_irq_save(flags);
+		  l2[n].v += 1234;
+		  local_irq_restore(flags));
+	print_result("local_add", start, end);
+
+	warm_total += warm_local_test_cache(l2, sizeof(*l2)*NUM_LOCAL_TEST);
+	TEST_LOOP(unsigned long flags;
+		  local_irq_save(flags);
+		  __get_cpu_var(local2_test)[n%NUM_PERCPU_VARS].v++;
+		  local_irq_restore(flags));
+	print_result("cpu_local_inc", start, end);
+
+	warm_total += warm_local_test_cache(l2, sizeof(*l2)*NUM_LOCAL_TEST);
+	total = 0;
+	TEST_LOOP(total += l2[n].v);
+	print_result("local_read", start, end);
+
+	warm_total += warm_local_test_cache(l1, sizeof(*l1)*NUM_LOCAL_TEST);
+	TEST_LOOP(unsigned long flags;
+		  local_irq_save(flags);
+		  l2[n].v += 7;
+		  total += l2[n].v;
+		  local_irq_restore(flags));
+	print_result("local_add_return", start, end);
+
+	/* Correctness test, and also ensure gcc doesn't cheat. */
+	BUG_ON(total != 1728053248);
+
+	printk("trivalue:\n");
+	memset(l3, 0, sizeof(*l3)*NUM_LOCAL_TEST);
+	TEST_LOOP(unsigned int idx
+			= !(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) +
+			!(preempt_count() & HARDIRQ_MASK);
+		  preempt_disable();
+		  l3[n].v[idx]++;
+		  preempt_enable());
+	print_result("local_inc", start, end);
+
+	warm_total += warm_local_test_cache(l3, sizeof(*l3)*NUM_LOCAL_TEST);
+	TEST_LOOP(unsigned int idx
+			= !(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) +
+			!(preempt_count() & HARDIRQ_MASK);
+		  preempt_disable();
+		  l3[n].v[idx] += 1234;
+		  preempt_enable());
+	print_result("local_add", start, end);
+
+	warm_total += warm_local_test_cache(l3, sizeof(*l3)*NUM_LOCAL_TEST);
+	TEST_LOOP(unsigned int idx
+			= !(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) +
+			!(preempt_count() & HARDIRQ_MASK);
+		  get_cpu_var(local3_test)[n%NUM_PERCPU_VARS].v[idx]++;
+		  put_cpu_var());
+	print_result("cpu_local_inc", start, end);
+
+	warm_total += warm_local_test_cache(l3, sizeof(*l3)*NUM_LOCAL_TEST);
+	total = 0;
+	TEST_LOOP(preempt_disable();
+		  total += l3[n].v[0] + l3[n].v[1] + l3[n].v[2];
+		  preempt_enable());
+	print_result("local_read", start, end);
+
+	warm_total += warm_local_test_cache(l1, sizeof(*l1)*NUM_LOCAL_TEST);
+	TEST_LOOP(unsigned long flags;
+		  local_irq_save(flags);
+		  l3[n].v[0] += 7;
+		  total += l3[n].v[0] + l3[n].v[1] + l3[n].v[2];
+		  local_irq_restore(flags));
+	print_result("local_add_return", start, end);
+
+	/* Correctness test, and also ensure gcc doesn't cheat. */
+	BUG_ON(total != 1728053248);
+
+	printk("local_t:\n");
+	memset(l4, 0, sizeof(*l4)*NUM_LOCAL_TEST);
+	TEST_LOOP(local_inc(&l4[n]));
+	print_result("local_inc", start, end);
+
+	warm_total += warm_local_test_cache(l4, sizeof(*l4)*NUM_LOCAL_TEST);
+	TEST_LOOP(local_add(1234, &l4[n]));
+	print_result("local_add", start, end);
+
+	warm_total += warm_local_test_cache(l4, sizeof(*l4)*NUM_LOCAL_TEST);
+	TEST_LOOP(cpu_local_inc(local4_test[n%NUM_PERCPU_VARS]));
+	print_result("cpu_local_inc", start, end);
+
+	warm_total += warm_local_test_cache(l4, sizeof(*l4)*NUM_LOCAL_TEST);
+	total = 0;
+	TEST_LOOP(total += local_read(&l4[n]));
+	print_result("local_read", start, end);
+
+	warm_total += warm_local_test_cache(l4, sizeof(*l4)*NUM_LOCAL_TEST);
+	TEST_LOOP(total += local_add_return(7, &l4[n]));
+	print_result("local_add_return", start, end);
+
+	/* Correctness test, and also ensure gcc doesn't cheat. */
+	BUG_ON(total != 1728053248);
+
+	/* This ensures gcc runs the warming loops. */
+	return warm_total == 0;
+}
+core_initcall(test_local_variants);
