percpu: measure use

With the idea of using virtual mappings for percpu regions, we wonder
how often we access other CPUs' per-cpu variables.

32-bit 4-way SMP (under kvm), kernel make -j4:
get_cpu_var()		52,358,618
raw_get_cpu_var()	   287,191
per_cpu()		17,371,648
per_cpu(same):		16,020,390

Total same-cpu calls:	68,666,199
Cross-per-cpu calls:	 1,351,258

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/Makefile_32.cpu     |    2 +-
 include/asm-generic/percpu.h |   10 +++++++---
 kernel/module.c              |   11 +++++++++++
 kernel/smp.c                 |   21 +++++++++++++++++++++
 4 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -47,5 +47,5 @@ cflags-$(CONFIG_X86_GENERIC) 	+= $(call 
 # Bug fix for binutils: this option is required in order to keep
 # binutils from generating NOPL instructions against our will.
 ifneq ($(CONFIG_X86_P6_NOP),y)
-cflags-y			+= $(call cc-option,-Wa$(comma)-mtune=generic32,)
+#cflags-y			+= $(call cc-option,-Wa$(comma)-mtune=generic32,)
 endif
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -53,12 +53,16 @@ extern unsigned long __per_cpu_offset[NR
  * established ways to produce a usable pointer from the percpu variable
  * offset.
  */
+void count_per_cpu(unsigned int cpu);
+void count_get_cpu_var(void);
+void count_raw_get_cpu_var(void);
+
 #define per_cpu(var, cpu) \
-	(*SHIFT_PERCPU_PTR(&per_cpu_var(var), per_cpu_offset(cpu)))
+	(*(count_per_cpu(cpu), SHIFT_PERCPU_PTR(&per_cpu_var(var), per_cpu_offset(cpu))))
 #define __get_cpu_var(var) \
-	(*SHIFT_PERCPU_PTR(&per_cpu_var(var), my_cpu_offset))
+	(*(count_get_cpu_var(), SHIFT_PERCPU_PTR(&per_cpu_var(var), my_cpu_offset)))
 #define __raw_get_cpu_var(var) \
-	(*SHIFT_PERCPU_PTR(&per_cpu_var(var), __my_cpu_offset))
+	(*(count_raw_get_cpu_var(), SHIFT_PERCPU_PTR(&per_cpu_var(var), __my_cpu_offset)))
 
 
 #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
diff --git a/kernel/module.c b/kernel/module.c
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2736,6 +2736,17 @@ static const struct seq_operations modul
 
 static int modules_open(struct inode *inode, struct file *file)
 {
+	extern atomic_t get_cpu_var_count, raw_get_cpu_var_count, per_cpu_count[], unnecessary_count[];
+	unsigned int i;
+
+	printk("get_cpu_var_count: %i\n", atomic_xchg(&get_cpu_var_count, 0));
+	printk("raw_get_cpu_var_count: %i\n",
+	       atomic_xchg(&raw_get_cpu_var_count, 0));
+	for_each_online_cpu(i)
+		printk("per_cpu %i: %u (%u self)\n",
+		       i, atomic_xchg(&per_cpu_count[i], 0),
+		       atomic_xchg(&unnecessary_count[i], 0));
+
 	return seq_open(file, &modules_op);
 }
 
diff --git a/kernel/smp.c b/kernel/smp.c
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,27 @@
 #include <linux/cpu.h>
 #include <linux/hardirq.h>
 
+atomic_t get_cpu_var_count, raw_get_cpu_var_count, per_cpu_count[CONFIG_NR_CPUS], unnecessary_count[CONFIG_NR_CPUS];
+void count_per_cpu(unsigned int cpu)
+{
+	if (cpu == raw_smp_processor_id())
+		atomic_inc(&unnecessary_count[cpu]);
+	atomic_inc(&per_cpu_count[cpu]);
+}
+EXPORT_SYMBOL(count_per_cpu);
+
+void count_get_cpu_var(void)
+{
+	atomic_inc(&get_cpu_var_count);
+}
+EXPORT_SYMBOL(count_get_cpu_var);
+
+void count_raw_get_cpu_var(void)
+{
+	atomic_inc(&raw_get_cpu_var_count);
+}
+EXPORT_SYMBOL(count_raw_get_cpu_var);
+
 static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
 
 static struct {
