cpualloc: annotate the users.

With an x86/32 allyesconfig kernel (trimmed a little until it booted
under kvm) we have 37148 bytes of static percpu data and 117318 bytes
of dynamic percpu data.

This patch threads __FILE__/__LINE__ through the percpu allocator and
prints a per-call-site summary at panic time; the table below comes
from such a dump:

File and line			Number		Size		Total
net/ipv4/af_inet.c:1287		 21		2048		43008
net/ipv4/af_inet.c:1290		 21		2048		43008
kernel/workqueue.c:819		 72		 128		 9216
net/ipv4/af_inet.c:1287		 48		 128		 6144
net/ipv4/af_inet.c:1290		 48		 128		 6144
net/ipv4/route.c:3258		  1		4096		 4096
include/linux/genhd.h:271	 72		  40		 2880
lib/percpu_counter.c:77		194		   4		  776
net/ipv4/af_inet.c:1287		  1		 288		  288
net/ipv4/af_inet.c:1290		  1		 288		  288
net/ipv4/af_inet.c:1287		  1		 256		  256
net/ipv4/af_inet.c:1290		  1		 256		  256
net/core/neighbour.c:1424	  4		  44		  176
kernel/kexec.c:1143		  1		 176		  176
net/ipv4/af_inet.c:1287		  1		 104		  104
net/ipv4/af_inet.c:1290		  1		 104		  104
arch/x86/.../acpi-cpufreq.c:528	 96		   1		   96
arch/x86/acpi/cstate.c:153	  1		  64		   64
net/.../nf_conntrack_core.c:1209  1		  60		   60

Others:								  178
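
Each "Total" above is Number x Size.  At panic time one
"file:line=NumberxSize(Total)" entry is printed per call site, e.g.
"net/ipv4/af_inet.c:1287=21x2048(43008)", followed by a grand total.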
---
 include/linux/percpu.h |   11 ++++++-----
 kernel/panic.c         |   36 ++++++++++++++++++++++++++++++++++++
 mm/allocpercpu.c       |   42 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 83 insertions(+), 6 deletions(-)
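
For reference, here is the accounting idea in miniature, as a
standalone userspace sketch (not part of the patch, and every name in
it is illustrative): a macro forwards __FILE__/__LINE__ into the
allocator, which keeps one (file, line, size) record per call site.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct alloc_site {
	const char *file;
	unsigned int line;
	size_t size;
	unsigned int count;
};

static struct alloc_site *sites;
static size_t num_sites;

/* The macro is the whole trick: every caller is tagged automatically. */
#define tracked_zalloc(size) tracked_zalloc_at((size), __FILE__, __LINE__)

static void *tracked_zalloc_at(size_t size, const char *file,
			       unsigned int line)
{
	size_t i;

	/* Linear scan is fine; there are only a handful of call sites. */
	for (i = 0; i < num_sites; i++)
		if (!strcmp(file, sites[i].file) &&
		    line == sites[i].line && size == sites[i].size)
			break;
	if (i == num_sites) {
		struct alloc_site *new;

		new = realloc(sites, sizeof(*sites) * (num_sites + 1));
		if (!new)
			return NULL;
		sites = new;
		num_sites++;
		sites[i].file = file;
		sites[i].line = line;
		sites[i].size = size;
		sites[i].count = 0;
	}
	sites[i].count++;
	return calloc(1, size);
}

static void dump_sites(void)
{
	size_t i, total = 0;

	for (i = 0; i < num_sites; i++) {
		printf("%s:%u=%ux%zu(%zu)\n", sites[i].file, sites[i].line,
		       sites[i].count, sites[i].size,
		       sites[i].count * sites[i].size);
		total += sites[i].count * sites[i].size;
	}
	printf("total %zu\n", total);
}

int main(void)
{
	int i;

	for (i = 0; i < 3; i++)
		tracked_zalloc(128);	/* three hits on one call site */
	tracked_zalloc(4096);
	dump_sites();			/* leaks the buffers; it is a sketch */
	return 0;
}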

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -81,7 +81,8 @@ struct percpu_data {
         (__typeof__(ptr))__p->ptrs[(cpu)];	          \
 })
 
-extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask);
+extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask,
+				 const char *file, unsigned int line);
 extern void percpu_free(void *__pdata);
 
 #else /* CONFIG_SMP */
@@ -100,15 +101,15 @@ static inline void percpu_free(void *__p
 
 #endif /* CONFIG_SMP */
 
-#define percpu_alloc_mask(size, gfp, mask) \
-	__percpu_alloc_mask((size), (gfp), &(mask))
+#define percpu_alloc_mask(size, gfp, mask, file, line)	\
+	__percpu_alloc_mask((size), (gfp), &(mask), file, line)
 
-#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map)
+#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map, __FILE__, __LINE__)
 
 /* (legacy) interface for use without CPU hotplug handling */
 
 #define __alloc_percpu(size)	percpu_alloc_mask((size), GFP_KERNEL, \
-						  cpu_possible_map)
+						  cpu_possible_map, __FILE__, __LINE__)
 #define alloc_percpu(type)	(type *)__alloc_percpu(sizeof(type))
 #define free_percpu(ptr)	percpu_free((ptr))
 #define per_cpu_ptr(ptr, cpu)	percpu_ptr((ptr), (cpu))
diff --git a/kernel/panic.c b/kernel/panic.c
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -21,6 +21,7 @@
 #include <linux/debug_locks.h>
 #include <linux/random.h>
 #include <linux/kallsyms.h>
+#include <linux/sort.h>
 
 int panic_on_oops;
 static unsigned long tainted_mask;
@@ -42,6 +43,19 @@ static long no_blink(long time)
 /* Returns how long it waited in ms */
 long (*panic_blink)(long time);
 EXPORT_SYMBOL(panic_blink);
+
+struct alloc_bufs {
+	const char *file;
+	unsigned int line;
+	unsigned int size;
+	unsigned int count;
+};
+
+static int cmp_alloc_bufs(const void *_a, const void *_b)
+{
+	const struct alloc_bufs *a = _a, *b = _b;
+	return a->size * a->count - b->size * b->count;
+}
 
 /**
  *	panic - halt the system
@@ -91,6 +105,28 @@ NORET_TYPE void panic(const char * fmt, 
 	smp_send_stop();
 #endif
 
+	{
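+		/* Dump per-call-site percpu allocation stats, sorted by total. */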
+		extern atomic_t percpu_total;
+		extern struct alloc_bufs *alloc_bufs;
+		extern unsigned long num_alloc_bufs;
+		unsigned int i, total = 0;
+
+		sort(alloc_bufs, num_alloc_bufs, sizeof(*alloc_bufs),
+		     cmp_alloc_bufs, NULL);
+		for (i = 0; i < num_alloc_bufs; i++) {
+			printk("%s:%u=%ux%u(%u) ",
+			       alloc_bufs[i].file,
+			       alloc_bufs[i].line,
+			       alloc_bufs[i].count,
+			       alloc_bufs[i].size,
+			       alloc_bufs[i].count *
+			       alloc_bufs[i].size);
+			total += alloc_bufs[i].count * alloc_bufs[i].size;
+		}
+		printk("\n");
+		printk("total %u (%u)\n", atomic_read(&percpu_total), total);
+	}
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
 	if (!panic_blink)
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -98,6 +98,16 @@ static int __percpu_populate_mask(void *
 #define percpu_populate_mask(__pdata, size, gfp, mask) \
 	__percpu_populate_mask((__pdata), (size), (gfp), &(mask))
 
+atomic_t percpu_total;
+struct alloc_bufs {
+	const char *file;
+	unsigned int line;
+	unsigned int size;
+	unsigned int count;
+};
+struct alloc_bufs *alloc_bufs;
+unsigned long num_alloc_bufs;
+
 /**
  * percpu_alloc_mask - initial setup of per-cpu data
  * @size: size of per-cpu object
@@ -108,7 +118,8 @@ static int __percpu_populate_mask(void *
  * which is simplified by the percpu_alloc() wrapper.
  * Per-cpu objects are populated with zeroed buffers.
  */
-void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask,
+			  const char *file, unsigned int line)
 {
 	/*
 	 * We allocate whole cache lines to avoid false sharing
@@ -116,6 +127,35 @@ void *__percpu_alloc_mask(size_t size, g
 	size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
 	void *pdata = kzalloc(sz, gfp);
 	void *__pdata = __percpu_disguise(pdata);
+
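+	/* Unlocked, best-effort debug accounting of each call site. */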
+	atomic_add(size, &percpu_total);
+	{
+		unsigned int i;
+		for (i = 0; i < num_alloc_bufs; i++) {
+			if (strcmp(file, alloc_bufs[i].file) == 0
+			    && line == alloc_bufs[i].line
+			    && size == alloc_bufs[i].size) {
+				alloc_bufs[i].count++;
+				break;
+			}
+		}
+		if (i == num_alloc_bufs) {
+			struct alloc_bufs *new;
+
+			new = krealloc(alloc_bufs,
+				       sizeof(*alloc_bufs)*(num_alloc_bufs+1),
+				       gfp);
+			if (new) {
+				alloc_bufs = new;
+				num_alloc_bufs++;
+				alloc_bufs[i].file = file;
+				alloc_bufs[i].line = line;
+				alloc_bufs[i].size = size;
+				alloc_bufs[i].count = 1;
+			}
+		}
+	}
 
 	if (unlikely(!pdata))
 		return NULL;

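To see the dump in practice, any panic will do; with CONFIG_MAGIC_SYSRQ
enabled, "echo c > /proc/sysrq-trigger" forces an oops (and hence a
panic when panic_on_oops is set), so try it in a scratch kvm guest
rather than on a machine you care about.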