FIXME: Split this patch up by subsystem (one patch per user of the new helpers).

Use the new __get_cpu_ptr() and get_cpu_ptr() helpers rather than
open-coded per_cpu_ptr(..., smp_processor_id()).  get_cpu_ptr() and
put_cpu_ptr() replace the get_cpu()/per_cpu_ptr()/put_cpu() pattern,
__get_cpu_ptr() is for callers that already have preemption (or BHs,
or IRQs) disabled, and __raw_get_cpu_ptr() replaces the
raw_smp_processor_id() cases, where any CPU's copy will do.
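
The conversion pattern, as a minimal sketch (assumed expansions only,
not the final macros; "stats" and "st" below are made-up names):

	/* Assumed semantics of the new helpers (sketch, not the real code). */
	#define __get_cpu_ptr(ptr)	per_cpu_ptr((ptr), smp_processor_id())
	#define __raw_get_cpu_ptr(ptr)	per_cpu_ptr((ptr), raw_smp_processor_id())
	#define get_cpu_ptr(ptr)	({ preempt_disable(); __get_cpu_ptr(ptr); })
	#define put_cpu_ptr(ptr)	do { (void)(ptr); preempt_enable(); } while (0)

	/* Before: explicit cpu temporary, preemption disabled by get_cpu(). */
	int cpu = get_cpu();
	st = per_cpu_ptr(stats, cpu);
	st->count++;
	put_cpu();

	/* After: same lifetime rules, no cpu temporary. */
	st = get_cpu_ptr(stats);
	st->count++;
	put_cpu_ptr(stats);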

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 block/blktrace.c                            |    2 -
 crypto/async_tx/async_tx.c                  |    5 +---
 drivers/dma/dmaengine.c                     |   35 +++++++++++++++----------------
 drivers/infiniband/hw/ehca/ehca_irq.c       |    3 --
 drivers/net/chelsio/sge.c                   |    5 +---
 drivers/net/loopback.c                      |    4 +--
 drivers/net/veth.c                          |    7 ++----
 fs/ext4/mballoc.c                           |    2 -
 fs/nfs/iostat.h                             |   10 ++------
 fs/xfs/xfs_mount.c                          |    7 ++----
 include/net/netfilter/nf_conntrack.h        |    4 +--
 include/net/netfilter/nf_conntrack_ecache.h |    2 -
 kernel/posix-cpu-timers.c                   |    2 -
 kernel/sched.c                              |    5 +---
 kernel/sched_stats.h                        |    4 +--
 kernel/srcu.c                               |    4 +--
 kernel/workqueue.c                          |    4 +--
 lib/percpu_counter.c                        |    5 +---
 net/core/sock.c                             |    3 --
 net/netfilter/nf_conntrack_ecache.c         |    4 +--
 net/xfrm/xfrm_ipcomp.c                      |   20 +++++++----------
 21 files changed, 60 insertions(+), 77 deletions(-)

diff --git a/block/blktrace.c b/block/blktrace.c
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -91,7 +91,7 @@ void __trace_note_message(struct blk_tra
 	char *buf;
 
 	local_irq_save(flags);
-	buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
+	buf = __get_cpu_ptr(bt->msg_data);
 	va_start(args, fmt);
 	n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
 	va_end(args);
diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c
--- a/crypto/async_tx/async_tx.c
+++ b/crypto/async_tx/async_tx.c
@@ -391,9 +391,8 @@ __async_tx_find_channel(struct dma_async
 		return depend_tx->chan;
 	else if (likely(channel_table_initialized)) {
 		struct dma_chan_ref *ref;
-		int cpu = get_cpu();
-		ref = per_cpu_ptr(channel_table[tx_type], cpu)->ref;
-		put_cpu();
+		ref = get_cpu_ptr(channel_table[tx_type])->ref;
+		put_cpu_ptr(channel_table[tx_type]);
 		return ref ? ref->chan : NULL;
 	} else
 		return NULL;
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -111,10 +111,9 @@ static ssize_t show_in_use(struct device
 		atomic_read(&chan->refcount.refcount) > 1)
 		in_use = 1;
 	else {
-		if (local_read(&(per_cpu_ptr(chan->local,
-			get_cpu())->refcount)) > 0)
+		if (local_read(&get_cpu_ptr(chan->local)->refcount) > 0)
 			in_use = 1;
-		put_cpu();
+		put_cpu_ptr(chan->local);
 	}
 
 	return sprintf(buf, "%d\n", in_use);
@@ -498,6 +497,6 @@ dma_async_memcpy_buf_to_buf(struct dma_c
 	struct dma_async_tx_descriptor *tx;
 	dma_addr_t dma_dest, dma_src;
 	dma_cookie_t cookie;
-	int cpu;
+	struct dma_chan_percpu *local;
 
 	dma_src = dma_map_single(dev->dev, src, len, DMA_TO_DEVICE);
@@ -514,10 +513,10 @@ dma_async_memcpy_buf_to_buf(struct dma_c
 	tx->callback = NULL;
 	cookie = tx->tx_submit(tx);
 
-	cpu = get_cpu();
-	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
-	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
-	put_cpu();
+	local = get_cpu_ptr(chan->local);
+	local->bytes_transferred += len;
+	local->memcpy_count++;
+	put_cpu_ptr(chan->local);
 
 	return cookie;
 }
@@ -544,6 +543,6 @@ dma_async_memcpy_buf_to_pg(struct dma_ch
 	struct dma_async_tx_descriptor *tx;
 	dma_addr_t dma_dest, dma_src;
 	dma_cookie_t cookie;
-	int cpu;
+	struct dma_chan_percpu *local;
 
 	dma_src = dma_map_single(dev->dev, kdata, len, DMA_TO_DEVICE);
@@ -560,10 +559,10 @@ dma_async_memcpy_buf_to_pg(struct dma_ch
 	tx->callback = NULL;
 	cookie = tx->tx_submit(tx);
 
-	cpu = get_cpu();
-	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
-	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
-	put_cpu();
+	local = get_cpu_ptr(chan->local);
+	local->bytes_transferred += len;
+	local->memcpy_count++;
+	put_cpu_ptr(chan->local);
 
 	return cookie;
 }
@@ -592,6 +591,6 @@ dma_async_memcpy_pg_to_pg(struct dma_cha
 	struct dma_async_tx_descriptor *tx;
 	dma_addr_t dma_dest, dma_src;
 	dma_cookie_t cookie;
-	int cpu;
+	struct dma_chan_percpu *local;
 
 	dma_src = dma_map_page(dev->dev, src_pg, src_off, len, DMA_TO_DEVICE);
@@ -609,10 +608,10 @@ dma_async_memcpy_pg_to_pg(struct dma_cha
 	tx->callback = NULL;
 	cookie = tx->tx_submit(tx);
 
-	cpu = get_cpu();
-	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
-	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
-	put_cpu();
+	local = get_cpu_ptr(chan->local);
+	local->bytes_transferred += len;
+	local->memcpy_count++;
+	put_cpu_ptr(chan->local);
 
 	return cookie;
 }
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
--- a/drivers/infiniband/hw/ehca/ehca_irq.c
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -827,8 +827,7 @@ static void __cpuinit take_over_work(str
 		cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
 
 		list_del(&cq->entry);
-		__queue_comp_task(cq, per_cpu_ptr(pool->cpu_comp_tasks,
-						  smp_processor_id()));
+		__queue_comp_task(cq, __get_cpu_ptr(pool->cpu_comp_tasks));
 	}
 
 	spin_unlock_irqrestore(&cct->task_lock, flags_cct);
diff --git a/drivers/net/chelsio/sge.c b/drivers/net/chelsio/sge.c
--- a/drivers/net/chelsio/sge.c
+++ b/drivers/net/chelsio/sge.c
@@ -1378,7 +1378,7 @@ static void sge_rx(struct sge *sge, stru
 	}
 	__skb_pull(skb, sizeof(*p));
 
-	st = per_cpu_ptr(sge->port_stats[p->iff], smp_processor_id());
+	st = __get_cpu_ptr(sge->port_stats[p->iff]);
 
 	skb->protocol = eth_type_trans(skb, adapter->port[p->iff].dev);
 	if ((adapter->flags & RX_CSUM_ENABLED) && p->csum == 0xffff &&
@@ -1780,8 +1780,7 @@ int t1_start_xmit(struct sk_buff *skb, s
 {
 	struct adapter *adapter = dev->ml_priv;
 	struct sge *sge = adapter->sge;
-	struct sge_port_stats *st = per_cpu_ptr(sge->port_stats[dev->if_port],
-						smp_processor_id());
+	struct sge_port_stats *st = __get_cpu_ptr(sge->port_stats[dev->if_port]);
 	struct cpl_tx_pkt *cpl;
 	struct sk_buff *orig_skb = skb;
 	int ret;
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -76,9 +76,9 @@ static int loopback_xmit(struct sk_buff 
 
 	skb->protocol = eth_type_trans(skb,dev);
 
-	/* it's OK to use per_cpu_ptr() because BHs are off */
+	/* it's OK to use __get_cpu_ptr() because BHs are off */
 	pcpu_lstats = dev->ml_priv;
-	lb_stats = per_cpu_ptr(pcpu_lstats, smp_processor_id());
+	lb_stats = __get_cpu_ptr(pcpu_lstats);
 	lb_stats->bytes += skb->len;
 	lb_stats->packets++;
 
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -148,7 +148,7 @@ static int veth_xmit(struct sk_buff *skb
 	struct net_device *rcv = NULL;
 	struct veth_priv *priv, *rcv_priv;
 	struct veth_net_stats *stats;
-	int length, cpu;
+	int length;
 
 	skb_orphan(skb);
 
@@ -156,8 +156,7 @@ static int veth_xmit(struct sk_buff *skb
 	rcv = priv->peer;
 	rcv_priv = netdev_priv(rcv);
 
-	cpu = smp_processor_id();
-	stats = per_cpu_ptr(priv->stats, cpu);
+	stats = __get_cpu_ptr(priv->stats);
 
 	if (!(rcv->flags & IFF_UP))
 		goto outf;
@@ -178,7 +177,7 @@ static int veth_xmit(struct sk_buff *skb
 	stats->tx_bytes += length;
 	stats->tx_packets++;
 
-	stats = per_cpu_ptr(rcv_priv->stats, cpu);
+	stats = __get_cpu_ptr(rcv_priv->stats);
 	stats->rx_bytes += length;
 	stats->rx_packets++;
 
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4007,7 +4007,7 @@ static void ext4_mb_group_or_file(struct
 	 * per cpu locality group is to reduce the contention between block
 	 * request from multiple CPUs.
 	 */
-	ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id());
+	ac->ac_lg = __raw_get_cpu_ptr(sbi->s_locality_groups);
 
 	/* we're going to use group allocation */
 	ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -17,16 +17,14 @@ struct nfs_iostats {
 struct nfs_iostats {
 	unsigned long long	bytes[__NFSIOS_BYTESMAX];
 	unsigned long		events[__NFSIOS_COUNTSMAX];
-} ____cacheline_aligned;
+};
 
 static inline void nfs_inc_server_stats(const struct nfs_server *server,
 					enum nfs_stat_eventcounters stat)
 {
 	struct nfs_iostats *iostats;
-	int cpu;
 
-	cpu = get_cpu();
-	iostats = per_cpu_ptr(server->io_stats, cpu);
+	iostats = get_cpu_ptr(server->io_stats);
 	iostats->events[stat]++;
 	put_cpu_no_resched();
 }
@@ -42,10 +40,8 @@ static inline void nfs_add_server_stats(
 					unsigned long addend)
 {
 	struct nfs_iostats *iostats;
-	int cpu;
 
-	cpu = get_cpu();
-	iostats = per_cpu_ptr(server->io_stats, cpu);
+	iostats = get_cpu_ptr(server->io_stats);
 	iostats->bytes[stat] += addend;
 	put_cpu_no_resched();
 }
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -2321,8 +2321,7 @@ xfs_icsb_modify_counters(
 
 	might_sleep();
 again:
-	cpu = get_cpu();
-	icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu);
+	icsbp = (xfs_icsb_cnts_t *)get_cpu_ptr(mp->m_sb_cnts);
 
 	/*
 	 * if the counter is disabled, go to slow path
@@ -2370,7 +2369,7 @@ again:
 	return 0;
 
 slow_path:
-	put_cpu();
+	put_cpu_ptr(mp->m_sb_cnts);
 
 	/*
 	 * serialise with a mutex so we don't burn lots of cpu on
@@ -2418,7 +2417,7 @@ slow_path:
 
 balance_counter:
 	xfs_icsb_unlock_cntr(icsbp);
-	put_cpu();
+	put_cpu_ptr(mp->m_sb_cnts);
 
 	/*
 	 * We may have multiple threads here if multiple per-cpu
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -290,11 +290,11 @@ extern int nf_conntrack_max;
 extern int nf_conntrack_max;
 
 #define NF_CT_STAT_INC(net, count)	\
-	(per_cpu_ptr((net)->ct.stat, raw_smp_processor_id())->count++)
+	(__get_cpu_ptr((net)->ct.stat)->count++)
 #define NF_CT_STAT_INC_ATOMIC(net, count)		\
 do {							\
 	local_bh_disable();				\
-	per_cpu_ptr((net)->ct.stat, raw_smp_processor_id())->count++;	\
+	__get_cpu_ptr((net)->ct.stat)->count++;		\
 	local_bh_enable();				\
 } while (0)
 
diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -39,7 +39,7 @@ nf_conntrack_event_cache(enum ip_conntra
 	struct nf_conntrack_ecache *ecache;
 
 	local_bh_disable();
-	ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id());
+	ecache = __get_cpu_ptr(net->ct.ecache);
 	if (ct != ecache->ct)
 		__nf_ct_event_cache_init(ct);
 	ecache->events |= event;
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -37,7 +37,7 @@ int thread_group_cputime_alloc(struct ta
 		return 0;
 	}
 	sig->cputime.totals = cputime;
-	cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id());
+	cputime = __get_cpu_ptr(sig->cputime.totals);
 	cputime->utime = tsk->utime;
 	cputime->stime = tsk->stime;
 	cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
diff --git a/kernel/sched.c b/kernel/sched.c
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -682,12 +682,11 @@ static inline void update_rq_clock(struc
  */
 int runqueue_is_locked(void)
 {
-	int cpu = get_cpu();
-	struct rq *rq = cpu_rq(cpu);
+	struct rq *rq = &get_cpu_var(runqueues);
 	int ret;
 
 	ret = spin_is_locked(&rq->lock);
-	put_cpu();
+	put_cpu_var(runqueues);
 	return ret;
 }
 
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -305,7 +305,7 @@ static inline void account_group_user_ti
 	if (sig->cputime.totals) {
 		struct task_cputime *times;
 
-		times = per_cpu_ptr(sig->cputime.totals, get_cpu());
+		times = get_cpu_ptr(sig->cputime.totals);
 		times->utime = cputime_add(times->utime, cputime);
 		put_cpu_no_resched();
 	}
@@ -364,7 +364,7 @@ static inline void account_group_exec_ru
 	if (sig->cputime.totals) {
 		struct task_cputime *times;
 
-		times = per_cpu_ptr(sig->cputime.totals, get_cpu());
+		times = get_cpu_ptr(sig->cputime.totals);
 		times->sum_exec_runtime += ns;
 		put_cpu_no_resched();
 	}
diff --git a/kernel/srcu.c b/kernel/srcu.c
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -113,7 +113,7 @@ int srcu_read_lock(struct srcu_struct *s
 	preempt_disable();
 	idx = sp->completed & 0x1;
 	barrier();  /* ensure compiler looks -once- at sp->completed. */
-	per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++;
+	__get_cpu_ptr(sp->per_cpu_ref)->c[idx]++;
 	srcu_barrier();  /* ensure compiler won't misorder critical section. */
 	preempt_enable();
 	return idx;
@@ -133,7 +133,7 @@ void srcu_read_unlock(struct srcu_struct
 {
 	preempt_disable();
 	srcu_barrier();  /* ensure compiler won't misorder critical section. */
-	per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
+	__get_cpu_ptr(sp->per_cpu_ref)->c[idx]--;
 	preempt_enable();
 }
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -739,12 +739,12 @@ int current_is_keventd(void)
 int current_is_keventd(void)
 {
 	struct cpu_workqueue_struct *cwq;
-	int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */
 	int ret = 0;
 
 	BUG_ON(!keventd_wq);
 
-	cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu);
+	/* preempt-safe: keventd is per-cpu */
+	cwq = __get_cpu_ptr(keventd_wq->cpu_wq);
 	if (current == cwq->thread)
 		ret = 1;
 
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -32,9 +32,8 @@ void __percpu_counter_add(struct percpu_
 {
 	s64 count;
 	s32 *pcount;
-	int cpu = get_cpu();
 
-	pcount = per_cpu_ptr(fbc->counters, cpu);
+	pcount = get_cpu_ptr(fbc->counters);
 	count = *pcount + amount;
 	if (count >= batch || count <= -batch) {
 		spin_lock(&fbc->lock);
@@ -44,7 +43,7 @@ void __percpu_counter_add(struct percpu_
 	} else {
 		*pcount = count;
 	}
-	put_cpu();
+	put_cpu_ptr(fbc->counters);
 }
 EXPORT_SYMBOL(__percpu_counter_add);
 
diff --git a/net/core/sock.c b/net/core/sock.c
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1946,8 +1946,7 @@ static DECLARE_BITMAP(proto_inuse_idx, P
 #ifdef CONFIG_NET_NS
 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
 {
-	int cpu = smp_processor_id();
-	per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
+	__get_cpu_ptr(net->core.inuse)->val[prot->inuse_idx] += val;
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
 
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -60,7 +60,7 @@ void nf_ct_deliver_cached_events(const s
 	struct nf_conntrack_ecache *ecache;
 
 	local_bh_disable();
-	ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id());
+	ecache = __get_cpu_ptr(net->ct.ecache);
 	if (ecache->ct == ct)
 		__nf_ct_deliver_cached_events(ecache);
 	local_bh_enable();
@@ -74,7 +74,7 @@ void __nf_ct_event_cache_init(struct nf_
 	struct nf_conntrack_ecache *ecache;
 
 	/* take care of delivering potentially old events */
-	ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id());
+	ecache = __get_cpu_ptr(net->ct.ecache);
 	BUG_ON(ecache->ct == ct);
 	if (ecache->ct)
 		__nf_ct_deliver_cached_events(ecache);
diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c
--- a/net/xfrm/xfrm_ipcomp.c
+++ b/net/xfrm/xfrm_ipcomp.c
@@ -45,9 +45,8 @@ static int ipcomp_decompress(struct xfrm
 	const int plen = skb->len;
 	int dlen = IPCOMP_SCRATCH_SIZE;
 	const u8 *start = skb->data;
-	const int cpu = get_cpu();
-	u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
-	struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu);
+	u8 *scratch = *get_cpu_ptr(ipcomp_scratches);
+	struct crypto_comp *tfm = *__get_cpu_ptr(ipcd->tfms);
 	int err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen);
 	int len;
 
@@ -101,7 +100,7 @@ static int ipcomp_decompress(struct xfrm
 	err = 0;
 
 out:
-	put_cpu();
+	put_cpu_ptr(ipcomp_scratches);
 	return err;
 }
 
@@ -139,9 +138,8 @@ static int ipcomp_compress(struct xfrm_s
 	const int plen = skb->len;
 	int dlen = IPCOMP_SCRATCH_SIZE;
 	u8 *start = skb->data;
-	const int cpu = get_cpu();
-	u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
-	struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu);
+	u8 *scratch = *get_cpu_ptr(ipcomp_scratches);
+	struct crypto_comp *tfm = *__get_cpu_ptr(ipcd->tfms);
 	int err;
 
 	local_bh_disable();
@@ -162,7 +160,7 @@ static int ipcomp_compress(struct xfrm_s
 	return 0;
 
 out:
-	put_cpu();
+	put_cpu_ptr(ipcomp_scratches);
 	return err;
 }
 
@@ -274,14 +272,12 @@ static struct crypto_comp **ipcomp_alloc
 	struct crypto_comp **tfms;
 	int cpu;
 
-	/* This can be any valid CPU ID so we don't need locking. */
-	cpu = raw_smp_processor_id();
-
 	list_for_each_entry(pos, &ipcomp_tfms_list, list) {
 		struct crypto_comp *tfm;
 
 		tfms = pos->tfms;
-		tfm = *per_cpu_ptr(tfms, cpu);
+		/* This can be any valid CPU ID so we don't need locking. */
+		tfm = *__raw_get_cpu_ptr(tfms);
 
 		if (!strcmp(crypto_comp_name(tfm), alg_name)) {
 			pos->users++;
