From: Rusty Russell <rusty@rustcorp.com.au>
Subject: cpumask: make task_struct.cpus_allowed a cpumask_var_t

This turns task_struct's cpus_allowed into a pointer for everyone.
There is no change for code already using the tsk_cpus_allowed()
accessor; I've converted some of the sched/ code to use it.  The
remaining users I just changed directly.
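
For illustration, a sketch of the calling convention (both fragments
are distilled from the diff below, not new API):

	/* Unchanged: tsk_cpus_allowed() already yields a pointer. */
	cpumask_copy(mask, tsk_cpus_allowed(p));

	/* Direct users: "&p->cpus_allowed" becomes "p->cpus_allowed",
	 * and saving the mask becomes a pointer dereference. */
	cpumask_t saved_mask = *current->cpus_allowed;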

For CONFIG_CPUMASK_OFFSTACK=y, we now allocate the mask off the end
of the task_struct; it would be better to avoid the indirection and
use a dangling bitmap, but I didn't want to alter the layout of
task_struct and risk breaking carefully balanced caches.
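
Concretely, the fork path sizes the slab object to include the bitmap
and aims the pointer just past the struct; the kernel/fork.c hunk
below is the authoritative version, this is just the shape of it:

	/* Slab object is a task_struct followed by the cpumask bits. */
	task_size += BITS_TO_LONGS(nr_cpu_ids) * sizeof(long);

	/* In dup_task_struct(): cpumask sits at end of task struct. */
	tsk->cpus_allowed = (void *)(tsk + 1);
	cpumask_copy(tsk->cpus_allowed, orig->cpus_allowed);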

Even better would be to point to the fixed "one cpu" and "all cpus"
masks where possible, and make a copy when setting it to something
else.  But you'd have to track down those naughty places which frob it
directly...
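
A minimal sketch of what such a setter could look like (everything
here is hypothetical: neither the helper names nor the shared masks
exist in this patch):

	/* Hypothetical copy-on-write setter; not part of this patch. */
	void tsk_set_cpus_allowed(struct task_struct *p,
				  const struct cpumask *new_mask)
	{
		/* "points at a shared constant mask" test: hypothetical */
		if (tsk_cpumask_is_shared(p->cpus_allowed))
			p->cpus_allowed = tsk_alloc_cpumask(p);
		cpumask_copy(p->cpus_allowed, new_mask);
	}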

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: anton@samba.org
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Mike Travis <travis@sgi.com>
---
 arch/arm/mach-integrator/cpu.c               |    4 ++--
 arch/ia64/kernel/cpufreq/acpi-cpufreq.c      |    4 ++--
 arch/ia64/kernel/mca.c                       |    2 +-
 arch/ia64/kernel/salinfo.c                   |    2 +-
 arch/ia64/kernel/topology.c                  |    2 +-
 arch/ia64/sn/kernel/sn2/sn_hwperf.c          |    2 +-
 arch/mips/kernel/cpufreq/loongson2_cpufreq.c |    2 +-
 arch/mips/kernel/traps.c                     |    8 ++++----
 arch/sh/kernel/cpufreq.c                     |    2 +-
 arch/x86/kernel/cpu/mcheck/mce_intel.c       |    2 +-
 drivers/acpi/processor_throttling.c          |    4 ++--
 drivers/firmware/dcdbas.c                    |    2 +-
 fs/proc/array.c                              |    4 ++--
 include/linux/init_task.h                    |    9 ++++++++-
 include/linux/sched.h                        |    4 ++--
 kernel/cpuset.c                              |    2 +-
 kernel/fork.c                                |   18 +++++++++++++++++-
 kernel/rcutree_plugin.h                      |    4 ++--
 kernel/sched/core.c                          |    6 +++---
 kernel/sched/cpupri.c                        |    4 ++--
 kernel/trace/trace_workqueue.c               |    6 +++---
 kernel/workqueue.c                           |    2 +-
 22 files changed, 59 insertions(+), 36 deletions(-)

diff --git a/arch/arm/mach-integrator/cpu.c b/arch/arm/mach-integrator/cpu.c
index fbb4577..1441731 100644
--- a/arch/arm/mach-integrator/cpu.c
+++ b/arch/arm/mach-integrator/cpu.c
@@ -92,7 +92,7 @@ static int integrator_set_target(struct cpufreq_policy *policy,
 	/*
 	 * Save this threads cpus_allowed mask.
 	 */
-	cpus_allowed = current->cpus_allowed;
+	cpus_allowed = *current->cpus_allowed;
 
 	/*
 	 * Bind to the specified CPU.  When this call returns,
@@ -163,7 +163,7 @@ static unsigned int integrator_get(unsigned int cpu)
 	u_int cm_osc;
 	struct icst_vco vco;
 
-	cpus_allowed = current->cpus_allowed;
+	cpus_allowed = *current->cpus_allowed;
 
 	set_cpus_allowed(current, cpumask_of_cpu(cpu));
 	BUG_ON(cpu != smp_processor_id());
diff --git a/arch/ia64/kernel/cpufreq/acpi-cpufreq.c b/arch/ia64/kernel/cpufreq/acpi-cpufreq.c
index f09b174..15fa5bc 100644
--- a/arch/ia64/kernel/cpufreq/acpi-cpufreq.c
+++ b/arch/ia64/kernel/cpufreq/acpi-cpufreq.c
@@ -110,7 +110,7 @@ processor_get_freq (
 
 	pr_debug("processor_get_freq\n");
 
-	saved_mask = current->cpus_allowed;
+	saved_mask = *current->cpus_allowed;
 	set_cpus_allowed_ptr(current, cpumask_of(cpu));
 	if (smp_processor_id() != cpu)
 		goto migrate_end;
@@ -148,7 +148,7 @@ processor_set_freq (
 
 	pr_debug("processor_set_freq\n");
 
-	saved_mask = current->cpus_allowed;
+	saved_mask = *current->cpus_allowed;
 	set_cpus_allowed_ptr(current, cpumask_of(cpu));
 	if (smp_processor_id() != cpu) {
 		retval = -EAGAIN;
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 65bf9cd..9eb3e40 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -1828,7 +1828,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset,
 	ti->cpu = cpu;
 	p->stack = ti;
 	p->state = TASK_UNINTERRUPTIBLE;
-	cpu_set(cpu, p->cpus_allowed);
+	cpumask_set_cpu(cpu, p->cpus_allowed);
 	INIT_LIST_HEAD(&p->tasks);
 	p->parent = p->real_parent = p->group_leader = p;
 	INIT_LIST_HEAD(&p->children);
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index 79802e5..0e50972 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -404,7 +404,7 @@ salinfo_log_release(struct inode *inode, struct file *file)
 static void
 call_on_cpu(int cpu, void (*fn)(void *), void *arg)
 {
-	cpumask_t save_cpus_allowed = current->cpus_allowed;
+	cpumask_t save_cpus_allowed = *current->cpus_allowed;
 	set_cpus_allowed_ptr(current, cpumask_of(cpu));
 	(*fn)(arg);
 	set_cpus_allowed_ptr(current, &save_cpus_allowed);
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index c64460b..d859ac7 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -362,7 +362,7 @@ static int __cpuinit cache_add_dev(struct device * sys_dev)
 	if (all_cpu_cache_info[cpu].kobj.parent)
 		return 0;
 
-	oldmask = current->cpus_allowed;
+	oldmask = *current->cpus_allowed;
 	retval = set_cpus_allowed_ptr(current, cpumask_of(cpu));
 	if (unlikely(retval))
 		return retval;
diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index 4554f68..4379cbe 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -632,7 +632,7 @@ static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info)
 		}
 		else {
 			/* migrate the task before calling SAL */ 
-			save_allowed = current->cpus_allowed;
+			save_allowed = *current->cpus_allowed;
 			set_cpus_allowed_ptr(current, cpumask_of(cpu));
 			sn_hwperf_call_sal(op_info);
 			set_cpus_allowed_ptr(current, &save_allowed);
diff --git a/arch/mips/kernel/cpufreq/loongson2_cpufreq.c b/arch/mips/kernel/cpufreq/loongson2_cpufreq.c
index ae5db20..c00990b 100644
--- a/arch/mips/kernel/cpufreq/loongson2_cpufreq.c
+++ b/arch/mips/kernel/cpufreq/loongson2_cpufreq.c
@@ -64,7 +64,7 @@ static int loongson2_cpufreq_target(struct cpufreq_policy *policy,
 	if (!cpu_online(cpu))
 		return -ENODEV;
 
-	cpus_allowed = current->cpus_allowed;
+	cpus_allowed = *current->cpus_allowed;
 	set_cpus_allowed_ptr(current, cpumask_of(cpu));
 
 	if (cpufreq_frequency_table_target
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index cfdaaa4..b85363d 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -928,13 +928,13 @@ static void mt_ase_fp_affinity(void)
 		 * restricted the allowed set to exclude any CPUs with FPUs,
 		 * we'll skip the procedure.
 		 */
-		if (cpus_intersects(current->cpus_allowed, mt_fpu_cpumask)) {
+		if (cpumask_intersects(current->cpus_allowed, &mt_fpu_cpumask)) {
 			cpumask_t tmask;
 
 			current->thread.user_cpus_allowed
-				= current->cpus_allowed;
-			cpus_and(tmask, current->cpus_allowed,
-				mt_fpu_cpumask);
+				= *current->cpus_allowed;
+			cpumask_and(&tmask, current->cpus_allowed,
+				    &mt_fpu_cpumask);
 			set_cpus_allowed_ptr(current, &tmask);
 			set_thread_flag(TIF_FPUBOUND);
 		}
diff --git a/arch/sh/kernel/cpufreq.c b/arch/sh/kernel/cpufreq.c
index e68b45b..1931eb2 100644
--- a/arch/sh/kernel/cpufreq.c
+++ b/arch/sh/kernel/cpufreq.c
@@ -54,7 +54,7 @@ static int sh_cpufreq_target(struct cpufreq_policy *policy,
 	if (!cpu_online(cpu))
 		return -ENODEV;
 
-	cpus_allowed = current->cpus_allowed;
+	cpus_allowed = *current->cpus_allowed;
 	set_cpus_allowed_ptr(current, cpumask_of(cpu));
 
 	BUG_ON(smp_processor_id() != cpu);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 38e49bc..fe0f725 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -177,7 +177,7 @@ void cmci_rediscover(int dying)
 		return;
 	if (!alloc_cpumask_var(&old, GFP_KERNEL))
 		return;
-	cpumask_copy(old, &current->cpus_allowed);
+	cpumask_copy(old, current->cpus_allowed);
 
 	for_each_online_cpu(cpu) {
 		if (cpu == dying)
diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c
index 1d02b7b..7bbfac9 100644
--- a/drivers/acpi/processor_throttling.c
+++ b/drivers/acpi/processor_throttling.c
@@ -909,7 +909,7 @@ static int acpi_processor_get_throttling(struct acpi_processor *pr)
 	/*
 	 * Migrate task to the cpu pointed by pr.
 	 */
-	cpumask_copy(saved_mask, &current->cpus_allowed);
+	cpumask_copy(saved_mask, current->cpus_allowed);
 	/* FIXME: use work_on_cpu() */
 	if (set_cpus_allowed_ptr(current, cpumask_of(pr->id))) {
 		/* Can't migrate to the target pr->id CPU. Exit */
@@ -1098,7 +1098,7 @@ int acpi_processor_set_throttling(struct acpi_processor *pr,
 		return -ENODEV;
 	}
 
-	cpumask_copy(saved_mask, &current->cpus_allowed);
+	cpumask_copy(saved_mask, current->cpus_allowed);
 	t_state.target_state = state;
 	p_throttling = &(pr->throttling);
 	cpumask_and(online_throttling_cpus, cpu_online_mask,
diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c
index ea5ac2d..e7a6529 100644
--- a/drivers/firmware/dcdbas.c
+++ b/drivers/firmware/dcdbas.c
@@ -258,7 +258,7 @@ int dcdbas_smi_request(struct smi_cmd *smi_cmd)
 	if (!alloc_cpumask_var(&old_mask, GFP_KERNEL))
 		return -ENOMEM;
 
-	cpumask_copy(old_mask, &current->cpus_allowed);
+	cpumask_copy(old_mask, current->cpus_allowed);
 	set_cpus_allowed_ptr(current, cpumask_of(0));
 	if (smp_processor_id() != 0) {
 		dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n",
diff --git a/fs/proc/array.c b/fs/proc/array.c
index f9bd395..6118f2b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -330,10 +330,10 @@ static inline void task_context_switch_counts(struct seq_file *m,
 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 {
 	seq_puts(m, "Cpus_allowed:\t");
-	seq_cpumask(m, &task->cpus_allowed);
+	seq_cpumask(m, task->cpus_allowed);
 	seq_putc(m, '\n');
 	seq_puts(m, "Cpus_allowed_list:\t");
-	seq_cpumask_list(m, &task->cpus_allowed);
+	seq_cpumask_list(m, task->cpus_allowed);
 	seq_putc(m, '\n');
 }
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e4baff5..0ade0f7 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -134,6 +134,13 @@ extern struct cred init_cred;
 
 #define INIT_TASK_COMM "swapper"
 
+#ifdef CONFIG_CPUMASK_OFFSTACK
+extern DECLARE_BITMAP(init_task_cpus_allowed, NR_CPUS);
+#define INIT_TASK_CPUS_ALLOWED (to_cpumask(init_task_cpus_allowed))
+#else
+#define INIT_TASK_CPUS_ALLOWED { { CPU_BITS_ALL } }
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -148,7 +155,7 @@ extern struct cred init_cred;
 	.static_prio	= MAX_PRIO-20,					\
 	.normal_prio	= MAX_PRIO-20,					\
 	.policy		= SCHED_NORMAL,					\
-	.cpus_allowed	= CPU_MASK_ALL,					\
+	.cpus_allowed	= INIT_TASK_CPUS_ALLOWED,			\
 	.mm		= NULL,						\
 	.active_mm	= &init_mm,					\
 	.se		= {						\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 714fa99..a29b71f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1299,7 +1299,7 @@ struct task_struct {
 #endif
 
 	unsigned int policy;
-	cpumask_t cpus_allowed;
+	cpumask_var_t cpus_allowed;
 
 #ifdef CONFIG_PREEMPT_RCU
 	int rcu_read_lock_nesting;
@@ -1624,7 +1624,7 @@ struct task_struct {
 };
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
-#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
+#define tsk_cpus_allowed(tsk) ((tsk)->cpus_allowed)
 
 /*
  * Priority of a process goes from 0..MAX_PRIO-1, valid RT
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 14f7070..c6c7b38 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -810,7 +810,7 @@ void rebuild_sched_domains(void)
 static int cpuset_test_cpumask(struct task_struct *tsk,
 			       struct cgroup_scanner *scan)
 {
-	return !cpumask_equal(&tsk->cpus_allowed,
+	return !cpumask_equal(tsk->cpus_allowed,
 			(cgroup_cs(scan->cg))->cpus_allowed);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 4347dcf..bea887f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -218,9 +218,15 @@ void __init fork_init(unsigned long mempages)
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
 #endif
+	unsigned int task_size = sizeof(struct task_struct);
+
+#ifdef CONFIG_CPUMASK_OFFSTACK
+	task_size += BITS_TO_LONGS(nr_cpu_ids) * sizeof(long);
+#endif /* CONFIG_CPUMASK_OFFSTACK */
+
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep =
-		kmem_cache_create("task_struct", sizeof(struct task_struct),
+		kmem_cache_create("task_struct", task_size,
 			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
 #endif
 
@@ -277,6 +283,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	if (err)
 		goto out;
 
+#ifdef CONFIG_CPUMASK_OFFSTACK
+	/* cpumask sits at end of task struct. */
+	tsk->cpus_allowed = (void *)(tsk + 1);
+	cpumask_copy(tsk->cpus_allowed, orig->cpus_allowed);
+#endif
+
 	tsk->stack = ti;
 
 	setup_thread_stack(tsk, orig);
@@ -1841,3 +1853,7 @@ int unshare_files(struct files_struct **displaced)
 	task_unlock(task);
 	return 0;
 }
+
+#ifdef CONFIG_CPUMASK_OFFSTACK
+DECLARE_BITMAP(init_task_cpus_allowed, NR_CPUS) = CPU_BITS_ALL;
+#endif
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c023464..83be5cf 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1591,7 +1591,7 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
 static int rcu_cpu_kthread_should_stop(int cpu)
 {
 	while (cpu_is_offline(cpu) ||
-	       !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
+	       !cpumask_equal(current->cpus_allowed, cpumask_of(cpu)) ||
 	       smp_processor_id() != cpu) {
 		if (kthread_should_stop())
 			return 1;
@@ -1599,7 +1599,7 @@ static int rcu_cpu_kthread_should_stop(int cpu)
 		per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
 		local_bh_enable();
 		schedule_timeout_uninterruptible(1);
-		if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
+		if (!cpumask_equal(current->cpus_allowed, cpumask_of(cpu)))
 			set_cpus_allowed_ptr(current, cpumask_of(cpu));
 		local_bh_disable();
 	}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7b77e3b..9bc231b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4466,7 +4466,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
 		goto out_unlock;
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+	cpumask_and(mask, tsk_cpus_allowed(p), cpu_online_mask);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 out_unlock:
@@ -4957,7 +4957,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 	if (p->sched_class && p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 
-	cpumask_copy(&p->cpus_allowed, new_mask);
+	cpumask_copy(tsk_cpus_allowed(p), new_mask);
 	p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
@@ -4993,7 +4993,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 
 	rq = task_rq_lock(p, &flags);
 
-	if (cpumask_equal(&p->cpus_allowed, new_mask))
+	if (cpumask_equal(tsk_cpus_allowed(p), new_mask))
 		goto out;
 
 	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index d72586f..7604d12 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -101,11 +101,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
 		if (skip)
 			continue;
 
-		if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
+		if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids)
 			continue;
 
 		if (lowest_mask) {
-			cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+			cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask);
 
 			/*
 			 * We have to ensure that we have at least one bit
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 209b379..2cb7257 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -53,7 +53,7 @@ probe_workqueue_insertion(void *ignore,
 			  struct task_struct *wq_thread,
 			  struct work_struct *work)
 {
-	int cpu = cpumask_first(&wq_thread->cpus_allowed);
+	int cpu = cpumask_first(wq_thread->cpus_allowed);
 	struct cpu_workqueue_stats *node;
 	unsigned long flags;
 
@@ -75,7 +75,7 @@ probe_workqueue_execution(void *ignore,
 			  struct task_struct *wq_thread,
 			  struct work_struct *work)
 {
-	int cpu = cpumask_first(&wq_thread->cpus_allowed);
+	int cpu = cpumask_first(wq_thread->cpus_allowed);
 	struct cpu_workqueue_stats *node;
 	unsigned long flags;
 
@@ -121,7 +121,7 @@ static void
 probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
 {
 	/* Workqueue only execute on one cpu */
-	int cpu = cpumask_first(&wq_thread->cpus_allowed);
+	int cpu = cpumask_first(wq_thread->cpus_allowed);
 	struct cpu_workqueue_stats *node, *next;
 	unsigned long flags;
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5abf42f..55d1d29 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1284,7 +1284,7 @@ __acquires(&gcwq->lock)
 		if (gcwq->flags & GCWQ_DISASSOCIATED)
 			return false;
 		if (task_cpu(task) == gcwq->cpu &&
-		    cpumask_equal(&current->cpus_allowed,
+		    cpumask_equal(current->cpus_allowed,
 				  get_cpu_mask(gcwq->cpu)))
 			return true;
 		spin_unlock_irq(&gcwq->lock);
