Untested: simplify stop_machine.

stop_machine creates a kthread which creates kernel threads.  We can
create those threads directly and simplify things a little.  Some care
must be taken with CPU hotunplug, which has special needs, but that code
seems more robust than it was in the past.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/stop_machine.h |   12 -
 kernel/cpu.c                 |   13 -
 kernel/stop_machine.c        |  343 +++++++++++++++----------------------------
 kernel/sysctl.c              |   12 -
 4 files changed, 130 insertions(+), 250 deletions(-)

diff -r 2215ff4e6a64 include/linux/stop_machine.h
--- a/include/linux/stop_machine.h	Tue May 06 12:56:27 2008 +1000
+++ b/include/linux/stop_machine.h	Tue May 06 17:10:53 2008 +1000
@@ -16,8 +16,7 @@
  * @data: the data ptr for the @fn()
  * @cpu: if @cpu == n, run @fn() on cpu n
  *       if @cpu == NR_CPUS, run @fn() on any cpu
- *       if @cpu == ALL_CPUS, run @fn() first on the calling cpu, and then
- *       concurrently on all the other cpus
+ *       if @cpu == ALL_CPUS, run @fn() on every online CPU.
  *
  * Description: This causes a thread to be scheduled on every other cpu,
  * each of which disables interrupts, and finally interrupts are disabled
@@ -38,13 +37,10 @@ int stop_machine_run_notype(int (*fn)(vo
  * @data: the data ptr for the @fn
  * @cpu: the cpu to run @fn on (or any, if @cpu == NR_CPUS.
  *
- * Description: This is a special version of the above, which returns the
- * thread which has run @fn(): kthread_stop will return the return value
- * of @fn().  Used by hotplug cpu.
+ * Description: This is a special version of the above, which assumes cpus
+ * won't come or go while it's being called.  Used by hotplug cpu.
  */
-struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
-				       unsigned int cpu);
-
+int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu);
 #else
 
 static inline int stop_machine_run_notype(int (*fn)(void *), void *data,
diff -r 2215ff4e6a64 kernel/cpu.c
--- a/kernel/cpu.c	Tue May 06 12:56:27 2008 +1000
+++ b/kernel/cpu.c	Tue May 06 17:10:53 2008 +1000
@@ -192,7 +192,6 @@ static int __ref _cpu_down(unsigned int 
 static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 {
 	int err, nr_calls = 0;
-	struct task_struct *p;
 	cpumask_t old_allowed, tmp;
 	void *hcpu = (void *)(long)cpu;
 	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
@@ -226,19 +225,15 @@ static int __ref _cpu_down(unsigned int 
 	cpu_clear(cpu, tmp);
 	set_cpus_allowed_ptr(current, &tmp);
 
-	p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
+	err = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
 
-	if (IS_ERR(p) || cpu_online(cpu)) {
+	if (err || cpu_online(cpu)) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
 					    hcpu) == NOTIFY_BAD)
 			BUG();
 
-		if (IS_ERR(p)) {
-			err = PTR_ERR(p);
-			goto out_allowed;
-		}
-		goto out_thread;
+		goto out_allowed;
 	}
 
 	/* Wait for it to sleep (leaving idle task). */
@@ -255,8 +250,6 @@ static int __ref _cpu_down(unsigned int 
 
 	check_for_tasks(cpu);
 
-out_thread:
-	err = kthread_stop(p);
 out_allowed:
 	set_cpus_allowed_ptr(current, &old_allowed);
 out_release:
diff -r 2215ff4e6a64 kernel/stop_machine.c
--- a/kernel/stop_machine.c	Tue May 06 12:56:27 2008 +1000
+++ b/kernel/stop_machine.c	Tue May 06 17:10:53 2008 +1000
@@ -13,265 +13,168 @@
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
 
-/* Since we effect priority and affinity (both of which are visible
- * to, and settable by outside processes) we do indirection via a
- * kthread. */
-
-/* Thread to stop each CPU in user context. */
+/* This controls the threads on each CPU. */
 enum stopmachine_state {
-	STOPMACHINE_WAIT,
-	STOPMACHINE_DEPLOY,
-	STOPMACHINE_PREPARE,
+	/* Dummy starting state for thread. */
+	STOPMACHINE_NONE,
+	/* Disable interrupts. */
 	STOPMACHINE_DISABLE_IRQ,
+	/* Run the function */
 	STOPMACHINE_RUN,
+	/* Exit */
 	STOPMACHINE_EXIT,
+	/* Everyone exited. */
+	STOPMACHINE_COMPLETE,
 };
+static enum stopmachine_state state;
 
 struct stop_machine_data {
 	int (*fn)(void *);
 	void *data;
-	struct completion done;
-	int run_all;
-} smdata;
+	int fnret;
+};
 
-static enum stopmachine_state stopmachine_state;
-static unsigned int stopmachine_num_threads;
-static atomic_t stopmachine_thread_ack;
-static atomic_t stopmachine_busy_exit;
+static unsigned int num_threads;
+static atomic_t thread_ack;
+static struct completion finished;
 
-unsigned long stopmachine_timeout = 5; /* secs, arbitrary */
+static void set_state(enum stopmachine_state newstate)
+{
+	/* Reset ack counter. */
+	atomic_set(&thread_ack, num_threads);
+	smp_wmb();
+	state = newstate;
+}
 
-static int stopmachine(void *cpu)
+/* Last one to ack a state moves to the next state. */
+static void ack_state(void)
 {
-	int irqs_disabled = 0;
-	int prepared = 0;
-	int ran = 0;
+	if (atomic_dec_and_test(&thread_ack)) {
+		set_state(state + 1);
+		if (state == STOPMACHINE_COMPLETE)
+			complete(&finished);
+	}
+}
 
-	/* Wait sisters */
-	while (stopmachine_state == STOPMACHINE_WAIT)
-		yield();
-	/* short path for cancel */
-	if (stopmachine_state == STOPMACHINE_EXIT)
-		goto exit;
-
-	/* If target cpu is on fire, this call can stuck */
-	set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu));
-
-	/* Ack: we arrived */
-	smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
-	atomic_inc(&stopmachine_thread_ack);
+/* This is the actual thread which stops the CPU.  It exits by itself rather
+ * than waiting for kthread_stop(), because it's easier for hotplug CPU. */
+static int stop_cpu(struct stop_machine_data *smdata)
+{
+	enum stopmachine_state curstate = STOPMACHINE_NONE;
+	int uninitialized_var(ret);
 
 	/* Simple state machine */
-	while (stopmachine_state != STOPMACHINE_EXIT) {
-		if (stopmachine_state == STOPMACHINE_DISABLE_IRQ 
-		    && !irqs_disabled) {
-			local_irq_disable();
-			hard_irq_disable();
-			irqs_disabled = 1;
-			/* Ack: irqs disabled. */
-			smp_mb(); /* Must read state first. */
-			atomic_inc(&stopmachine_thread_ack);
-		} else if (stopmachine_state == STOPMACHINE_PREPARE
-			   && !prepared) {
-			/* Everyone is in place, hold CPU. */
-			preempt_disable();
-			prepared = 1;
-			smp_mb(); /* Must read state first. */
-			atomic_inc(&stopmachine_thread_ack);
-		} else if (stopmachine_state == STOPMACHINE_RUN && !ran) {
-			smdata.fn(smdata.data);
-			ran = 1;
-			smp_mb(); /* Must read state first. */
-			atomic_inc(&stopmachine_thread_ack);
+	do {
+		/* Chill out and ensure we re-read the shared state. */
+		cpu_relax();
+		if (state != curstate) {
+			curstate = state;
+			switch (curstate) {
+			case STOPMACHINE_DISABLE_IRQ:
+				local_irq_disable();
+				hard_irq_disable();
+				break;
+			case STOPMACHINE_RUN:
+				/* |= allows error detection when fn runs
+				 * on multiple CPUs. */
+				smdata->fnret |= smdata->fn(smdata->data);
+				break;
+			default:
+				break;
+			}
+			ack_state();
 		}
-		/* Yield in first stage: migration threads need to
-		 * help our sisters onto their CPUs. */
-		if (!prepared && !irqs_disabled)
-			yield();
-		else
-			cpu_relax();
-	}
-exit:
-	/* Ack: we are exiting. */
-	smp_mb(); /* Must read state first. */
-	atomic_inc(&stopmachine_thread_ack);
+	} while (curstate < STOPMACHINE_EXIT);
 
-	if (irqs_disabled)
-		local_irq_enable();
-	if (prepared)
-		preempt_enable();
+	local_irq_enable();
+	do_exit(0);
+}
 
-	if (atomic_read(&stopmachine_busy_exit)) {
-		atomic_dec(&stopmachine_busy_exit);
-		printk(KERN_INFO "stopmachine: cpu#%d is not busy now.\n",
-			(int)(long)cpu);
-	}
-
+/* Callback for CPUs which aren't supposed to do anything. */
+static int chill(void *unused)
+{
 	return 0;
 }
 
-/* Change the thread state */
-static void stopmachine_set_state(enum stopmachine_state state)
+int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
 {
-	atomic_set(&stopmachine_thread_ack, 0);
-	smp_wmb();
-	stopmachine_state = state;
-	while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
-		cpu_relax();
-}
+	int i, err;
+	struct stop_machine_data active, idle;
+	struct task_struct **threads;
 
-static int stop_machine(void)
-{
-	int i, ret = 0;
-	unsigned long limit;
+	active.fn = fn;
+	active.data = data;
+	active.fnret = 0;
+	idle.fn = chill;
+	idle.data = NULL;
 
-	if (atomic_read(&stopmachine_busy_exit)) {
-		/*
-		 * previous try was timeout, and still there is a unreachable
-		 * cpu and abandoned child.
-		 */
-		return -EBUSY;
+	/* If they don't care which cpu fn runs on, just pick one. */
+	if (cpu == NR_CPUS)
+		cpu = any_online_cpu(cpu_online_map);
+
+	/* This could be too big for stack on large machines. */
+	threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL);
+	if (!threads)
+		return -ENOMEM;
+
+	/* Set up initial state. */
+	init_completion(&finished);
+	set_state(STOPMACHINE_DISABLE_IRQ);
+
+	for_each_online_cpu(i) {
+		struct stop_machine_data *smdata;
+		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+
+		if (cpu == ALL_CPUS || i == cpu)
+			smdata = &active;
+		else
+			smdata = &idle;
+
+		threads[i] = kthread_create(stop_cpu, smdata, "kstop%u", i);
+		if (IS_ERR(threads[i])) {
+			err = PTR_ERR(threads[i]);
+			threads[i] = NULL;
+			goto kill_threads;
+		}
+
+		/* Place it onto correct cpu. */
+		kthread_bind(threads[i], i);
+
+		/* Make it highest prio. */
+		if (sched_setscheduler(threads[i], SCHED_FIFO, &param) != 0)
+			BUG();
+
+		num_threads++;
 	}
 
-	atomic_set(&stopmachine_thread_ack, 0);
-	stopmachine_num_threads = 0;
-	stopmachine_state = STOPMACHINE_WAIT;
+	/* We've created all the threads.  Wake them all. */
+	cpu = get_cpu();
+	for_each_online_cpu(i)
+		wake_up_process(threads[i]);
 
-	for_each_online_cpu(i) {
-		if (i == raw_smp_processor_id())
-			continue;
-		ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
-		if (ret < 0)
-			goto exit_threads;
-		stopmachine_num_threads++;
-	}
+	/* This will release the thread on our CPU. */
+	put_cpu();
+	wait_for_completion(&finished);
 
-	limit = jiffies + msecs_to_jiffies(stopmachine_timeout * MSEC_PER_SEC);
+	kfree(threads);
 
-	/* Wait for them all to come to life on the target. */
-	stopmachine_state = STOPMACHINE_DEPLOY;
-	while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
-		if (time_is_after_jiffies(limit))
-			yield();
-		else
-			goto deploy_timeout;
+	return active.fnret;
 
-	/* Now they are all started, make them hold the CPUs, ready. */
-	preempt_disable();
-	stopmachine_set_state(STOPMACHINE_PREPARE);
-
-	/* Make them disable irqs. */
-	local_irq_disable();
-	hard_irq_disable();
-	stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
-
-	return 0;
-
-deploy_timeout:
-	printk(KERN_CRIT "stopmachine: Failed to stop machine in time(%lds). "
-		"Are there any CPUs on file?\n", stopmachine_timeout);
-
-	/* defer exit check to the beginning of next try. */
-	atomic_set(&stopmachine_busy_exit, stopmachine_num_threads);
-
-	printk(KERN_INFO "stopmachine: cpu#%d is initiator of failed stop.\n",
-			raw_smp_processor_id());
-	smp_wmb();
-	stopmachine_state = STOPMACHINE_EXIT;
-
-	return -EBUSY;
-
-exit_threads:
-	/* Wait for them all to exit, since stop is canceled */
-	stopmachine_set_state(STOPMACHINE_EXIT);
-
-	return ret;
-}
-
-static void restart_machine(void)
-{
-	stopmachine_set_state(STOPMACHINE_EXIT);
-	local_irq_enable();
-	preempt_enable_no_resched();
-}
-
-static void run_other_cpus(void)
-{
-	stopmachine_set_state(STOPMACHINE_RUN);
-}
-
-static int do_stop(void *_smdata)
-{
-	struct stop_machine_data *smdata = _smdata;
-	int ret;
-
-	ret = stop_machine();
-	if (ret == 0) {
-		ret = smdata->fn(smdata->data);
-		if (smdata->run_all)
-			run_other_cpus();
-		restart_machine();
-	}
-
-	/* We're done: you can kthread_stop us now */
-	complete(&smdata->done);
-
-	/* Wait for kthread_stop */
-	set_current_state(TASK_INTERRUPTIBLE);
-	while (!kthread_should_stop()) {
-		schedule();
-		set_current_state(TASK_INTERRUPTIBLE);
-	}
-	__set_current_state(TASK_RUNNING);
-	return ret;
-}
-
-struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
-				       unsigned int cpu)
-{
-	static DEFINE_MUTEX(stopmachine_mutex);
-	struct stop_machine_data smdata;
-	struct task_struct *p;
-
-	mutex_lock(&stopmachine_mutex);
-
-	smdata.fn = fn;
-	smdata.data = data;
-	smdata.run_all = (cpu == ALL_CPUS) ? 1 : 0;
-	init_completion(&smdata.done);
-
-	smp_wmb(); /* make sure other cpus see smdata updates */
-
-	/* If they don't care which CPU fn runs on, bind to any online one. */
-	if (cpu == NR_CPUS || cpu == ALL_CPUS)
-		cpu = raw_smp_processor_id();
-
-	p = kthread_create(do_stop, &smdata, "kstopmachine");
-	if (!IS_ERR(p)) {
-		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
-
-		/* One high-prio thread per cpu.  We'll do this one. */
-		sched_setscheduler(p, SCHED_FIFO, &param);
-		kthread_bind(p, cpu);
-		wake_up_process(p);
-		wait_for_completion(&smdata.done);
-	}
-	mutex_unlock(&stopmachine_mutex);
-	return p;
+kill_threads:
+	for_each_online_cpu(i)
+		if (threads[i])
+			kthread_stop(threads[i]);
+	kfree(threads);
+	return err;
 }
 
 int stop_machine_run_notype(int (*fn)(void *), void *data, unsigned int cpu)
 {
-	struct task_struct *p;
 	int ret;
 
 	/* No CPUs can come up or down during this. */
 	get_online_cpus();
-	p = __stop_machine_run(fn, data, cpu);
-	if (!IS_ERR(p))
-		ret = kthread_stop(p);
-	else
-		ret = PTR_ERR(p);
+	ret = __stop_machine_run(fn, data, cpu);
 	put_online_cpus();
 
 	return ret;
diff -r 2215ff4e6a64 kernel/sysctl.c
--- a/kernel/sysctl.c	Tue May 06 12:56:27 2008 +1000
+++ b/kernel/sysctl.c	Tue May 06 17:10:53 2008 +1000
@@ -81,7 +81,6 @@ extern int maps_protect;
 extern int maps_protect;
 extern int sysctl_stat_interval;
 extern int latencytop_enabled;
-extern unsigned long stopmachine_timeout;
 
 /* Constants used for minimum and  maximum */
 #if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM)
@@ -805,17 +804,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dostring,
 		.strategy	= &sysctl_string,
 	},
-#ifdef CONFIG_STOP_MACHINE
-	{
-		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "stopmachine_timeout",
-		.data		= &stopmachine_timeout,
-		.maxlen		= sizeof(unsigned long),
-		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
-		.strategy	= &sysctl_intvec,
-	},
-#endif
 #ifdef CONFIG_KEYS
 	{
 		.ctl_name	= CTL_UNNUMBERED,
