Commit db7bccf45cb87522096b8f43144e31ca605a9f24

Authored by Tejun Heo
1 parent c8e55f3602

workqueue: reimplement CPU hotplugging support using trustee

Reimplement CPU hotplugging support using a trustee thread.  On CPU
down, a trustee thread is created; each step of CPU down is executed
by the trustee while workqueue_cpu_callback() simply drives and waits
for trustee state transitions.
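
To picture this "callback drives, trustee acts" handshake, here is a
minimal userspace sketch (not part of the patch) that uses pthreads in
place of kthreads; drive_cpu_down(), trustee_main() and the ST_* states
are illustrative names only:

  /* Userspace sketch of the drive-and-wait handshake (illustration only). */
  #include <pthread.h>
  #include <stdio.h>

  enum { ST_START, ST_IN_CHARGE, ST_DONE };

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t state_cv = PTHREAD_COND_INITIALIZER;
  static int trustee_state = ST_DONE;

  static void *trustee_main(void *arg)
  {
      (void)arg;
      pthread_mutex_lock(&lock);
      trustee_state = ST_IN_CHARGE;       /* trustee has taken over */
      pthread_cond_broadcast(&state_cv);
      /* ... drain pending works here ... */
      trustee_state = ST_DONE;            /* draining finished */
      pthread_cond_broadcast(&state_cv);
      pthread_mutex_unlock(&lock);
      return NULL;
  }

  static void drive_cpu_down(void)
  {
      pthread_t trustee;

      trustee_state = ST_START;
      pthread_create(&trustee, NULL, trustee_main, NULL);

      /* like wait_trustee_state(): wait for the trustee to report in */
      pthread_mutex_lock(&lock);
      while (trustee_state == ST_START)
          pthread_cond_wait(&state_cv, &lock);
      printf("trustee reached state %d\n", trustee_state);
      pthread_mutex_unlock(&lock);

      pthread_join(trustee, NULL);
  }

  int main(void)
  {
      drive_cpu_down();
      return 0;
  }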

CPU down operation no longer waits for works to be drained; instead,
the trustee sticks around until all pending works have been completed.
If the CPU is brought back up while works are still draining,
workqueue_cpu_callback() tells the trustee to step down and tells the
workers to rebind to the cpu.
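
The rebind step can be pictured with a self-contained userspace
analogue (Linux-specific, illustrative names only); the patch itself
calls set_cpus_allowed_ptr() on kernel worker tasks rather than using
pthread affinity:

  /* Userspace analogue of rebinding a worker to a cpu that came back online. */
  #define _GNU_SOURCE
  #include <pthread.h>
  #include <sched.h>
  #include <stdio.h>
  #include <string.h>

  static int rebind_to_cpu(pthread_t worker, int cpu)
  {
      cpu_set_t mask;

      CPU_ZERO(&mask);
      CPU_SET(cpu, &mask);
      /* the patch achieves the same effect with set_cpus_allowed_ptr() */
      return pthread_setaffinity_np(worker, sizeof(mask), &mask);
  }

  int main(void)
  {
      int err = rebind_to_cpu(pthread_self(), 0);

      if (err)
          fprintf(stderr, "rebind failed: %s\n", strerror(err));
      else
          printf("rebound to CPU 0\n");
      return 0;
  }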

As it's difficult to tell whether cwqs are empty while the gcwq is
freezing or frozen, the trustee doesn't consider draining to be
complete while a gcwq is freezing or frozen (tracked by the new
GCWQ_FREEZING flag).  Also, workers which get unbound from their cpu
are marked with WORKER_ROGUE.
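
The drain-completion rule above boils down to a small predicate.  The
sketch below is an illustration (fake_gcwq and gcwq_drained are made-up
names) mirroring the condition used by trustee_thread() in the diff:

  /* Sketch: draining counts as complete only when all workers are idle
   * and the gcwq is not freezing/frozen (frozen cwqs may still hold work). */
  #include <stdbool.h>
  #include <stdio.h>

  enum {
      FAKE_GCWQ_FREEZING = 1 << 3,    /* mirrors GCWQ_FREEZING */
      FAKE_WORKER_ROGUE  = 1 << 4,    /* mirrors WORKER_ROGUE (unbound worker) */
  };

  struct fake_gcwq {
      unsigned int flags;
      int nr_workers;
      int nr_idle;
  };

  static bool gcwq_drained(const struct fake_gcwq *gcwq)
  {
      return gcwq->nr_workers == gcwq->nr_idle &&
             !(gcwq->flags & FAKE_GCWQ_FREEZING);
  }

  int main(void)
  {
      struct fake_gcwq gcwq = {
          .flags = FAKE_GCWQ_FREEZING, .nr_workers = 4, .nr_idle = 4,
      };

      printf("drained while freezing: %s\n", gcwq_drained(&gcwq) ? "yes" : "no");
      gcwq.flags &= ~FAKE_GCWQ_FREEZING;
      printf("drained after thaw:     %s\n", gcwq_drained(&gcwq) ? "yes" : "no");
      return 0;
  }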

The trustee-based implementation doesn't bring any new feature at
this point, but it will be used to manage the worker pool when the
dynamic shared worker pool is implemented.

Signed-off-by: Tejun Heo <tj@kernel.org>

Showing 2 changed files with 279 additions and 16 deletions

... ... @@ -71,6 +71,8 @@
71 71 /* migration should happen before other stuff but after perf */
72 72 CPU_PRI_PERF = 20,
73 73 CPU_PRI_MIGRATION = 10,
  74 + /* prepare workqueues for other notifiers */
  75 + CPU_PRI_WORKQUEUE = 5,
74 76 };
75 77  
76 78 #ifdef CONFIG_SMP
... ... @@ -36,14 +36,27 @@
36 36 #include <linux/idr.h>
37 37  
38 38 enum {
  39 + /* global_cwq flags */
  40 + GCWQ_FREEZING = 1 << 3, /* freeze in progress */
  41 +
39 42 /* worker flags */
40 43 WORKER_STARTED = 1 << 0, /* started */
41 44 WORKER_DIE = 1 << 1, /* die die die */
42 45 WORKER_IDLE = 1 << 2, /* is idle */
  46 + WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
43 47  
  48 + /* gcwq->trustee_state */
  49 + TRUSTEE_START = 0, /* start */
  50 + TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
  51 + TRUSTEE_BUTCHER = 2, /* butcher workers */
  52 + TRUSTEE_RELEASE = 3, /* release workers */
  53 + TRUSTEE_DONE = 4, /* trustee is done */
  54 +
44 55 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
45 56 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
46 57 BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
  58 +
  59 + TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
47 60 };
48 61  
49 62 /*
... ... @@ -83,6 +96,7 @@
83 96 struct global_cwq {
84 97 spinlock_t lock; /* the gcwq lock */
85 98 unsigned int cpu; /* I: the associated cpu */
  99 + unsigned int flags; /* L: GCWQ_* flags */
86 100  
87 101 int nr_workers; /* L: total number of workers */
88 102 int nr_idle; /* L: currently idle ones */
... ... @@ -93,6 +107,10 @@
93 107 /* L: hash of busy workers */
94 108  
95 109 struct ida worker_ida; /* L: for worker IDs */
  110 +
  111 + struct task_struct *trustee; /* L: for gcwq shutdown */
  112 + unsigned int trustee_state; /* L: trustee state */
  113 + wait_queue_head_t trustee_wait; /* trustee wait */
96 114 } ____cacheline_aligned_in_smp;
97 115  
98 116 /*
... ... @@ -148,6 +166,10 @@
148 166 #endif
149 167 };
150 168  
  169 +#define for_each_busy_worker(worker, i, pos, gcwq) \
  170 + for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
  171 + hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
  172 +
151 173 #ifdef CONFIG_DEBUG_OBJECTS_WORK
152 174  
153 175 static struct debug_obj_descr work_debug_descr;
... ... @@ -546,6 +568,9 @@
546 568  
547 569 /* idle_list is LIFO */
548 570 list_add(&worker->entry, &gcwq->idle_list);
  571 +
  572 + if (unlikely(worker->flags & WORKER_ROGUE))
  573 + wake_up_all(&gcwq->trustee_wait);
549 574 }
550 575  
551 576 /**
... ... @@ -622,8 +647,15 @@
622 647 if (IS_ERR(worker->task))
623 648 goto fail;
624 649  
  650 + /*
  651 + * A rogue worker will become a regular one if CPU comes
  652 + * online later on. Make sure every worker has
  653 + * PF_THREAD_BOUND set.
  654 + */
625 655 if (bind)
626 656 kthread_bind(worker->task, gcwq->cpu);
  657 + else
  658 + worker->task->flags |= PF_THREAD_BOUND;
627 659  
628 660 return worker;
629 661 fail:
... ... @@ -882,10 +914,6 @@
882 914 struct cpu_workqueue_struct *cwq = worker->cwq;
883 915  
884 916 woke_up:
885   - if (unlikely(!cpumask_equal(&worker->task->cpus_allowed,
886   - get_cpu_mask(gcwq->cpu))))
887   - set_cpus_allowed_ptr(worker->task, get_cpu_mask(gcwq->cpu));
888   -
889 917 spin_lock_irq(&gcwq->lock);
890 918  
891 919 /* DIE can be set only while we're idle, checking here is enough */
... ... @@ -895,7 +923,7 @@
895 923 }
896 924  
897 925 worker_leave_idle(worker);
898   -
  926 +recheck:
899 927 /*
900 928 * ->scheduled list can only be filled while a worker is
901 929 * preparing to process a work or actually processing it.
... ... @@ -908,6 +936,22 @@
908 936 list_first_entry(&cwq->worklist,
909 937 struct work_struct, entry);
910 938  
  939 + /*
  940 + * The following is a rather inefficient way to close
  941 + * race window against cpu hotplug operations. Will
  942 + * be replaced soon.
  943 + */
  944 + if (unlikely(!(worker->flags & WORKER_ROGUE) &&
  945 + !cpumask_equal(&worker->task->cpus_allowed,
  946 + get_cpu_mask(gcwq->cpu)))) {
  947 + spin_unlock_irq(&gcwq->lock);
  948 + set_cpus_allowed_ptr(worker->task,
  949 + get_cpu_mask(gcwq->cpu));
  950 + cpu_relax();
  951 + spin_lock_irq(&gcwq->lock);
  952 + goto recheck;
  953 + }
  954 +
911 955 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
912 956 /* optimization path, not strictly necessary */
913 957 process_one_work(worker, work);
... ... @@ -1812,29 +1856,237 @@
1812 1856 }
1813 1857 EXPORT_SYMBOL_GPL(destroy_workqueue);
1814 1858  
  1859 +/*
  1860 + * CPU hotplug.
  1861 + *
  1862 + * CPU hotplug is implemented by allowing cwqs to be detached from
  1863 + * CPU, running with unbound workers and allowing them to be
  1864 + * reattached later if the cpu comes back online. A separate thread
  1865 + * is created to govern cwqs in such state and is called the trustee.
  1866 + *
  1867 + * Trustee states and their descriptions.
  1868 + *
  1869 + * START Command state used on startup. On CPU_DOWN_PREPARE, a
  1870 + * new trustee is started with this state.
  1871 + *
  1872 + * IN_CHARGE Once started, trustee will enter this state after
  1873 + * making all existing workers rogue. DOWN_PREPARE waits
  1874 + * for trustee to enter this state. After reaching
  1875 + * IN_CHARGE, trustee tries to execute the pending
  1876 + * worklist until it's empty and the state is set to
  1877 + * BUTCHER, or the state is set to RELEASE.
  1878 + *
  1879 + * BUTCHER Command state which is set by the cpu callback after
  1880 + * the cpu has went down. Once this state is set trustee
  1881 + * knows that there will be no new works on the worklist
  1882 + * and once the worklist is empty it can proceed to
  1883 + * killing idle workers.
  1884 + *
  1885 + * RELEASE Command state which is set by the cpu callback if the
  1886 + * cpu down has been canceled or it has come online
  1887 + * again. After recognizing this state, trustee stops
  1888 + * trying to drain or butcher and transits to DONE.
  1889 + *
  1890 + * DONE Trustee will enter this state after BUTCHER or RELEASE
  1891 + * is complete.
  1892 + *
  1893 + *          trustee                 CPU                draining
  1894 + *          took over               down               complete
  1895 + * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
  1896 + *                        |                        |                  ^
  1897 + *                        | CPU is back online     v   return workers |
  1898 + *                         ----------------> RELEASE --------------
  1899 + */
  1900 +
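The state diagram above can also be read as a transition predicate.
The following sketch is an illustration only, not part of the patch;
the TRUSTEE_* values are shortened to ST_* and trustee_transition_valid()
is a hypothetical helper:

  /* Sketch: legal trustee state transitions, one switch case per arrow. */
  #include <stdbool.h>

  enum trustee_state {
      ST_START, ST_IN_CHARGE, ST_BUTCHER, ST_RELEASE, ST_DONE,
  };

  static bool trustee_transition_valid(enum trustee_state from,
                                       enum trustee_state to)
  {
      switch (from) {
      case ST_START:     return to == ST_IN_CHARGE;
      case ST_IN_CHARGE: return to == ST_BUTCHER || to == ST_RELEASE;
      case ST_BUTCHER:   return to == ST_DONE || to == ST_RELEASE;
      case ST_RELEASE:   return to == ST_DONE;
      case ST_DONE:      return false;   /* a fresh trustee restarts at ST_START */
      }
      return false;
  }

  int main(void)
  {
      /* e.g. the DOWN_PREPARE -> POST_DEAD path: START -> IN_CHARGE -> BUTCHER */
      return trustee_transition_valid(ST_START, ST_IN_CHARGE) &&
             trustee_transition_valid(ST_IN_CHARGE, ST_BUTCHER) ? 0 : 1;
  }
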
  1901 +/**
  1902 + * trustee_wait_event_timeout - timed event wait for trustee
  1903 + * @cond: condition to wait for
  1904 + * @timeout: timeout in jiffies
  1905 + *
  1906 + * wait_event_timeout() for trustee to use. Handles locking and
  1907 + * checks for RELEASE request.
  1908 + *
  1909 + * CONTEXT:
  1910 + * spin_lock_irq(gcwq->lock) which may be released and regrabbed
  1911 + * multiple times. To be used by trustee.
  1912 + *
  1913 + * RETURNS:
  1914 + * Positive indicating left time if @cond is satisfied, 0 if timed
  1915 + * out, -1 if canceled.
  1916 + */
  1917 +#define trustee_wait_event_timeout(cond, timeout) ({ \
  1918 + long __ret = (timeout); \
  1919 + while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
  1920 + __ret) { \
  1921 + spin_unlock_irq(&gcwq->lock); \
  1922 + __wait_event_timeout(gcwq->trustee_wait, (cond) || \
  1923 + (gcwq->trustee_state == TRUSTEE_RELEASE), \
  1924 + __ret); \
  1925 + spin_lock_irq(&gcwq->lock); \
  1926 + } \
  1927 + gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
  1928 +})
  1929 +
  1930 +/**
  1931 + * trustee_wait_event - event wait for trustee
  1932 + * @cond: condition to wait for
  1933 + *
  1934 + * wait_event() for trustee to use. Automatically handles locking and
  1935 + * checks for RELEASE request.
  1936 + *
  1937 + * CONTEXT:
  1938 + * spin_lock_irq(gcwq->lock) which may be released and regrabbed
  1939 + * multiple times. To be used by trustee.
  1940 + *
  1941 + * RETURNS:
  1942 + * 0 if @cond is satisfied, -1 if canceled.
  1943 + */
  1944 +#define trustee_wait_event(cond) ({ \
  1945 + long __ret1; \
  1946 + __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
  1947 + __ret1 < 0 ? -1 : 0; \
  1948 +})
  1949 +
  1950 +static int __cpuinit trustee_thread(void *__gcwq)
  1951 +{
  1952 + struct global_cwq *gcwq = __gcwq;
  1953 + struct worker *worker;
  1954 + struct hlist_node *pos;
  1955 + int i;
  1956 +
  1957 + BUG_ON(gcwq->cpu != smp_processor_id());
  1958 +
  1959 + spin_lock_irq(&gcwq->lock);
  1960 + /*
  1961 + * Make all multithread workers rogue. Trustee must be bound
  1962 + * to the target cpu and can't be cancelled.
  1963 + */
  1964 + BUG_ON(gcwq->cpu != smp_processor_id());
  1965 +
  1966 + list_for_each_entry(worker, &gcwq->idle_list, entry)
  1967 + if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
  1968 + worker->flags |= WORKER_ROGUE;
  1969 +
  1970 + for_each_busy_worker(worker, i, pos, gcwq)
  1971 + if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
  1972 + worker->flags |= WORKER_ROGUE;
  1973 +
  1974 + /*
  1975 + * We're now in charge. Notify and proceed to drain. We need
  1976 + * to keep the gcwq running during the whole CPU down
  1977 + * procedure as other cpu hotunplug callbacks may need to
  1978 + * flush currently running tasks.
  1979 + */
  1980 + gcwq->trustee_state = TRUSTEE_IN_CHARGE;
  1981 + wake_up_all(&gcwq->trustee_wait);
  1982 +
  1983 + /*
  1984 + * The original cpu is in the process of dying and may go away
  1985 + * anytime now. When that happens, we and all workers would
  1986 + * be migrated to other cpus. Try draining any left work.
  1987 + * Note that if the gcwq is frozen, there may be frozen works
  1988 + * in freezeable cwqs. Don't declare completion while frozen.
  1989 + */
  1990 + while (gcwq->nr_workers != gcwq->nr_idle ||
  1991 + gcwq->flags & GCWQ_FREEZING ||
  1992 + gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
  1993 + /* give a breather */
  1994 + if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
  1995 + break;
  1996 + }
  1997 +
  1998 + /* notify completion */
  1999 + gcwq->trustee = NULL;
  2000 + gcwq->trustee_state = TRUSTEE_DONE;
  2001 + wake_up_all(&gcwq->trustee_wait);
  2002 + spin_unlock_irq(&gcwq->lock);
  2003 + return 0;
  2004 +}
  2005 +
  2006 +/**
  2007 + * wait_trustee_state - wait for trustee to enter the specified state
  2008 + * @gcwq: gcwq the trustee of interest belongs to
  2009 + * @state: target state to wait for
  2010 + *
  2011 + * Wait for the trustee to reach @state. DONE is already matched.
  2012 + *
  2013 + * CONTEXT:
  2014 + * spin_lock_irq(gcwq->lock) which may be released and regrabbed
  2015 + * multiple times. To be used by cpu_callback.
  2016 + */
  2017 +static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
  2018 +{
  2019 + if (!(gcwq->trustee_state == state ||
  2020 + gcwq->trustee_state == TRUSTEE_DONE)) {
  2021 + spin_unlock_irq(&gcwq->lock);
  2022 + __wait_event(gcwq->trustee_wait,
  2023 + gcwq->trustee_state == state ||
  2024 + gcwq->trustee_state == TRUSTEE_DONE);
  2025 + spin_lock_irq(&gcwq->lock);
  2026 + }
  2027 +}
  2028 +
1815 2029 static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
1816 2030 unsigned long action,
1817 2031 void *hcpu)
1818 2032 {
1819 2033 unsigned int cpu = (unsigned long)hcpu;
1820   - struct cpu_workqueue_struct *cwq;
1821   - struct workqueue_struct *wq;
  2034 + struct global_cwq *gcwq = get_gcwq(cpu);
  2035 + struct task_struct *new_trustee = NULL;
  2036 + struct worker *worker;
  2037 + struct hlist_node *pos;
  2038 + unsigned long flags;
  2039 + int i;
1822 2040  
1823 2041 action &= ~CPU_TASKS_FROZEN;
1824 2042  
1825   - list_for_each_entry(wq, &workqueues, list) {
1826   - if (wq->flags & WQ_SINGLE_THREAD)
1827   - continue;
  2043 + switch (action) {
  2044 + case CPU_DOWN_PREPARE:
  2045 + new_trustee = kthread_create(trustee_thread, gcwq,
  2046 + "workqueue_trustee/%d\n", cpu);
  2047 + if (IS_ERR(new_trustee))
  2048 + return notifier_from_errno(PTR_ERR(new_trustee));
  2049 + kthread_bind(new_trustee, cpu);
  2050 + }
1828 2051  
1829   - cwq = get_cwq(cpu, wq);
  2052 + /* some are called w/ irq disabled, don't disturb irq status */
  2053 + spin_lock_irqsave(&gcwq->lock, flags);
1830 2054  
1831   - switch (action) {
1832   - case CPU_POST_DEAD:
1833   - flush_workqueue(wq);
1834   - break;
  2055 + switch (action) {
  2056 + case CPU_DOWN_PREPARE:
  2057 + /* initialize trustee and tell it to acquire the gcwq */
  2058 + BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
  2059 + gcwq->trustee = new_trustee;
  2060 + gcwq->trustee_state = TRUSTEE_START;
  2061 + wake_up_process(gcwq->trustee);
  2062 + wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
  2063 + break;
  2064 +
  2065 + case CPU_POST_DEAD:
  2066 + gcwq->trustee_state = TRUSTEE_BUTCHER;
  2067 + break;
  2068 +
  2069 + case CPU_DOWN_FAILED:
  2070 + case CPU_ONLINE:
  2071 + if (gcwq->trustee_state != TRUSTEE_DONE) {
  2072 + gcwq->trustee_state = TRUSTEE_RELEASE;
  2073 + wake_up_process(gcwq->trustee);
  2074 + wait_trustee_state(gcwq, TRUSTEE_DONE);
1835 2075 }
  2076 +
  2077 + /* clear ROGUE from all multithread workers */
  2078 + list_for_each_entry(worker, &gcwq->idle_list, entry)
  2079 + if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
  2080 + worker->flags &= ~WORKER_ROGUE;
  2081 +
  2082 + for_each_busy_worker(worker, i, pos, gcwq)
  2083 + if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
  2084 + worker->flags &= ~WORKER_ROGUE;
  2085 + break;
1836 2086 }
1837 2087  
  2088 + spin_unlock_irqrestore(&gcwq->lock, flags);
  2089 +
1838 2090 return notifier_from_errno(0);
1839 2091 }
1840 2092  
... ... @@ -1912,6 +2164,9 @@
1912 2164  
1913 2165 spin_lock_irq(&gcwq->lock);
1914 2166  
  2167 + BUG_ON(gcwq->flags & GCWQ_FREEZING);
  2168 + gcwq->flags |= GCWQ_FREEZING;
  2169 +
1915 2170 list_for_each_entry(wq, &workqueues, list) {
1916 2171 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
1917 2172  
... ... @@ -1995,6 +2250,9 @@
1995 2250  
1996 2251 spin_lock_irq(&gcwq->lock);
1997 2252  
  2253 + BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
  2254 + gcwq->flags &= ~GCWQ_FREEZING;
  2255 +
1998 2256 list_for_each_entry(wq, &workqueues, list) {
1999 2257 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2000 2258  
... ... @@ -2026,7 +2284,7 @@
2026 2284 int i;
2027 2285  
2028 2286 singlethread_cpu = cpumask_first(cpu_possible_mask);
2029   - hotcpu_notifier(workqueue_cpu_callback, 0);
  2287 + hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
2030 2288  
2031 2289 /* initialize gcwqs */
2032 2290 for_each_possible_cpu(cpu) {
... ... @@ -2040,6 +2298,9 @@
2040 2298 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
2041 2299  
2042 2300 ida_init(&gcwq->worker_ida);
  2301 +
  2302 + gcwq->trustee_state = TRUSTEE_DONE;
  2303 + init_waitqueue_head(&gcwq->trustee_wait);
2043 2304 }
2044 2305  
2045 2306 keventd_wq = create_workqueue("events");