Commit f2530dc71cf0822f90bb63ea4600caaef33a66bb

Authored by Thomas Gleixner
1 parent cfb63bafdb

kthread: Prevent unpark race which puts threads on the wrong cpu

The smpboot threads rely on the park/unpark mechanism, which binds per
cpu threads on a particular core. However, the functionality is racy:

CPU0	       	 	CPU1  	     	    CPU2
unpark(T)				    wake_up_process(T)
  clear(SHOULD_PARK)	T runs
			leave parkme() due to !SHOULD_PARK
  bind_to(CPU2)		BUG_ON(wrong CPU)

We cannot let the tasks move themselves to the target CPU as one of
those tasks is actually the migration thread itself, which requires
that it starts running on the target cpu right away.

The solution to this problem is to prevent wakeups in park mode which
are not from unpark(). That way we can guarantee that the association
of the task to the target cpu is working correctly.

Add a new task state (TASK_PARKED) which prevents other wakeups and
use this state explicitly for the unpark wakeup.

Peter noticed: Also, since the task state is visible to userspace and
all the parked tasks are still in the PID space, its a good hint in ps
and friends that these tasks aren't really there for the moment.

The migration thread has another related issue.

CPU0	      	     	 CPU1
Bring up CPU2
create_thread(T)
park(T)
 wait_for_completion()
			 parkme()
			 complete()
sched_set_stop_task()
			 schedule(TASK_PARKED)

The sched_set_stop_task() call is issued while the task is on the
runqueue of CPU1 and that confuses the hell out of the stop_task class
on that cpu. So we need the same synchronization before
sched_set_stop_task().

Reported-by: Dave Jones <davej@redhat.com>
Reported-and-tested-by: Dave Hansen <dave@sr71.net>
Reported-and-tested-by: Borislav Petkov <bp@alien8.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Cc: dhillf@gmail.com
Cc: Ingo Molnar <mingo@kernel.org>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1304091635430.21884@ionos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Showing 5 changed files with 45 additions and 29 deletions Side-by-side Diff

... ... @@ -143,6 +143,7 @@
143 143 "x (dead)", /* 64 */
144 144 "K (wakekill)", /* 128 */
145 145 "W (waking)", /* 256 */
  146 + "P (parked)", /* 512 */
146 147 };
147 148  
148 149 static inline const char *get_task_state(struct task_struct *tsk)
include/linux/sched.h
... ... @@ -163,9 +163,10 @@
163 163 #define TASK_DEAD 64
164 164 #define TASK_WAKEKILL 128
165 165 #define TASK_WAKING 256
166   -#define TASK_STATE_MAX 512
  166 +#define TASK_PARKED 512
  167 +#define TASK_STATE_MAX 1024
167 168  
168   -#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW"
  169 +#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
169 170  
170 171 extern char ___assert_task_state[1 - 2*!!(
171 172 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
include/trace/events/sched.h
... ... @@ -147,7 +147,7 @@
147 147 __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
148 148 { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
149 149 { 16, "Z" }, { 32, "X" }, { 64, "x" },
150   - { 128, "W" }) : "R",
  150 + { 128, "K" }, { 256, "W" }, { 512, "P" }) : "R",
151 151 __entry->prev_state & TASK_STATE_MAX ? "+" : "",
152 152 __entry->next_comm, __entry->next_pid, __entry->next_prio)
153 153 );
... ... @@ -124,12 +124,12 @@
124 124  
125 125 static void __kthread_parkme(struct kthread *self)
126 126 {
127   - __set_current_state(TASK_INTERRUPTIBLE);
  127 + __set_current_state(TASK_PARKED);
128 128 while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
129 129 if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
130 130 complete(&self->parked);
131 131 schedule();
132   - __set_current_state(TASK_INTERRUPTIBLE);
  132 + __set_current_state(TASK_PARKED);
133 133 }
134 134 clear_bit(KTHREAD_IS_PARKED, &self->flags);
135 135 __set_current_state(TASK_RUNNING);
136 136  
... ... @@ -256,8 +256,13 @@
256 256 }
257 257 EXPORT_SYMBOL(kthread_create_on_node);
258 258  
259   -static void __kthread_bind(struct task_struct *p, unsigned int cpu)
  259 +static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
260 260 {
  261 + /* Must have done schedule() in kthread() before we set_task_cpu */
  262 + if (!wait_task_inactive(p, state)) {
  263 + WARN_ON(1);
  264 + return;
  265 + }
261 266 /* It's safe because the task is inactive. */
262 267 do_set_cpus_allowed(p, cpumask_of(cpu));
263 268 p->flags |= PF_THREAD_BOUND;
... ... @@ -274,12 +279,7 @@
274 279 */
275 280 void kthread_bind(struct task_struct *p, unsigned int cpu)
276 281 {
277   - /* Must have done schedule() in kthread() before we set_task_cpu */
278   - if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
279   - WARN_ON(1);
280   - return;
281   - }
282   - __kthread_bind(p, cpu);
  282 + __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
283 283 }
284 284 EXPORT_SYMBOL(kthread_bind);
285 285  
... ... @@ -324,6 +324,22 @@
324 324 return NULL;
325 325 }
326 326  
  327 +static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
  328 +{
  329 + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
  330 + /*
  331 + * We clear the IS_PARKED bit here as we don't wait
  332 + * until the task has left the park code. So if we'd
  333 + * park before that happens we'd see the IS_PARKED bit
  334 + * which might be about to be cleared.
  335 + */
  336 + if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
  337 + if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
  338 + __kthread_bind(k, kthread->cpu, TASK_PARKED);
  339 + wake_up_state(k, TASK_PARKED);
  340 + }
  341 +}
  342 +
327 343 /**
328 344 * kthread_unpark - unpark a thread created by kthread_create().
329 345 * @k: thread created by kthread_create().
... ... @@ -336,20 +352,8 @@
336 352 {
337 353 struct kthread *kthread = task_get_live_kthread(k);
338 354  
339   - if (kthread) {
340   - clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
341   - /*
342   - * We clear the IS_PARKED bit here as we don't wait
343   - * until the task has left the park code. So if we'd
344   - * park before that happens we'd see the IS_PARKED bit
345   - * which might be about to be cleared.
346   - */
347   - if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
348   - if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
349   - __kthread_bind(k, kthread->cpu);
350   - wake_up_process(k);
351   - }
352   - }
  355 + if (kthread)
  356 + __kthread_unpark(k, kthread);
353 357 put_task_struct(k);
354 358 }
355 359  
... ... @@ -407,7 +411,7 @@
407 411 trace_sched_kthread_stop(k);
408 412 if (kthread) {
409 413 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
410   - clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
  414 + __kthread_unpark(k, kthread);
411 415 wake_up_process(k);
412 416 wait_for_completion(&kthread->exited);
413 417 }
... ... @@ -185,8 +185,18 @@
185 185 }
186 186 get_task_struct(tsk);
187 187 *per_cpu_ptr(ht->store, cpu) = tsk;
188   - if (ht->create)
189   - ht->create(cpu);
  188 + if (ht->create) {
  189 + /*
  190 + * Make sure that the task has actually scheduled out
  191 + * into park position, before calling the create
  192 + * callback. At least the migration thread callback
  193 + * requires that the task is off the runqueue.
  194 + */
  195 + if (!wait_task_inactive(tsk, TASK_PARKED))
  196 + WARN_ON(1);
  197 + else
  198 + ht->create(cpu);
  199 + }
190 200 return 0;
191 201 }
192 202