Commit bf52fa4a26567bfbf5b1d30f84cf0226e61d26cd

Authored by Doug Thompson
Committed by Linus Torvalds
1 parent fb3fb20687

drivers/edac: fix workq reset deadlock

Fix a mutex deadlock on the device controller linked list: the code took
the lock and then called a function that could acquire the same lock.  Moved
the workq cancel call outside the lock.
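
For illustration only (not part of the patch), a condensed sketch of the bug
and the fix, using hypothetical names.  The old reset path held the list
mutex while tearing down a work item whose handler takes that same mutex, so
the flush could wait forever on a worker that was itself waiting on the lock:

        /* before (deadlock-prone): teardown can flush a running worker
         * that is blocked trying to take ctls_mutex, which we hold */
        void reset_delay(struct ctl *dev, unsigned long msec)
        {
                mutex_lock(&ctls_mutex);
                workq_teardown(dev);
                workq_setup(dev, msec);
                mutex_unlock(&ctls_mutex);
        }

        /* after: cancel/flush without the lock, then lock for setup only */
        void reset_delay(struct ctl *dev, unsigned long msec)
        {
                workq_teardown(dev);
                mutex_lock(&ctls_mutex);
                workq_setup(dev, msec);
                mutex_unlock(&ctls_mutex);
        }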

Added short-circuit logic to the workq code

Added descriptive comments

Code tidying

Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Cc: Greg KH <greg@kroah.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 100 additions and 28 deletions

drivers/edac/edac_device.c
... ... @@ -32,7 +32,9 @@
32 32 #include "edac_core.h"
33 33 #include "edac_module.h"
34 34  
35   -/* lock to memory controller's control array 'edac_device_list' */
  35 +/* lock for the list 'edac_device_list': any manipulation of this
  36 + * list must be done while holding the 'device_ctls_mutex' lock
  37 + */
36 38 static DEFINE_MUTEX(device_ctls_mutex);
37 39 static struct list_head edac_device_list = LIST_HEAD_INIT(edac_device_list);
38 40  
... ... @@ -386,6 +388,14 @@
386 388 /*
387 389 * edac_device_workq_function
388 390 * performs the operation scheduled by a workq request
  391 + *
  392 + * this workq is embedded within an edac_device_ctl_info
  393 + * structure that needs to be polled for possible error events.
  394 + *
  395 + * This operation is to acquire the list mutex lock
  396 + * (thus preventing insertion or deletion)
  397 + * and then call the device's poll function IFF this device is
  398 + * running polled and there is a poll function defined.
389 399 */
390 400 static void edac_device_workq_function(struct work_struct *work_req)
391 401 {
... ... @@ -403,8 +413,17 @@
403 413  
404 414 mutex_unlock(&device_ctls_mutex);
405 415  
406   - /* Reschedule */
407   - queue_delayed_work(edac_workqueue, &edac_dev->work, edac_dev->delay);
  416 + /* Reschedule the workq for the next time period to start again.
  417 + * If the period is one second, adjust to the next whole second
  418 + * to keep timers from firing at scattered points across the
  419 + * period between integral seconds.
  420 + */
  421 + if (edac_dev->poll_msec == 1000)
  422 + queue_delayed_work(edac_workqueue, &edac_dev->work,
  423 + round_jiffies(edac_dev->delay));
  424 + else
  425 + queue_delayed_work(edac_workqueue, &edac_dev->work,
  426 + edac_dev->delay);
408 427 }
409 428  
410 429 /*
411 430  
412 431  
... ... @@ -417,11 +436,26 @@
417 436 {
418 437 debugf0("%s()\n", __func__);
419 438  
  439 + /* take the arg 'msec' and store it in the control structure
  440 + * to be used in the time period calculation,
  441 + * then calc the number of jiffies that period represents
  442 + */
420 443 edac_dev->poll_msec = msec;
421   - edac_dev->delay = msecs_to_jiffies(msec); /* Calc delay jiffies */
  444 + edac_dev->delay = msecs_to_jiffies(msec);
422 445  
423 446 INIT_DELAYED_WORK(&edac_dev->work, edac_device_workq_function);
424   - queue_delayed_work(edac_workqueue, &edac_dev->work, edac_dev->delay);
  447 +
  448 + /* optimize here for the 1 second case, which will be the normal
  449 + * value, to fire ON the 1 second time event. This helps reduce
  450 + * all sorts of timers firing on a sub-second basis, while they
  451 + * are happy to fire together exactly on the 1 second boundary
  452 + */
  453 + if (edac_dev->poll_msec == 1000)
  454 + queue_delayed_work(edac_workqueue, &edac_dev->work,
  455 + round_jiffies(edac_dev->delay));
  456 + else
  457 + queue_delayed_work(edac_workqueue, &edac_dev->work,
  458 + edac_dev->delay);
425 459 }
426 460  
427 461 /*
428 462  
429 463  
430 464  
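
round_jiffies() (declared in <linux/timer.h>) rounds a jiffies value to a
whole-second boundary so timers from many devices can fire together.  A
minimal sketch of the rescheduling logic above, with hypothetical names; note
that round_jiffies() treats its argument as an absolute jiffies value, so for
a relative delay like this one round_jiffies_relative() is arguably the
closer fit, but the sketch follows the patch:

        static void requeue_poll(struct my_dev *dev)
        {
                unsigned long delay = dev->delay;     /* period in jiffies */

                /* for the common 1000 msec period, land on a whole second
                 * so many devices' timers coalesce instead of scattering
                 * wakeups across each second */
                if (dev->poll_msec == 1000)
                        delay = round_jiffies(delay);

                queue_delayed_work(my_wq, &dev->work, delay);
        }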
... ... @@ -441,15 +475,19 @@
441 475  
442 476 /*
443 477 * edac_device_reset_delay_period
  478 + *
  479 + * need to stop any outstanding workq entries at this time
  480 + * because we will be resetting the sleep time.
  481 + * Then restart the workq with the new delay
444 482 */
445   -
446 483 void edac_device_reset_delay_period(struct edac_device_ctl_info *edac_dev,
447 484 unsigned long value)
448 485 {
449   - mutex_lock(&device_ctls_mutex);
450   -
451   - /* cancel the current workq request */
  486 + /* cancel the current workq request, without the mutex lock */
452 487 edac_device_workq_teardown(edac_dev);
  488 +
  489 + /* acquire the mutex before doing the workq setup */
  490 + mutex_lock(&device_ctls_mutex);
453 491  
454 492 /* restart the workq request, with new delay value */
455 493 edac_device_workq_setup(edac_dev, value);
drivers/edac/edac_mc.c
... ... @@ -258,6 +258,12 @@
258 258  
259 259 mutex_lock(&mem_ctls_mutex);
260 260  
  261 + /* if this control struct has moved to the offline state, we are done */
  262 + if (mci->op_state == OP_OFFLINE) {
  263 + mutex_unlock(&mem_ctls_mutex);
  264 + return;
  265 + }
  266 +
261 267 /* Only poll controllers that are running polled and have a check */
262 268 if (edac_mc_assert_error_check_and_clear() && (mci->edac_check != NULL))
263 269 mci->edac_check(mci);
264 270  
265 271  
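
The new guard is the usual "recheck state under the lock" idiom: the removal
path sets OP_OFFLINE while holding mem_ctls_mutex, so any worker that takes
the mutex afterwards sees the flag and returns without rescheduling.  A
condensed sketch with hypothetical names (to_delayed_work() is a later
kernel helper, used here only to keep the sketch short):

        static void poll_worker(struct work_struct *w)
        {
                struct my_mci *mci =
                        container_of(to_delayed_work(w), struct my_mci, work);

                mutex_lock(&ctls_mutex);
                if (mci->op_state == OP_OFFLINE) {
                        /* removal marked us offline under this same lock;
                         * drop it and, crucially, do not reschedule */
                        mutex_unlock(&ctls_mutex);
                        return;
                }
                /* ... poll the hardware ... */
                mutex_unlock(&ctls_mutex);
                /* ... reschedule the delayed work ... */
        }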
... ... @@ -279,11 +285,19 @@
279 285 * edac_mc_workq_setup
280 286 * initialize a workq item for this mci
281 287 * passing in the new delay period in msec
  288 + *
  289 + * locking model:
  290 + *
  291 + * called with the mem_ctls_mutex held
282 292 */
283   -void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec)
  293 +static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec)
284 294 {
285 295 debugf0("%s()\n", __func__);
286 296  
  297 + /* if this instance is not in the POLL state, then simply return */
  298 + if (mci->op_state != OP_RUNNING_POLL)
  299 + return;
  300 +
287 301 INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function);
288 302 queue_delayed_work(edac_workqueue, &mci->work, msecs_to_jiffies(msec));
289 303 }
290 304  
291 305  
292 306  
293 307  
294 308  
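
When a function's contract is "called with the lock held", later kernels can
enforce the documented locking model at runtime via lockdep.  A hedged
sketch with hypothetical names (lockdep_assert_held() postdates this commit):

        static void workq_setup(struct my_mci *mci, unsigned msec)
        {
                lockdep_assert_held(&ctls_mutex);  /* enforce the contract */

                /* short circuit: only poll-mode instances get a work item */
                if (mci->op_state != OP_RUNNING_POLL)
                        return;

                INIT_DELAYED_WORK(&mci->work, poll_worker);
                queue_delayed_work(my_wq, &mci->work, msecs_to_jiffies(msec));
        }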
... ... @@ -291,29 +305,39 @@
291 305 /*
292 306 * edac_mc_workq_teardown
293 307 * stop the workq processing on this mci
  308 + *
  309 + * locking model:
  310 + *
  311 + * called WITHOUT lock held
294 312 */
295   -void edac_mc_workq_teardown(struct mem_ctl_info *mci)
  313 +static void edac_mc_workq_teardown(struct mem_ctl_info *mci)
296 314 {
297 315 int status;
298 316  
299   - status = cancel_delayed_work(&mci->work);
300   - if (status == 0) {
301   - /* workq instance might be running, wait for it */
302   - flush_workqueue(edac_workqueue);
  317 + /* if not running POLL, leave now */
  318 + if (mci->op_state == OP_RUNNING_POLL) {
  319 + status = cancel_delayed_work(&mci->work);
  320 + if (status == 0) {
  321 + debugf0("%s() not canceled, flush the queue\n",
  322 + __func__);
  323 +
  324 + /* workq instance might be running, wait for it */
  325 + flush_workqueue(edac_workqueue);
  326 + }
303 327 }
304 328 }
305 329  
306 330 /*
307 331 * edac_reset_delay_period
308 332 */
309   -
310   -void edac_reset_delay_period(struct mem_ctl_info *mci, unsigned long value)
  333 +static void edac_reset_delay_period(struct mem_ctl_info *mci, unsigned long value)
311 334 {
312   - mutex_lock(&mem_ctls_mutex);
313   -
314 335 /* cancel the current workq request */
315 336 edac_mc_workq_teardown(mci);
316 337  
  338 + /* lock the list of devices for the new setup */
  339 + mutex_lock(&mem_ctls_mutex);
  340 +
317 341 /* restart the workq request, with new delay value */
318 342 edac_mc_workq_setup(mci, value);
319 343  
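
cancel_delayed_work() only removes a still-pending timer; it returns 0 when
the work was not pending (typically because the handler is already running),
which is why the flush_workqueue() fallback above is needed to wait the
handler out.  On kernels that provide it, cancel_delayed_work_sync()
collapses the pair into a single call; a hedged sketch:

        static void workq_teardown(struct my_mci *mci)
        {
                /* cancels a pending timer and waits for a running
                 * handler to finish before returning */
                if (mci->op_state == OP_RUNNING_POLL)
                        cancel_delayed_work_sync(&mci->work);
        }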
... ... @@ -323,6 +347,10 @@
323 347 /* Return 0 on success, 1 on failure.
324 348 * Before calling this function, caller must
325 349 * assign a unique value to mci->mc_idx.
  350 + *
  351 + * locking model:
  352 + *
  353 + * called with the mem_ctls_mutex lock held
326 354 */
327 355 static int add_mc_to_global_list(struct mem_ctl_info *mci)
328 356 {
... ... @@ -331,7 +359,8 @@
331 359  
332 360 insert_before = &mc_devices;
333 361  
334   - if (unlikely((p = find_mci_by_dev(mci->dev)) != NULL))
  362 + p = find_mci_by_dev(mci->dev);
  363 + if (unlikely(p != NULL))
335 364 goto fail0;
336 365  
337 366 list_for_each(item, &mc_devices) {
... ... @@ -467,8 +496,8 @@
467 496 }
468 497  
469 498 /* Report action taken */
470   - edac_mc_printk(mci, KERN_INFO, "Giving out device to %s %s: DEV %s\n",
471   - mci->mod_name, mci->ctl_name, dev_name(mci));
  499 + edac_mc_printk(mci, KERN_INFO, "Giving out device to '%s' '%s':"
  500 + " DEV %s\n", mci->mod_name, mci->ctl_name, dev_name(mci));
472 501  
473 502 mutex_unlock(&mem_ctls_mutex);
474 503 return 0;
475 504  
... ... @@ -493,10 +522,13 @@
493 522 {
494 523 struct mem_ctl_info *mci;
495 524  
496   - debugf0("MC: %s()\n", __func__);
  525 + debugf0("%s()\n", __func__);
  526 +
497 527 mutex_lock(&mem_ctls_mutex);
498 528  
499   - if ((mci = find_mci_by_dev(dev)) == NULL) {
  529 + /* find the requested mci struct in the global list */
  530 + mci = find_mci_by_dev(dev);
  531 + if (mci == NULL) {
500 532 mutex_unlock(&mem_ctls_mutex);
501 533 return NULL;
502 534 }
503 535  
504 536  
... ... @@ -504,15 +536,17 @@
504 536 /* marking MCI offline */
505 537 mci->op_state = OP_OFFLINE;
506 538  
507   - /* flush workq processes */
508   - edac_mc_workq_teardown(mci);
509   -
510   - edac_remove_sysfs_mci_device(mci);
511 539 del_mc_from_global_list(mci);
512 540 mutex_unlock(&mem_ctls_mutex);
  541 +
  542 + /* flush workq processes and remove sysfs */
  543 + edac_mc_workq_teardown(mci);
  544 + edac_remove_sysfs_mci_device(mci);
  545 +
513 546 edac_printk(KERN_INFO, EDAC_MC,
514 547 "Removed device %d for %s %s: DEV %s\n", mci->mc_idx,
515 548 mci->mod_name, mci->ctl_name, dev_name(mci));
  549 +
516 550 return mci;
517 551 }
518 552 EXPORT_SYMBOL_GPL(edac_mc_del_mc);
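
Taken together, the reworked removal path follows a common teardown
ordering: mark the instance dead and unlink it under the lock, then do the
blocking cleanup with the lock dropped.  A condensed sketch with
hypothetical names:

        struct my_mci *del_mc(struct device *dev)
        {
                struct my_mci *mci;

                mutex_lock(&ctls_mutex);
                mci = find_by_dev(dev);
                if (mci == NULL) {
                        mutex_unlock(&ctls_mutex);
                        return NULL;
                }
                mci->op_state = OP_OFFLINE;  /* workers will now bail out */
                list_del(&mci->link);        /* no new lookups can find us */
                mutex_unlock(&ctls_mutex);

                /* blocking cleanup happens outside the lock */
                workq_teardown(mci);         /* may flush a running worker */
                remove_sysfs(mci);
                return mci;
        }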