Commit 0e6a805aac5bb916350190b0e853e620c315edd5

Authored by Dan Williams
Committed by George Cherian
1 parent af235643fe

sd: limit the scope of the async probe domain

sd injects and synchronizes probe work on the global kernel-wide domain.
This runs into conflict with PM that wants to perform resume actions in
async context:

[  494.237079] INFO: task kworker/u:3:554 blocked for more than 120 seconds.
[  494.294396] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  494.360809] kworker/u:3     D 0000000000000000     0   554      2 0x00000000
[  494.420739]  ffff88012e4d3af0 0000000000000046 ffff88013200c160 ffff88012e4d3fd8
[  494.484392]  ffff88012e4d3fd8 0000000000012500 ffff8801394ea0b0 ffff88013200c160
[  494.548038]  ffff88012e4d3ae0 00000000000001e3 ffffffff81a249e0 ffff8801321c5398
[  494.611685] Call Trace:
[  494.632649]  [<ffffffff8149dd25>] schedule+0x5a/0x5c
[  494.674687]  [<ffffffff8104b968>] async_synchronize_cookie_domain+0xb6/0x112
[  494.734177]  [<ffffffff810461ff>] ? __init_waitqueue_head+0x50/0x50
[  494.787134]  [<ffffffff8131a224>] ? scsi_remove_target+0x48/0x48
[  494.837900]  [<ffffffff8104b9d9>] async_synchronize_cookie+0x15/0x17
[  494.891567]  [<ffffffff8104ba49>] async_synchronize_full+0x54/0x70  <-- here we wait for async contexts to complete
[  494.943783]  [<ffffffff8104b9f5>] ? async_synchronize_full_domain+0x1a/0x1a
[  495.002547]  [<ffffffffa00114b1>] sd_remove+0x2c/0xa2 [sd_mod]
[  495.051861]  [<ffffffff812fe94f>] __device_release_driver+0x86/0xcf
[  495.104807]  [<ffffffff812fe9bd>] device_release_driver+0x25/0x32  <-- here we take device_lock()

[  853.511341] INFO: task kworker/u:4:549 blocked for more than 120 seconds.
[  853.568693] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  853.635119] kworker/u:4     D ffff88013097b5d0     0   549      2 0x00000000
[  853.695129]  ffff880132773c40 0000000000000046 ffff880130790000 ffff880132773fd8
[  853.758990]  ffff880132773fd8 0000000000012500 ffff88013288a0b0 ffff880130790000
[  853.822796]  0000000000000246 0000000000000040 ffff88013097b5c8 ffff880130790000
[  853.886633] Call Trace:
[  853.907631]  [<ffffffff8149dd25>] schedule+0x5a/0x5c
[  853.949670]  [<ffffffff8149cc44>] __mutex_lock_common+0x220/0x351
[  854.001225]  [<ffffffff81304bd7>] ? device_resume+0x58/0x1c4
[  854.049082]  [<ffffffff81304bd7>] ? device_resume+0x58/0x1c4
[  854.097011]  [<ffffffff8149ce48>] mutex_lock_nested+0x2f/0x36   <-- here we wait for device_lock()
[  854.145591]  [<ffffffff81304bd7>] device_resume+0x58/0x1c4
[  854.192066]  [<ffffffff81304d61>] async_resume+0x1e/0x45
[  854.237019]  [<ffffffff8104bc93>] async_run_entry_fn+0xc6/0x173  <-- ...while running in async context

Provide a 'scsi_sd_probe_domain' so that async probe actions can
be flushed without regard for the state of PM, and allow for the resume
path to handle devices that have transitioned from SDEV_QUIESCE to
SDEV_DEL prior to resume.

Acked-by: Alan Stern <stern@rowland.harvard.edu>
[alan: uplevel scsi_sd_probe_domain, clarify scsi_device_resume]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
[jejb: remove unneeded config guards in include file]
Signed-off-by: James Bottomley <JBottomley@Parallels.com>

Showing 5 changed files with 19 additions and 6 deletions Side-by-side Diff

drivers/scsi/scsi.c
... ... @@ -90,6 +90,12 @@
90 90 EXPORT_SYMBOL(scsi_logging_level);
91 91 #endif
92 92  
  93 +#if IS_ENABLED(CONFIG_PM) || IS_ENABLED(CONFIG_BLK_DEV_SD)
  94 +/* sd and scsi_pm need to coordinate flushing async actions */
  95 +LIST_HEAD(scsi_sd_probe_domain);
  96 +EXPORT_SYMBOL(scsi_sd_probe_domain);
  97 +#endif
  98 +
93 99 /* NB: These are exposed through /proc/scsi/scsi and form part of the ABI.
94 100 * You may not alter any existing entry (although adding new ones is
95 101 * encouraged once assigned by ANSI/INCITS T10
drivers/scsi/scsi_lib.c
... ... @@ -2352,10 +2352,14 @@
2352 2352 *
2353 2353 * Must be called with user context, may sleep.
2354 2354 */
2355   -void
2356   -scsi_device_resume(struct scsi_device *sdev)
  2355 +void scsi_device_resume(struct scsi_device *sdev)
2357 2356 {
2358   - if(scsi_device_set_state(sdev, SDEV_RUNNING))
  2357 + /* check if the device state was mutated prior to resume, and if
  2358 + * so assume the state is being managed elsewhere (for example
  2359 + * device deleted during suspend)
  2360 + */
  2361 + if (sdev->sdev_state != SDEV_QUIESCE ||
  2362 + scsi_device_set_state(sdev, SDEV_RUNNING))
2359 2363 return;
2360 2364 scsi_run_queue(sdev->request_queue);
2361 2365 }
drivers/scsi/scsi_pm.c
... ... @@ -74,7 +74,7 @@
74 74 {
75 75 if (scsi_is_sdev_device(dev)) {
76 76 /* sd probing uses async_schedule. Wait until it finishes. */
77   - async_synchronize_full();
  77 + async_synchronize_full_domain(&scsi_sd_probe_domain);
78 78  
79 79 } else if (scsi_is_host_device(dev)) {
80 80 /* Wait until async scanning is finished */
drivers/scsi/scsi_priv.h
... ... @@ -164,6 +164,8 @@
164 164 static inline void scsi_autopm_put_host(struct Scsi_Host *h) {}
165 165 #endif /* CONFIG_PM_RUNTIME */
166 166  
  167 +extern struct list_head scsi_sd_probe_domain;
  168 +
167 169 /*
168 170 * internal scsi timeout functions: for use by mid-layer and transport
169 171 * classes.
drivers/scsi/sd.c
... ... @@ -64,6 +64,7 @@
64 64 #include <scsi/scsicam.h>
65 65  
66 66 #include "sd.h"
  67 +#include "scsi_priv.h"
67 68 #include "scsi_logging.h"
68 69  
69 70 MODULE_AUTHOR("Eric Youngdale");
... ... @@ -2627,7 +2628,7 @@
2627 2628 dev_set_drvdata(dev, sdkp);
2628 2629  
2629 2630 get_device(&sdkp->dev); /* prevent release before async_schedule */
2630   - async_schedule(sd_probe_async, sdkp);
  2631 + async_schedule_domain(sd_probe_async, sdkp, &scsi_sd_probe_domain);
2631 2632  
2632 2633 return 0;
2633 2634  
... ... @@ -2661,7 +2662,7 @@
2661 2662 sdkp = dev_get_drvdata(dev);
2662 2663 scsi_autopm_get_device(sdkp->device);
2663 2664  
2664   - async_synchronize_full();
  2665 + async_synchronize_full_domain(&scsi_sd_probe_domain);
2665 2666 blk_queue_prep_rq(sdkp->device->request_queue, scsi_prep_fn);
2666 2667 blk_queue_unprep_rq(sdkp->device->request_queue, NULL);
2667 2668 device_del(&sdkp->dev);