Commit b45620229dd67ff1daffa8adce57f37b37860f78
Committed by
James Bottomley
1 parent
6b1e5a45d4
Exists in
master
and in
16 other branches
[SCSI] Add 'eh_deadline' to limit SCSI EH runtime
This patch adds an 'eh_deadline' sysfs attribute to the scsi host which limits the overall runtime of the SCSI EH. The 'eh_deadline' value is stored in the now obsolete field 'resetting'. When a command is failed the start time of the EH is stored in 'last_reset'. If the overall runtime of the SCSI EH is longer than last_reset + eh_deadline, the EH is short-circuited and falls through to issue a host reset only. [jejb: add comments in Scsi_Host about new fields] Signed-off-by: Hannes Reinecke <hare@suse.de> Signed-off-by: James Bottomley <JBottomley@Parallels.com>
Showing 4 changed files with 173 additions and 6 deletions Side-by-side Diff
drivers/scsi/hosts.c
... | ... | @@ -316,6 +316,12 @@ |
316 | 316 | kfree(shost); |
317 | 317 | } |
318 | 318 | |
319 | +static unsigned int shost_eh_deadline; | |
320 | + | |
321 | +module_param_named(eh_deadline, shost_eh_deadline, uint, S_IRUGO|S_IWUSR); | |
322 | +MODULE_PARM_DESC(eh_deadline, | |
323 | + "SCSI EH timeout in seconds (should be between 1 and 2^32-1)"); | |
324 | + | |
319 | 325 | static struct device_type scsi_host_type = { |
320 | 326 | .name = "scsi_host", |
321 | 327 | .release = scsi_host_dev_release, |
... | ... | @@ -388,6 +394,7 @@ |
388 | 394 | shost->unchecked_isa_dma = sht->unchecked_isa_dma; |
389 | 395 | shost->use_clustering = sht->use_clustering; |
390 | 396 | shost->ordered_tag = sht->ordered_tag; |
397 | + shost->eh_deadline = shost_eh_deadline * HZ; | |
391 | 398 | |
392 | 399 | if (sht->supported_mode == MODE_UNKNOWN) |
393 | 400 | /* means we didn't set it ... default to INITIATOR */ |
drivers/scsi/scsi_error.c
... | ... | @@ -87,6 +87,18 @@ |
87 | 87 | } |
88 | 88 | EXPORT_SYMBOL_GPL(scsi_schedule_eh); |
89 | 89 | |
90 | +static int scsi_host_eh_past_deadline(struct Scsi_Host *shost) | |
91 | +{ | |
92 | + if (!shost->last_reset || !shost->eh_deadline) | |
93 | + return 0; | |
94 | + | |
95 | + if (time_before(jiffies, | |
96 | + shost->last_reset + shost->eh_deadline)) | |
97 | + return 0; | |
98 | + | |
99 | + return 1; | |
100 | +} | |
101 | + | |
90 | 102 | /** |
91 | 103 | * scsi_eh_scmd_add - add scsi cmd to error handling. |
92 | 104 | * @scmd: scmd to run eh on. |
... | ... | @@ -109,6 +121,9 @@ |
109 | 121 | if (scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY)) |
110 | 122 | goto out_unlock; |
111 | 123 | |
124 | + if (shost->eh_deadline && !shost->last_reset) | |
125 | + shost->last_reset = jiffies; | |
126 | + | |
112 | 127 | ret = 1; |
113 | 128 | scmd->eh_eflags |= eh_flag; |
114 | 129 | list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q); |
... | ... | @@ -138,6 +153,9 @@ |
138 | 153 | trace_scsi_dispatch_cmd_timeout(scmd); |
139 | 154 | scsi_log_completion(scmd, TIMEOUT_ERROR); |
140 | 155 | |
156 | + if (host->eh_deadline && !host->last_reset) | |
157 | + host->last_reset = jiffies; | |
158 | + | |
141 | 159 | if (host->transportt->eh_timed_out) |
142 | 160 | rtn = host->transportt->eh_timed_out(scmd); |
143 | 161 | else if (host->hostt->eh_timed_out) |
144 | 162 | |
145 | 163 | |
... | ... | @@ -990,13 +1008,26 @@ |
990 | 1008 | struct list_head *done_q) |
991 | 1009 | { |
992 | 1010 | struct scsi_cmnd *scmd, *next; |
1011 | + struct Scsi_Host *shost; | |
993 | 1012 | int rtn; |
1013 | + unsigned long flags; | |
994 | 1014 | |
995 | 1015 | list_for_each_entry_safe(scmd, next, work_q, eh_entry) { |
996 | 1016 | if ((scmd->eh_eflags & SCSI_EH_CANCEL_CMD) || |
997 | 1017 | SCSI_SENSE_VALID(scmd)) |
998 | 1018 | continue; |
999 | 1019 | |
1020 | + shost = scmd->device->host; | |
1021 | + spin_lock_irqsave(shost->host_lock, flags); | |
1022 | + if (scsi_host_eh_past_deadline(shost)) { | |
1023 | + spin_unlock_irqrestore(shost->host_lock, flags); | |
1024 | + SCSI_LOG_ERROR_RECOVERY(3, | |
1025 | + shost_printk(KERN_INFO, shost, | |
1026 | + "skip %s, past eh deadline\n", | |
1027 | + __func__)); | |
1028 | + break; | |
1029 | + } | |
1030 | + spin_unlock_irqrestore(shost->host_lock, flags); | |
1000 | 1031 | SCSI_LOG_ERROR_RECOVERY(2, scmd_printk(KERN_INFO, scmd, |
1001 | 1032 | "%s: requesting sense\n", |
1002 | 1033 | current->comm)); |
1003 | 1034 | |
... | ... | @@ -1082,11 +1113,28 @@ |
1082 | 1113 | struct scsi_cmnd *scmd, *next; |
1083 | 1114 | struct scsi_device *sdev; |
1084 | 1115 | int finish_cmds; |
1116 | + unsigned long flags; | |
1085 | 1117 | |
1086 | 1118 | while (!list_empty(cmd_list)) { |
1087 | 1119 | scmd = list_entry(cmd_list->next, struct scsi_cmnd, eh_entry); |
1088 | 1120 | sdev = scmd->device; |
1089 | 1121 | |
1122 | + if (!try_stu) { | |
1123 | + spin_lock_irqsave(sdev->host->host_lock, flags); | |
1124 | + if (scsi_host_eh_past_deadline(sdev->host)) { | |
1125 | + /* Push items back onto work_q */ | |
1126 | + list_splice_init(cmd_list, work_q); | |
1127 | + spin_unlock_irqrestore(sdev->host->host_lock, | |
1128 | + flags); | |
1129 | + SCSI_LOG_ERROR_RECOVERY(3, | |
1130 | + shost_printk(KERN_INFO, sdev->host, | |
1131 | + "skip %s, past eh deadline", | |
1132 | + __func__)); | |
1133 | + break; | |
1134 | + } | |
1135 | + spin_unlock_irqrestore(sdev->host->host_lock, flags); | |
1136 | + } | |
1137 | + | |
1090 | 1138 | finish_cmds = !scsi_device_online(scmd->device) || |
1091 | 1139 | (try_stu && !scsi_eh_try_stu(scmd) && |
1092 | 1140 | !scsi_eh_tur(scmd)) || |
1093 | 1141 | |
1094 | 1142 | |
... | ... | @@ -1122,14 +1170,28 @@ |
1122 | 1170 | struct scsi_cmnd *scmd, *next; |
1123 | 1171 | LIST_HEAD(check_list); |
1124 | 1172 | int rtn; |
1173 | + struct Scsi_Host *shost; | |
1174 | + unsigned long flags; | |
1125 | 1175 | |
1126 | 1176 | list_for_each_entry_safe(scmd, next, work_q, eh_entry) { |
1127 | 1177 | if (!(scmd->eh_eflags & SCSI_EH_CANCEL_CMD)) |
1128 | 1178 | continue; |
1179 | + shost = scmd->device->host; | |
1180 | + spin_lock_irqsave(shost->host_lock, flags); | |
1181 | + if (scsi_host_eh_past_deadline(shost)) { | |
1182 | + spin_unlock_irqrestore(shost->host_lock, flags); | |
1183 | + list_splice_init(&check_list, work_q); | |
1184 | + SCSI_LOG_ERROR_RECOVERY(3, | |
1185 | + shost_printk(KERN_INFO, shost, | |
1186 | + "skip %s, past eh deadline\n", | |
1187 | + __func__)); | |
1188 | + return list_empty(work_q); | |
1189 | + } | |
1190 | + spin_unlock_irqrestore(shost->host_lock, flags); | |
1129 | 1191 | SCSI_LOG_ERROR_RECOVERY(3, printk("%s: aborting cmd:" |
1130 | 1192 | "0x%p\n", current->comm, |
1131 | 1193 | scmd)); |
1132 | - rtn = scsi_try_to_abort_cmd(scmd->device->host->hostt, scmd); | |
1194 | + rtn = scsi_try_to_abort_cmd(shost->hostt, scmd); | |
1133 | 1195 | if (rtn == SUCCESS || rtn == FAST_IO_FAIL) { |
1134 | 1196 | scmd->eh_eflags &= ~SCSI_EH_CANCEL_CMD; |
1135 | 1197 | if (rtn == FAST_IO_FAIL) |
1136 | 1198 | |
... | ... | @@ -1187,8 +1249,19 @@ |
1187 | 1249 | { |
1188 | 1250 | struct scsi_cmnd *scmd, *stu_scmd, *next; |
1189 | 1251 | struct scsi_device *sdev; |
1252 | + unsigned long flags; | |
1190 | 1253 | |
1191 | 1254 | shost_for_each_device(sdev, shost) { |
1255 | + spin_lock_irqsave(shost->host_lock, flags); | |
1256 | + if (scsi_host_eh_past_deadline(shost)) { | |
1257 | + spin_unlock_irqrestore(shost->host_lock, flags); | |
1258 | + SCSI_LOG_ERROR_RECOVERY(3, | |
1259 | + shost_printk(KERN_INFO, shost, | |
1260 | + "skip %s, past eh deadline\n", | |
1261 | + __func__)); | |
1262 | + break; | |
1263 | + } | |
1264 | + spin_unlock_irqrestore(shost->host_lock, flags); | |
1192 | 1265 | stu_scmd = NULL; |
1193 | 1266 | list_for_each_entry(scmd, work_q, eh_entry) |
1194 | 1267 | if (scmd->device == sdev && SCSI_SENSE_VALID(scmd) && |
1195 | 1268 | |
... | ... | @@ -1241,9 +1314,20 @@ |
1241 | 1314 | { |
1242 | 1315 | struct scsi_cmnd *scmd, *bdr_scmd, *next; |
1243 | 1316 | struct scsi_device *sdev; |
1317 | + unsigned long flags; | |
1244 | 1318 | int rtn; |
1245 | 1319 | |
1246 | 1320 | shost_for_each_device(sdev, shost) { |
1321 | + spin_lock_irqsave(shost->host_lock, flags); | |
1322 | + if (scsi_host_eh_past_deadline(shost)) { | |
1323 | + spin_unlock_irqrestore(shost->host_lock, flags); | |
1324 | + SCSI_LOG_ERROR_RECOVERY(3, | |
1325 | + shost_printk(KERN_INFO, shost, | |
1326 | + "skip %s, past eh deadline\n", | |
1327 | + __func__)); | |
1328 | + break; | |
1329 | + } | |
1330 | + spin_unlock_irqrestore(shost->host_lock, flags); | |
1247 | 1331 | bdr_scmd = NULL; |
1248 | 1332 | list_for_each_entry(scmd, work_q, eh_entry) |
1249 | 1333 | if (scmd->device == sdev) { |
1250 | 1334 | |
... | ... | @@ -1303,7 +1387,22 @@ |
1303 | 1387 | struct scsi_cmnd *next, *scmd; |
1304 | 1388 | int rtn; |
1305 | 1389 | unsigned int id; |
1390 | + unsigned long flags; | |
1306 | 1391 | |
1392 | + spin_lock_irqsave(shost->host_lock, flags); | |
1393 | + if (scsi_host_eh_past_deadline(shost)) { | |
1394 | + spin_unlock_irqrestore(shost->host_lock, flags); | |
1395 | + /* push back on work queue for further processing */ | |
1396 | + list_splice_init(&check_list, work_q); | |
1397 | + list_splice_init(&tmp_list, work_q); | |
1398 | + SCSI_LOG_ERROR_RECOVERY(3, | |
1399 | + shost_printk(KERN_INFO, shost, | |
1400 | + "skip %s, past eh deadline\n", | |
1401 | + __func__)); | |
1402 | + return list_empty(work_q); | |
1403 | + } | |
1404 | + spin_unlock_irqrestore(shost->host_lock, flags); | |
1405 | + | |
1307 | 1406 | scmd = list_entry(tmp_list.next, struct scsi_cmnd, eh_entry); |
1308 | 1407 | id = scmd_id(scmd); |
1309 | 1408 | |
... | ... | @@ -1347,6 +1446,7 @@ |
1347 | 1446 | LIST_HEAD(check_list); |
1348 | 1447 | unsigned int channel; |
1349 | 1448 | int rtn; |
1449 | + unsigned long flags; | |
1350 | 1450 | |
1351 | 1451 | /* |
1352 | 1452 | * we really want to loop over the various channels, and do this on |
... | ... | @@ -1356,6 +1456,18 @@ |
1356 | 1456 | */ |
1357 | 1457 | |
1358 | 1458 | for (channel = 0; channel <= shost->max_channel; channel++) { |
1459 | + spin_lock_irqsave(shost->host_lock, flags); | |
1460 | + if (scsi_host_eh_past_deadline(shost)) { | |
1461 | + spin_unlock_irqrestore(shost->host_lock, flags); | |
1462 | + list_splice_init(&check_list, work_q); | |
1463 | + SCSI_LOG_ERROR_RECOVERY(3, | |
1464 | + shost_printk(KERN_INFO, shost, | |
1465 | + "skip %s, past eh deadline\n", | |
1466 | + __func__)); | |
1467 | + return list_empty(work_q); | |
1468 | + } | |
1469 | + spin_unlock_irqrestore(shost->host_lock, flags); | |
1470 | + | |
1359 | 1471 | chan_scmd = NULL; |
1360 | 1472 | list_for_each_entry(scmd, work_q, eh_entry) { |
1361 | 1473 | if (channel == scmd_channel(scmd)) { |
... | ... | @@ -1755,8 +1867,9 @@ |
1755 | 1867 | * will be requests for character device operations, and also for |
1756 | 1868 | * ioctls to queued block devices. |
1757 | 1869 | */ |
1758 | - SCSI_LOG_ERROR_RECOVERY(3, printk("%s: waking up host to restart\n", | |
1759 | - __func__)); | |
1870 | + SCSI_LOG_ERROR_RECOVERY(3, | |
1871 | + printk("scsi_eh_%d waking up host to restart\n", | |
1872 | + shost->host_no)); | |
1760 | 1873 | |
1761 | 1874 | spin_lock_irqsave(shost->host_lock, flags); |
1762 | 1875 | if (scsi_host_set_state(shost, SHOST_RUNNING)) |
... | ... | @@ -1883,6 +1996,10 @@ |
1883 | 1996 | if (!scsi_eh_abort_cmds(&eh_work_q, &eh_done_q)) |
1884 | 1997 | scsi_eh_ready_devs(shost, &eh_work_q, &eh_done_q); |
1885 | 1998 | |
1999 | + spin_lock_irqsave(shost->host_lock, flags); | |
2000 | + if (shost->eh_deadline) | |
2001 | + shost->last_reset = 0; | |
2002 | + spin_unlock_irqrestore(shost->host_lock, flags); | |
1886 | 2003 | scsi_eh_flush_done_q(&eh_done_q); |
1887 | 2004 | } |
1888 | 2005 | |
... | ... | @@ -1909,7 +2026,7 @@ |
1909 | 2026 | if ((shost->host_failed == 0 && shost->host_eh_scheduled == 0) || |
1910 | 2027 | shost->host_failed != shost->host_busy) { |
1911 | 2028 | SCSI_LOG_ERROR_RECOVERY(1, |
1912 | - printk("Error handler scsi_eh_%d sleeping\n", | |
2029 | + printk("scsi_eh_%d: sleeping\n", | |
1913 | 2030 | shost->host_no)); |
1914 | 2031 | schedule(); |
1915 | 2032 | continue; |
... | ... | @@ -1917,8 +2034,9 @@ |
1917 | 2034 | |
1918 | 2035 | __set_current_state(TASK_RUNNING); |
1919 | 2036 | SCSI_LOG_ERROR_RECOVERY(1, |
1920 | - printk("Error handler scsi_eh_%d waking up\n", | |
1921 | - shost->host_no)); | |
2037 | + printk("scsi_eh_%d: waking up %d/%d/%d\n", | |
2038 | + shost->host_no, shost->host_eh_scheduled, | |
2039 | + shost->host_failed, shost->host_busy)); | |
1922 | 2040 | |
1923 | 2041 | /* |
1924 | 2042 | * We have a host that is failing for some reason. Figure out |
drivers/scsi/scsi_sysfs.c
... | ... | @@ -281,6 +281,42 @@ |
281 | 281 | |
282 | 282 | static DEVICE_ATTR(host_reset, S_IWUSR, NULL, store_host_reset); |
283 | 283 | |
284 | +static ssize_t | |
285 | +show_shost_eh_deadline(struct device *dev, | |
286 | + struct device_attribute *attr, char *buf) | |
287 | +{ | |
288 | + struct Scsi_Host *shost = class_to_shost(dev); | |
289 | + | |
290 | + return sprintf(buf, "%d\n", shost->eh_deadline / HZ); | |
291 | +} | |
292 | + | |
293 | +static ssize_t | |
294 | +store_shost_eh_deadline(struct device *dev, struct device_attribute *attr, | |
295 | + const char *buf, size_t count) | |
296 | +{ | |
297 | + struct Scsi_Host *shost = class_to_shost(dev); | |
298 | + int ret = -EINVAL; | |
299 | + int deadline; | |
300 | + unsigned long flags; | |
301 | + | |
302 | + if (shost->transportt && shost->transportt->eh_strategy_handler) | |
303 | + return ret; | |
304 | + | |
305 | + if (sscanf(buf, "%d\n", &deadline) == 1) { | |
306 | + spin_lock_irqsave(shost->host_lock, flags); | |
307 | + if (scsi_host_in_recovery(shost)) | |
308 | + ret = -EBUSY; | |
309 | + else { | |
310 | + shost->eh_deadline = deadline * HZ; | |
311 | + ret = count; | |
312 | + } | |
313 | + spin_unlock_irqrestore(shost->host_lock, flags); | |
314 | + } | |
315 | + return ret; | |
316 | +} | |
317 | + | |
318 | +static DEVICE_ATTR(eh_deadline, S_IRUGO | S_IWUSR, show_shost_eh_deadline, store_shost_eh_deadline); | |
319 | + | |
284 | 320 | shost_rd_attr(unique_id, "%u\n"); |
285 | 321 | shost_rd_attr(host_busy, "%hu\n"); |
286 | 322 | shost_rd_attr(cmd_per_lun, "%hd\n"); |
... | ... | @@ -308,6 +344,7 @@ |
308 | 344 | &dev_attr_prot_capabilities.attr, |
309 | 345 | &dev_attr_prot_guard_type.attr, |
310 | 346 | &dev_attr_host_reset.attr, |
347 | + &dev_attr_eh_deadline.attr, | |
311 | 348 | NULL |
312 | 349 | }; |
313 | 350 |
include/scsi/scsi_host.h
... | ... | @@ -599,6 +599,11 @@ |
599 | 599 | |
600 | 600 | unsigned int host_no; /* Used for IOCTL_GET_IDLUN, /proc/scsi et al. */ |
601 | 601 | |
602 | + /* next two fields are used to bound the time spent in error handling */ | |
603 | + int eh_deadline; | |
604 | + unsigned long last_reset; | |
605 | + | |
606 | + | |
602 | 607 | /* |
603 | 608 | * These three parameters can be used to allow for wide scsi, |
604 | 609 | * and for host adapters that support multiple busses |