Commit b45620229dd67ff1daffa8adce57f37b37860f78

Authored by Hannes Reinecke
Committed by James Bottomley
1 parent 6b1e5a45d4

[SCSI] Add 'eh_deadline' to limit SCSI EH runtime

This patch adds an 'eh_deadline' sysfs attribute to the scsi
host which limits the overall runtime of the SCSI EH.
The 'eh_deadline' value is stored in the now obsolete field
'resetting'.
When a command fails, the start time of the EH is stored
in 'last_reset'. If the overall runtime of the SCSI EH is longer
than last_reset + eh_deadline, the EH is short-circuited and
falls through to issue a host reset only.

[jejb: add comments in Scsi_Host about new fields]
Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>

Showing 4 changed files with 173 additions and 6 deletions Side-by-side Diff

drivers/scsi/hosts.c
... ... @@ -316,6 +316,12 @@
316 316 kfree(shost);
317 317 }
318 318  
  319 +static unsigned int shost_eh_deadline;
  320 +
  321 +module_param_named(eh_deadline, shost_eh_deadline, uint, S_IRUGO|S_IWUSR);
  322 +MODULE_PARM_DESC(eh_deadline,
  323 + "SCSI EH timeout in seconds (should be between 1 and 2^32-1)");
  324 +
319 325 static struct device_type scsi_host_type = {
320 326 .name = "scsi_host",
321 327 .release = scsi_host_dev_release,
... ... @@ -388,6 +394,7 @@
388 394 shost->unchecked_isa_dma = sht->unchecked_isa_dma;
389 395 shost->use_clustering = sht->use_clustering;
390 396 shost->ordered_tag = sht->ordered_tag;
  397 + shost->eh_deadline = shost_eh_deadline * HZ;
391 398  
392 399 if (sht->supported_mode == MODE_UNKNOWN)
393 400 /* means we didn't set it ... default to INITIATOR */
drivers/scsi/scsi_error.c
... ... @@ -87,6 +87,18 @@
87 87 }
88 88 EXPORT_SYMBOL_GPL(scsi_schedule_eh);
89 89  
  90 +static int scsi_host_eh_past_deadline(struct Scsi_Host *shost)
  91 +{
  92 + if (!shost->last_reset || !shost->eh_deadline)
  93 + return 0;
  94 +
  95 + if (time_before(jiffies,
  96 + shost->last_reset + shost->eh_deadline))
  97 + return 0;
  98 +
  99 + return 1;
  100 +}
  101 +
90 102 /**
91 103 * scsi_eh_scmd_add - add scsi cmd to error handling.
92 104 * @scmd: scmd to run eh on.
... ... @@ -109,6 +121,9 @@
109 121 if (scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY))
110 122 goto out_unlock;
111 123  
  124 + if (shost->eh_deadline && !shost->last_reset)
  125 + shost->last_reset = jiffies;
  126 +
112 127 ret = 1;
113 128 scmd->eh_eflags |= eh_flag;
114 129 list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q);
... ... @@ -138,6 +153,9 @@
138 153 trace_scsi_dispatch_cmd_timeout(scmd);
139 154 scsi_log_completion(scmd, TIMEOUT_ERROR);
140 155  
  156 + if (host->eh_deadline && !host->last_reset)
  157 + host->last_reset = jiffies;
  158 +
141 159 if (host->transportt->eh_timed_out)
142 160 rtn = host->transportt->eh_timed_out(scmd);
143 161 else if (host->hostt->eh_timed_out)
144 162  
145 163  
... ... @@ -990,13 +1008,26 @@
990 1008 struct list_head *done_q)
991 1009 {
992 1010 struct scsi_cmnd *scmd, *next;
  1011 + struct Scsi_Host *shost;
993 1012 int rtn;
  1013 + unsigned long flags;
994 1014  
995 1015 list_for_each_entry_safe(scmd, next, work_q, eh_entry) {
996 1016 if ((scmd->eh_eflags & SCSI_EH_CANCEL_CMD) ||
997 1017 SCSI_SENSE_VALID(scmd))
998 1018 continue;
999 1019  
  1020 + shost = scmd->device->host;
  1021 + spin_lock_irqsave(shost->host_lock, flags);
  1022 + if (scsi_host_eh_past_deadline(shost)) {
  1023 + spin_unlock_irqrestore(shost->host_lock, flags);
  1024 + SCSI_LOG_ERROR_RECOVERY(3,
  1025 + shost_printk(KERN_INFO, shost,
  1026 + "skip %s, past eh deadline\n",
  1027 + __func__));
  1028 + break;
  1029 + }
  1030 + spin_unlock_irqrestore(shost->host_lock, flags);
1000 1031 SCSI_LOG_ERROR_RECOVERY(2, scmd_printk(KERN_INFO, scmd,
1001 1032 "%s: requesting sense\n",
1002 1033 current->comm));
1003 1034  
... ... @@ -1082,11 +1113,28 @@
1082 1113 struct scsi_cmnd *scmd, *next;
1083 1114 struct scsi_device *sdev;
1084 1115 int finish_cmds;
  1116 + unsigned long flags;
1085 1117  
1086 1118 while (!list_empty(cmd_list)) {
1087 1119 scmd = list_entry(cmd_list->next, struct scsi_cmnd, eh_entry);
1088 1120 sdev = scmd->device;
1089 1121  
  1122 + if (!try_stu) {
  1123 + spin_lock_irqsave(sdev->host->host_lock, flags);
  1124 + if (scsi_host_eh_past_deadline(sdev->host)) {
  1125 + /* Push items back onto work_q */
  1126 + list_splice_init(cmd_list, work_q);
  1127 + spin_unlock_irqrestore(sdev->host->host_lock,
  1128 + flags);
  1129 + SCSI_LOG_ERROR_RECOVERY(3,
  1130 + shost_printk(KERN_INFO, sdev->host,
  1131 + "skip %s, past eh deadline",
  1132 + __func__));
  1133 + break;
  1134 + }
  1135 + spin_unlock_irqrestore(sdev->host->host_lock, flags);
  1136 + }
  1137 +
1090 1138 finish_cmds = !scsi_device_online(scmd->device) ||
1091 1139 (try_stu && !scsi_eh_try_stu(scmd) &&
1092 1140 !scsi_eh_tur(scmd)) ||
1093 1141  
1094 1142  
... ... @@ -1122,14 +1170,28 @@
1122 1170 struct scsi_cmnd *scmd, *next;
1123 1171 LIST_HEAD(check_list);
1124 1172 int rtn;
  1173 + struct Scsi_Host *shost;
  1174 + unsigned long flags;
1125 1175  
1126 1176 list_for_each_entry_safe(scmd, next, work_q, eh_entry) {
1127 1177 if (!(scmd->eh_eflags & SCSI_EH_CANCEL_CMD))
1128 1178 continue;
  1179 + shost = scmd->device->host;
  1180 + spin_lock_irqsave(shost->host_lock, flags);
  1181 + if (scsi_host_eh_past_deadline(shost)) {
  1182 + spin_unlock_irqrestore(shost->host_lock, flags);
  1183 + list_splice_init(&check_list, work_q);
  1184 + SCSI_LOG_ERROR_RECOVERY(3,
  1185 + shost_printk(KERN_INFO, shost,
  1186 + "skip %s, past eh deadline\n",
  1187 + __func__));
  1188 + return list_empty(work_q);
  1189 + }
  1190 + spin_unlock_irqrestore(shost->host_lock, flags);
1129 1191 SCSI_LOG_ERROR_RECOVERY(3, printk("%s: aborting cmd:"
1130 1192 "0x%p\n", current->comm,
1131 1193 scmd));
1132   - rtn = scsi_try_to_abort_cmd(scmd->device->host->hostt, scmd);
  1194 + rtn = scsi_try_to_abort_cmd(shost->hostt, scmd);
1133 1195 if (rtn == SUCCESS || rtn == FAST_IO_FAIL) {
1134 1196 scmd->eh_eflags &= ~SCSI_EH_CANCEL_CMD;
1135 1197 if (rtn == FAST_IO_FAIL)
1136 1198  
... ... @@ -1187,8 +1249,19 @@
1187 1249 {
1188 1250 struct scsi_cmnd *scmd, *stu_scmd, *next;
1189 1251 struct scsi_device *sdev;
  1252 + unsigned long flags;
1190 1253  
1191 1254 shost_for_each_device(sdev, shost) {
  1255 + spin_lock_irqsave(shost->host_lock, flags);
  1256 + if (scsi_host_eh_past_deadline(shost)) {
  1257 + spin_unlock_irqrestore(shost->host_lock, flags);
  1258 + SCSI_LOG_ERROR_RECOVERY(3,
  1259 + shost_printk(KERN_INFO, shost,
  1260 + "skip %s, past eh deadline\n",
  1261 + __func__));
  1262 + break;
  1263 + }
  1264 + spin_unlock_irqrestore(shost->host_lock, flags);
1192 1265 stu_scmd = NULL;
1193 1266 list_for_each_entry(scmd, work_q, eh_entry)
1194 1267 if (scmd->device == sdev && SCSI_SENSE_VALID(scmd) &&
1195 1268  
... ... @@ -1241,9 +1314,20 @@
1241 1314 {
1242 1315 struct scsi_cmnd *scmd, *bdr_scmd, *next;
1243 1316 struct scsi_device *sdev;
  1317 + unsigned long flags;
1244 1318 int rtn;
1245 1319  
1246 1320 shost_for_each_device(sdev, shost) {
  1321 + spin_lock_irqsave(shost->host_lock, flags);
  1322 + if (scsi_host_eh_past_deadline(shost)) {
  1323 + spin_unlock_irqrestore(shost->host_lock, flags);
  1324 + SCSI_LOG_ERROR_RECOVERY(3,
  1325 + shost_printk(KERN_INFO, shost,
  1326 + "skip %s, past eh deadline\n",
  1327 + __func__));
  1328 + break;
  1329 + }
  1330 + spin_unlock_irqrestore(shost->host_lock, flags);
1247 1331 bdr_scmd = NULL;
1248 1332 list_for_each_entry(scmd, work_q, eh_entry)
1249 1333 if (scmd->device == sdev) {
1250 1334  
... ... @@ -1303,7 +1387,22 @@
1303 1387 struct scsi_cmnd *next, *scmd;
1304 1388 int rtn;
1305 1389 unsigned int id;
  1390 + unsigned long flags;
1306 1391  
  1392 + spin_lock_irqsave(shost->host_lock, flags);
  1393 + if (scsi_host_eh_past_deadline(shost)) {
  1394 + spin_unlock_irqrestore(shost->host_lock, flags);
  1395 + /* push back on work queue for further processing */
  1396 + list_splice_init(&check_list, work_q);
  1397 + list_splice_init(&tmp_list, work_q);
  1398 + SCSI_LOG_ERROR_RECOVERY(3,
  1399 + shost_printk(KERN_INFO, shost,
  1400 + "skip %s, past eh deadline\n",
  1401 + __func__));
  1402 + return list_empty(work_q);
  1403 + }
  1404 + spin_unlock_irqrestore(shost->host_lock, flags);
  1405 +
1307 1406 scmd = list_entry(tmp_list.next, struct scsi_cmnd, eh_entry);
1308 1407 id = scmd_id(scmd);
1309 1408  
... ... @@ -1347,6 +1446,7 @@
1347 1446 LIST_HEAD(check_list);
1348 1447 unsigned int channel;
1349 1448 int rtn;
  1449 + unsigned long flags;
1350 1450  
1351 1451 /*
1352 1452 * we really want to loop over the various channels, and do this on
... ... @@ -1356,6 +1456,18 @@
1356 1456 */
1357 1457  
1358 1458 for (channel = 0; channel <= shost->max_channel; channel++) {
  1459 + spin_lock_irqsave(shost->host_lock, flags);
  1460 + if (scsi_host_eh_past_deadline(shost)) {
  1461 + spin_unlock_irqrestore(shost->host_lock, flags);
  1462 + list_splice_init(&check_list, work_q);
  1463 + SCSI_LOG_ERROR_RECOVERY(3,
  1464 + shost_printk(KERN_INFO, shost,
  1465 + "skip %s, past eh deadline\n",
  1466 + __func__));
  1467 + return list_empty(work_q);
  1468 + }
  1469 + spin_unlock_irqrestore(shost->host_lock, flags);
  1470 +
1359 1471 chan_scmd = NULL;
1360 1472 list_for_each_entry(scmd, work_q, eh_entry) {
1361 1473 if (channel == scmd_channel(scmd)) {
... ... @@ -1755,8 +1867,9 @@
1755 1867 * will be requests for character device operations, and also for
1756 1868 * ioctls to queued block devices.
1757 1869 */
1758   - SCSI_LOG_ERROR_RECOVERY(3, printk("%s: waking up host to restart\n",
1759   - __func__));
  1870 + SCSI_LOG_ERROR_RECOVERY(3,
  1871 + printk("scsi_eh_%d waking up host to restart\n",
  1872 + shost->host_no));
1760 1873  
1761 1874 spin_lock_irqsave(shost->host_lock, flags);
1762 1875 if (scsi_host_set_state(shost, SHOST_RUNNING))
... ... @@ -1883,6 +1996,10 @@
1883 1996 if (!scsi_eh_abort_cmds(&eh_work_q, &eh_done_q))
1884 1997 scsi_eh_ready_devs(shost, &eh_work_q, &eh_done_q);
1885 1998  
  1999 + spin_lock_irqsave(shost->host_lock, flags);
  2000 + if (shost->eh_deadline)
  2001 + shost->last_reset = 0;
  2002 + spin_unlock_irqrestore(shost->host_lock, flags);
1886 2003 scsi_eh_flush_done_q(&eh_done_q);
1887 2004 }
1888 2005  
... ... @@ -1909,7 +2026,7 @@
1909 2026 if ((shost->host_failed == 0 && shost->host_eh_scheduled == 0) ||
1910 2027 shost->host_failed != shost->host_busy) {
1911 2028 SCSI_LOG_ERROR_RECOVERY(1,
1912   - printk("Error handler scsi_eh_%d sleeping\n",
  2029 + printk("scsi_eh_%d: sleeping\n",
1913 2030 shost->host_no));
1914 2031 schedule();
1915 2032 continue;
... ... @@ -1917,8 +2034,9 @@
1917 2034  
1918 2035 __set_current_state(TASK_RUNNING);
1919 2036 SCSI_LOG_ERROR_RECOVERY(1,
1920   - printk("Error handler scsi_eh_%d waking up\n",
1921   - shost->host_no));
  2037 + printk("scsi_eh_%d: waking up %d/%d/%d\n",
  2038 + shost->host_no, shost->host_eh_scheduled,
  2039 + shost->host_failed, shost->host_busy));
1922 2040  
1923 2041 /*
1924 2042 * We have a host that is failing for some reason. Figure out
drivers/scsi/scsi_sysfs.c
... ... @@ -281,6 +281,42 @@
281 281  
282 282 static DEVICE_ATTR(host_reset, S_IWUSR, NULL, store_host_reset);
283 283  
  284 +static ssize_t
  285 +show_shost_eh_deadline(struct device *dev,
  286 + struct device_attribute *attr, char *buf)
  287 +{
  288 + struct Scsi_Host *shost = class_to_shost(dev);
  289 +
  290 + return sprintf(buf, "%d\n", shost->eh_deadline / HZ);
  291 +}
  292 +
  293 +static ssize_t
  294 +store_shost_eh_deadline(struct device *dev, struct device_attribute *attr,
  295 + const char *buf, size_t count)
  296 +{
  297 + struct Scsi_Host *shost = class_to_shost(dev);
  298 + int ret = -EINVAL;
  299 + int deadline;
  300 + unsigned long flags;
  301 +
  302 + if (shost->transportt && shost->transportt->eh_strategy_handler)
  303 + return ret;
  304 +
  305 + if (sscanf(buf, "%d\n", &deadline) == 1) {
  306 + spin_lock_irqsave(shost->host_lock, flags);
  307 + if (scsi_host_in_recovery(shost))
  308 + ret = -EBUSY;
  309 + else {
  310 + shost->eh_deadline = deadline * HZ;
  311 + ret = count;
  312 + }
  313 + spin_unlock_irqrestore(shost->host_lock, flags);
  314 + }
  315 + return ret;
  316 +}
  317 +
  318 +static DEVICE_ATTR(eh_deadline, S_IRUGO | S_IWUSR, show_shost_eh_deadline, store_shost_eh_deadline);
  319 +
284 320 shost_rd_attr(unique_id, "%u\n");
285 321 shost_rd_attr(host_busy, "%hu\n");
286 322 shost_rd_attr(cmd_per_lun, "%hd\n");
... ... @@ -308,6 +344,7 @@
308 344 &dev_attr_prot_capabilities.attr,
309 345 &dev_attr_prot_guard_type.attr,
310 346 &dev_attr_host_reset.attr,
  347 + &dev_attr_eh_deadline.attr,
311 348 NULL
312 349 };
313 350  
include/scsi/scsi_host.h
... ... @@ -599,6 +599,11 @@
599 599  
600 600 unsigned int host_no; /* Used for IOCTL_GET_IDLUN, /proc/scsi et al. */
601 601  
  602 + /* next two fields are used to bound the time spent in error handling */
  603 + int eh_deadline;
  604 + unsigned long last_reset;
  605 +
  606 +
602 607 /*
603 608 * These three parameters can be used to allow for wide scsi,
604 609 * and for host adapters that support multiple busses