Commit e494f6a728394ab0df194342549ee20e6f0752df
Committed by
James Bottomley
1 parent
2451079bc2
Exists in
master
and in
16 other branches
[SCSI] improved eh timeout handler
When a command runs into a timeout we need to send an 'ABORT TASK' TMF. This is typically done by the 'eh_abort_handler' LLDD callback. Conceptually, however, this function is a normal SCSI command, so there is no need to enter the error handler. This patch implements a new scsi_abort_command() function which schedules an asynchronous function, scmd_eh_abort_handler(), to abort the command via the usual 'eh_abort_handler'. If the abort succeeds the command is either retried or terminated, depending on the number of allowed retries. However, 'eh_eflags' records the abort, so if the retry fails again the command is pushed onto the error handler without trying to abort it (again); it'll be cleaned up by SCSI EH. [hare: smatch detected stray switch fixed] Signed-off-by: Hannes Reinecke <hare@suse.de> Signed-off-by: James Bottomley <JBottomley@Parallels.com>
Showing 6 changed files with 167 additions and 14 deletions Side-by-side Diff
drivers/scsi/hosts.c
... | ... | @@ -169,6 +169,7 @@ |
169 | 169 | spin_unlock_irqrestore(shost->host_lock, flags); |
170 | 170 | |
171 | 171 | scsi_autopm_get_host(shost); |
172 | + flush_workqueue(shost->tmf_work_q); | |
172 | 173 | scsi_forget_host(shost); |
173 | 174 | mutex_unlock(&shost->scan_mutex); |
174 | 175 | scsi_proc_host_rm(shost); |
... | ... | @@ -294,6 +295,8 @@ |
294 | 295 | |
295 | 296 | scsi_proc_hostdir_rm(shost->hostt); |
296 | 297 | |
298 | + if (shost->tmf_work_q) | |
299 | + destroy_workqueue(shost->tmf_work_q); | |
297 | 300 | if (shost->ehandler) |
298 | 301 | kthread_stop(shost->ehandler); |
299 | 302 | if (shost->work_q) |
... | ... | @@ -360,7 +363,6 @@ |
360 | 363 | INIT_LIST_HEAD(&shost->eh_cmd_q); |
361 | 364 | INIT_LIST_HEAD(&shost->starved_list); |
362 | 365 | init_waitqueue_head(&shost->host_wait); |
363 | - | |
364 | 366 | mutex_init(&shost->scan_mutex); |
365 | 367 | |
366 | 368 | /* |
367 | 369 | |
... | ... | @@ -444,9 +446,19 @@ |
444 | 446 | goto fail_kfree; |
445 | 447 | } |
446 | 448 | |
449 | + shost->tmf_work_q = alloc_workqueue("scsi_tmf_%d", | |
450 | + WQ_UNBOUND | WQ_MEM_RECLAIM, | |
451 | + 1, shost->host_no); | |
452 | + if (!shost->tmf_work_q) { | |
453 | + printk(KERN_WARNING "scsi%d: failed to create tmf workq\n", | |
454 | + shost->host_no); | |
455 | + goto fail_kthread; | |
456 | + } | |
447 | 457 | scsi_proc_hostdir_add(shost->hostt); |
448 | 458 | return shost; |
449 | 459 | |
460 | + fail_kthread: | |
461 | + kthread_stop(shost->ehandler); | |
450 | 462 | fail_kfree: |
451 | 463 | kfree(shost); |
452 | 464 | return NULL; |
drivers/scsi/scsi.c
... | ... | @@ -297,6 +297,7 @@ |
297 | 297 | |
298 | 298 | cmd->device = dev; |
299 | 299 | INIT_LIST_HEAD(&cmd->list); |
300 | + INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler); | |
300 | 301 | spin_lock_irqsave(&dev->list_lock, flags); |
301 | 302 | list_add_tail(&cmd->list, &dev->cmd_list); |
302 | 303 | spin_unlock_irqrestore(&dev->list_lock, flags); |
... | ... | @@ -352,6 +353,8 @@ |
352 | 353 | BUG_ON(list_empty(&cmd->list)); |
353 | 354 | list_del_init(&cmd->list); |
354 | 355 | spin_unlock_irqrestore(&cmd->device->list_lock, flags); |
356 | + | |
357 | + cancel_delayed_work(&cmd->abort_work); | |
355 | 358 | |
356 | 359 | __scsi_put_command(cmd->device->host, cmd, &sdev->sdev_gendev); |
357 | 360 | } |
drivers/scsi/scsi_error.c
... | ... | @@ -53,6 +53,8 @@ |
53 | 53 | #define HOST_RESET_SETTLE_TIME (10) |
54 | 54 | |
55 | 55 | static int scsi_eh_try_stu(struct scsi_cmnd *scmd); |
56 | +static int scsi_try_to_abort_cmd(struct scsi_host_template *, | |
57 | + struct scsi_cmnd *); | |
56 | 58 | |
57 | 59 | /* called with shost->host_lock held */ |
58 | 60 | void scsi_eh_wakeup(struct Scsi_Host *shost) |
... | ... | @@ -100,6 +102,116 @@ |
100 | 102 | } |
101 | 103 | |
102 | 104 | /** |
105 | + * scmd_eh_abort_handler - Handle command aborts | |
106 | + * @work: command to be aborted. | |
107 | + */ | |
108 | +void | |
109 | +scmd_eh_abort_handler(struct work_struct *work) | |
110 | +{ | |
111 | + struct scsi_cmnd *scmd = | |
112 | + container_of(work, struct scsi_cmnd, abort_work.work); | |
113 | + struct scsi_device *sdev = scmd->device; | |
114 | + unsigned long flags; | |
115 | + int rtn; | |
116 | + | |
117 | + spin_lock_irqsave(sdev->host->host_lock, flags); | |
118 | + if (scsi_host_eh_past_deadline(sdev->host)) { | |
119 | + spin_unlock_irqrestore(sdev->host->host_lock, flags); | |
120 | + SCSI_LOG_ERROR_RECOVERY(3, | |
121 | + scmd_printk(KERN_INFO, scmd, | |
122 | + "scmd %p eh timeout, not aborting\n", | |
123 | + scmd)); | |
124 | + } else { | |
125 | + spin_unlock_irqrestore(sdev->host->host_lock, flags); | |
126 | + SCSI_LOG_ERROR_RECOVERY(3, | |
127 | + scmd_printk(KERN_INFO, scmd, | |
128 | + "aborting command %p\n", scmd)); | |
129 | + rtn = scsi_try_to_abort_cmd(sdev->host->hostt, scmd); | |
130 | + if (rtn == SUCCESS) { | |
131 | + scmd->result |= DID_TIME_OUT << 16; | |
132 | + if (!scsi_noretry_cmd(scmd) && | |
133 | + (++scmd->retries <= scmd->allowed)) { | |
134 | + SCSI_LOG_ERROR_RECOVERY(3, | |
135 | + scmd_printk(KERN_WARNING, scmd, | |
136 | + "scmd %p retry " | |
137 | + "aborted command\n", scmd)); | |
138 | + scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY); | |
139 | + } else { | |
140 | + SCSI_LOG_ERROR_RECOVERY(3, | |
141 | + scmd_printk(KERN_WARNING, scmd, | |
142 | + "scmd %p finish " | |
143 | + "aborted command\n", scmd)); | |
144 | + scsi_finish_command(scmd); | |
145 | + } | |
146 | + return; | |
147 | + } | |
148 | + SCSI_LOG_ERROR_RECOVERY(3, | |
149 | + scmd_printk(KERN_INFO, scmd, | |
150 | + "scmd %p abort failed, rtn %d\n", | |
151 | + scmd, rtn)); | |
152 | + } | |
153 | + | |
154 | + if (!scsi_eh_scmd_add(scmd, 0)) { | |
155 | + SCSI_LOG_ERROR_RECOVERY(3, | |
156 | + scmd_printk(KERN_WARNING, scmd, | |
157 | + "scmd %p terminate " | |
158 | + "aborted command\n", scmd)); | |
159 | + scmd->result |= DID_TIME_OUT << 16; | |
160 | + scsi_finish_command(scmd); | |
161 | + } | |
162 | +} | |
163 | + | |
164 | +/** | |
165 | + * scsi_abort_command - schedule a command abort | |
166 | + * @scmd: scmd to abort. | |
167 | + * | |
168 | + * We only need to abort commands after a command timeout | |
169 | + */ | |
170 | +static int | |
171 | +scsi_abort_command(struct scsi_cmnd *scmd) | |
172 | +{ | |
173 | + struct scsi_device *sdev = scmd->device; | |
174 | + struct Scsi_Host *shost = sdev->host; | |
175 | + unsigned long flags; | |
176 | + | |
177 | + if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) { | |
178 | + /* | |
179 | + * Retry after abort failed, escalate to next level. | |
180 | + */ | |
181 | + SCSI_LOG_ERROR_RECOVERY(3, | |
182 | + scmd_printk(KERN_INFO, scmd, | |
183 | + "scmd %p previous abort failed\n", scmd)); | |
184 | + cancel_delayed_work(&scmd->abort_work); | |
185 | + return FAILED; | |
186 | + } | |
187 | + | |
188 | + /* | |
189 | + * Do not try a command abort if | |
190 | + * SCSI EH has already started. | |
191 | + */ | |
192 | + spin_lock_irqsave(shost->host_lock, flags); | |
193 | + if (scsi_host_in_recovery(shost)) { | |
194 | + spin_unlock_irqrestore(shost->host_lock, flags); | |
195 | + SCSI_LOG_ERROR_RECOVERY(3, | |
196 | + scmd_printk(KERN_INFO, scmd, | |
197 | + "scmd %p not aborting, host in recovery\n", | |
198 | + scmd)); | |
199 | + return FAILED; | |
200 | + } | |
201 | + | |
202 | + if (shost->eh_deadline && !shost->last_reset) | |
203 | + shost->last_reset = jiffies; | |
204 | + spin_unlock_irqrestore(shost->host_lock, flags); | |
205 | + | |
206 | + scmd->eh_eflags |= SCSI_EH_ABORT_SCHEDULED; | |
207 | + SCSI_LOG_ERROR_RECOVERY(3, | |
208 | + scmd_printk(KERN_INFO, scmd, | |
209 | + "scmd %p abort scheduled\n", scmd)); | |
210 | + queue_delayed_work(shost->tmf_work_q, &scmd->abort_work, HZ / 100); | |
211 | + return SUCCESS; | |
212 | +} | |
213 | + | |
214 | +/** | |
103 | 215 | * scsi_eh_scmd_add - add scsi cmd to error handling. |
104 | 216 | * @scmd: scmd to run eh on. |
105 | 217 | * @eh_flag: optional SCSI_EH flag. |
... | ... | @@ -125,6 +237,8 @@ |
125 | 237 | shost->last_reset = jiffies; |
126 | 238 | |
127 | 239 | ret = 1; |
240 | + if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) | |
241 | + eh_flag &= ~SCSI_EH_CANCEL_CMD; | |
128 | 242 | scmd->eh_eflags |= eh_flag; |
129 | 243 | list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q); |
130 | 244 | shost->host_failed++; |
... | ... | @@ -161,6 +275,10 @@ |
161 | 275 | else if (host->hostt->eh_timed_out) |
162 | 276 | rtn = host->hostt->eh_timed_out(scmd); |
163 | 277 | |
278 | + if (rtn == BLK_EH_NOT_HANDLED && !host->hostt->no_async_abort) | |
279 | + if (scsi_abort_command(scmd) == SUCCESS) | |
280 | + return BLK_EH_NOT_HANDLED; | |
281 | + | |
164 | 282 | scmd->result |= DID_TIME_OUT << 16; |
165 | 283 | |
166 | 284 | if (unlikely(rtn == BLK_EH_NOT_HANDLED && |
... | ... | @@ -1577,7 +1695,7 @@ |
1577 | 1695 | } |
1578 | 1696 | |
1579 | 1697 | /** |
1580 | - * scsi_noretry_cmd - determinte if command should be failed fast | |
1698 | + * scsi_noretry_cmd - determine if command should be failed fast | |
1581 | 1699 | * @scmd: SCSI cmd to examine. |
1582 | 1700 | */ |
1583 | 1701 | int scsi_noretry_cmd(struct scsi_cmnd *scmd) |
... | ... | @@ -1585,6 +1703,8 @@ |
1585 | 1703 | switch (host_byte(scmd->result)) { |
1586 | 1704 | case DID_OK: |
1587 | 1705 | break; |
1706 | + case DID_TIME_OUT: | |
1707 | + goto check_type; | |
1588 | 1708 | case DID_BUS_BUSY: |
1589 | 1709 | return (scmd->request->cmd_flags & REQ_FAILFAST_TRANSPORT); |
1590 | 1710 | case DID_PARITY: |
1591 | 1711 | |
... | ... | @@ -1598,18 +1718,19 @@ |
1598 | 1718 | return (scmd->request->cmd_flags & REQ_FAILFAST_DRIVER); |
1599 | 1719 | } |
1600 | 1720 | |
1601 | - switch (status_byte(scmd->result)) { | |
1602 | - case CHECK_CONDITION: | |
1603 | - /* | |
1604 | - * assume caller has checked sense and determinted | |
1605 | - * the check condition was retryable. | |
1606 | - */ | |
1607 | - if (scmd->request->cmd_flags & REQ_FAILFAST_DEV || | |
1608 | - scmd->request->cmd_type == REQ_TYPE_BLOCK_PC) | |
1609 | - return 1; | |
1610 | - } | |
1721 | + if (status_byte(scmd->result) != CHECK_CONDITION) | |
1722 | + return 0; | |
1611 | 1723 | |
1612 | - return 0; | |
1724 | +check_type: | |
1725 | + /* | |
1726 | + * assume caller has checked sense and determined | |
1727 | + * the check condition was retryable. | |
1728 | + */ | |
1729 | + if (scmd->request->cmd_flags & REQ_FAILFAST_DEV || | |
1730 | + scmd->request->cmd_type == REQ_TYPE_BLOCK_PC) | |
1731 | + return 1; | |
1732 | + else | |
1733 | + return 0; | |
1613 | 1734 | } |
1614 | 1735 | |
1615 | 1736 | /** |
1616 | 1737 | |
... | ... | @@ -1659,9 +1780,13 @@ |
1659 | 1780 | * looks good. drop through, and check the next byte. |
1660 | 1781 | */ |
1661 | 1782 | break; |
1783 | + case DID_ABORT: | |
1784 | + if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) { | |
1785 | + scmd->result |= DID_TIME_OUT << 16; | |
1786 | + return SUCCESS; | |
1787 | + } | |
1662 | 1788 | case DID_NO_CONNECT: |
1663 | 1789 | case DID_BAD_TARGET: |
1664 | - case DID_ABORT: | |
1665 | 1790 | /* |
1666 | 1791 | * note - this means that we just report the status back |
1667 | 1792 | * to the top level driver, not that we actually think |
drivers/scsi/scsi_priv.h
... | ... | @@ -19,6 +19,7 @@ |
19 | 19 | * Scsi Error Handler Flags |
20 | 20 | */ |
21 | 21 | #define SCSI_EH_CANCEL_CMD 0x0001 /* Cancel this cmd */ |
22 | +#define SCSI_EH_ABORT_SCHEDULED 0x0002 /* Abort has been scheduled */ | |
22 | 23 | |
23 | 24 | #define SCSI_SENSE_VALID(scmd) \ |
24 | 25 | (((scmd)->sense_buffer[0] & 0x70) == 0x70) |
... | ... | @@ -66,6 +67,7 @@ |
66 | 67 | extern void scsi_exit_devinfo(void); |
67 | 68 | |
68 | 69 | /* scsi_error.c */ |
70 | +extern void scmd_eh_abort_handler(struct work_struct *work); | |
69 | 71 | extern enum blk_eh_timer_return scsi_times_out(struct request *req); |
70 | 72 | extern int scsi_error_handler(void *host); |
71 | 73 | extern int scsi_decide_disposition(struct scsi_cmnd *cmd); |
include/scsi/scsi_cmnd.h
include/scsi/scsi_host.h
... | ... | @@ -479,6 +479,11 @@ |
479 | 479 | unsigned no_write_same:1; |
480 | 480 | |
481 | 481 | /* |
482 | + * True if asynchronous aborts are not supported | |
483 | + */ | |
484 | + unsigned no_async_abort:1; | |
485 | + | |
486 | + /* | |
482 | 487 | * Countdown for host blocking with no commands outstanding. |
483 | 488 | */ |
484 | 489 | unsigned int max_host_blocked; |
... | ... | @@ -688,6 +693,11 @@ |
688 | 693 | */ |
689 | 694 | char work_q_name[20]; |
690 | 695 | struct workqueue_struct *work_q; |
696 | + | |
697 | + /* | |
698 | + * Task management function work queue | |
699 | + */ | |
700 | + struct workqueue_struct *tmf_work_q; | |
691 | 701 | |
692 | 702 | /* |
693 | 703 | * Host has rejected a command because it was busy. |