Commit e494f6a728394ab0df194342549ee20e6f0752df

Authored by Hannes Reinecke
Committed by James Bottomley
1 parent 2451079bc2

[SCSI] improved eh timeout handler

When a command runs into a timeout we need to send an 'ABORT TASK'
TMF. This is typically done by the 'eh_abort_handler' LLDD callback.

Conceptually, however, this function is a normal SCSI command, so
there is no need to enter the error handler.

This patch implements a new scsi_abort_command() function which
schedules an asynchronous work function, scmd_eh_abort_handler(), to
abort the command via the usual 'eh_abort_handler'.

If the abort succeeds the command is either retried or terminated,
depending on the number of allowed retries. However, 'eh_eflags'
records the abort, so if the retried command times out again it is
pushed onto the error handler without trying to abort it (again);
it will be cleaned up by SCSI EH.

[hare: smatch detected stray switch fixed]
Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>

Showing 6 changed files with 167 additions and 14 deletions Side-by-side Diff

drivers/scsi/hosts.c
... ... @@ -169,6 +169,7 @@
169 169 spin_unlock_irqrestore(shost->host_lock, flags);
170 170  
171 171 scsi_autopm_get_host(shost);
  172 + flush_workqueue(shost->tmf_work_q);
172 173 scsi_forget_host(shost);
173 174 mutex_unlock(&shost->scan_mutex);
174 175 scsi_proc_host_rm(shost);
... ... @@ -294,6 +295,8 @@
294 295  
295 296 scsi_proc_hostdir_rm(shost->hostt);
296 297  
  298 + if (shost->tmf_work_q)
  299 + destroy_workqueue(shost->tmf_work_q);
297 300 if (shost->ehandler)
298 301 kthread_stop(shost->ehandler);
299 302 if (shost->work_q)
... ... @@ -360,7 +363,6 @@
360 363 INIT_LIST_HEAD(&shost->eh_cmd_q);
361 364 INIT_LIST_HEAD(&shost->starved_list);
362 365 init_waitqueue_head(&shost->host_wait);
363   -
364 366 mutex_init(&shost->scan_mutex);
365 367  
366 368 /*
367 369  
... ... @@ -444,9 +446,19 @@
444 446 goto fail_kfree;
445 447 }
446 448  
  449 + shost->tmf_work_q = alloc_workqueue("scsi_tmf_%d",
  450 + WQ_UNBOUND | WQ_MEM_RECLAIM,
  451 + 1, shost->host_no);
  452 + if (!shost->tmf_work_q) {
  453 + printk(KERN_WARNING "scsi%d: failed to create tmf workq\n",
  454 + shost->host_no);
  455 + goto fail_kthread;
  456 + }
447 457 scsi_proc_hostdir_add(shost->hostt);
448 458 return shost;
449 459  
  460 + fail_kthread:
  461 + kthread_stop(shost->ehandler);
450 462 fail_kfree:
451 463 kfree(shost);
452 464 return NULL;
drivers/scsi/scsi.c
... ... @@ -297,6 +297,7 @@
297 297  
298 298 cmd->device = dev;
299 299 INIT_LIST_HEAD(&cmd->list);
  300 + INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler);
300 301 spin_lock_irqsave(&dev->list_lock, flags);
301 302 list_add_tail(&cmd->list, &dev->cmd_list);
302 303 spin_unlock_irqrestore(&dev->list_lock, flags);
... ... @@ -352,6 +353,8 @@
352 353 BUG_ON(list_empty(&cmd->list));
353 354 list_del_init(&cmd->list);
354 355 spin_unlock_irqrestore(&cmd->device->list_lock, flags);
  356 +
  357 + cancel_delayed_work(&cmd->abort_work);
355 358  
356 359 __scsi_put_command(cmd->device->host, cmd, &sdev->sdev_gendev);
357 360 }
drivers/scsi/scsi_error.c
... ... @@ -53,6 +53,8 @@
53 53 #define HOST_RESET_SETTLE_TIME (10)
54 54  
55 55 static int scsi_eh_try_stu(struct scsi_cmnd *scmd);
  56 +static int scsi_try_to_abort_cmd(struct scsi_host_template *,
  57 + struct scsi_cmnd *);
56 58  
57 59 /* called with shost->host_lock held */
58 60 void scsi_eh_wakeup(struct Scsi_Host *shost)
... ... @@ -100,6 +102,116 @@
100 102 }
101 103  
102 104 /**
  105 + * scmd_eh_abort_handler - Handle command aborts
  106 + * @work: command to be aborted.
  107 + */
  108 +void
  109 +scmd_eh_abort_handler(struct work_struct *work)
  110 +{
  111 + struct scsi_cmnd *scmd =
  112 + container_of(work, struct scsi_cmnd, abort_work.work);
  113 + struct scsi_device *sdev = scmd->device;
  114 + unsigned long flags;
  115 + int rtn;
  116 +
  117 + spin_lock_irqsave(sdev->host->host_lock, flags);
  118 + if (scsi_host_eh_past_deadline(sdev->host)) {
  119 + spin_unlock_irqrestore(sdev->host->host_lock, flags);
  120 + SCSI_LOG_ERROR_RECOVERY(3,
  121 + scmd_printk(KERN_INFO, scmd,
  122 + "scmd %p eh timeout, not aborting\n",
  123 + scmd));
  124 + } else {
  125 + spin_unlock_irqrestore(sdev->host->host_lock, flags);
  126 + SCSI_LOG_ERROR_RECOVERY(3,
  127 + scmd_printk(KERN_INFO, scmd,
  128 + "aborting command %p\n", scmd));
  129 + rtn = scsi_try_to_abort_cmd(sdev->host->hostt, scmd);
  130 + if (rtn == SUCCESS) {
  131 + scmd->result |= DID_TIME_OUT << 16;
  132 + if (!scsi_noretry_cmd(scmd) &&
  133 + (++scmd->retries <= scmd->allowed)) {
  134 + SCSI_LOG_ERROR_RECOVERY(3,
  135 + scmd_printk(KERN_WARNING, scmd,
  136 + "scmd %p retry "
  137 + "aborted command\n", scmd));
  138 + scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
  139 + } else {
  140 + SCSI_LOG_ERROR_RECOVERY(3,
  141 + scmd_printk(KERN_WARNING, scmd,
  142 + "scmd %p finish "
  143 + "aborted command\n", scmd));
  144 + scsi_finish_command(scmd);
  145 + }
  146 + return;
  147 + }
  148 + SCSI_LOG_ERROR_RECOVERY(3,
  149 + scmd_printk(KERN_INFO, scmd,
  150 + "scmd %p abort failed, rtn %d\n",
  151 + scmd, rtn));
  152 + }
  153 +
  154 + if (!scsi_eh_scmd_add(scmd, 0)) {
  155 + SCSI_LOG_ERROR_RECOVERY(3,
  156 + scmd_printk(KERN_WARNING, scmd,
  157 + "scmd %p terminate "
  158 + "aborted command\n", scmd));
  159 + scmd->result |= DID_TIME_OUT << 16;
  160 + scsi_finish_command(scmd);
  161 + }
  162 +}
  163 +
  164 +/**
  165 + * scsi_abort_command - schedule a command abort
  166 + * @scmd: scmd to abort.
  167 + *
  168 + * We only need to abort commands after a command timeout
  169 + */
  170 +static int
  171 +scsi_abort_command(struct scsi_cmnd *scmd)
  172 +{
  173 + struct scsi_device *sdev = scmd->device;
  174 + struct Scsi_Host *shost = sdev->host;
  175 + unsigned long flags;
  176 +
  177 + if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) {
  178 + /*
  179 + * Retry after abort failed, escalate to next level.
  180 + */
  181 + SCSI_LOG_ERROR_RECOVERY(3,
  182 + scmd_printk(KERN_INFO, scmd,
  183 + "scmd %p previous abort failed\n", scmd));
  184 + cancel_delayed_work(&scmd->abort_work);
  185 + return FAILED;
  186 + }
  187 +
  188 + /*
  189 + * Do not try a command abort if
  190 + * SCSI EH has already started.
  191 + */
  192 + spin_lock_irqsave(shost->host_lock, flags);
  193 + if (scsi_host_in_recovery(shost)) {
  194 + spin_unlock_irqrestore(shost->host_lock, flags);
  195 + SCSI_LOG_ERROR_RECOVERY(3,
  196 + scmd_printk(KERN_INFO, scmd,
  197 + "scmd %p not aborting, host in recovery\n",
  198 + scmd));
  199 + return FAILED;
  200 + }
  201 +
  202 + if (shost->eh_deadline && !shost->last_reset)
  203 + shost->last_reset = jiffies;
  204 + spin_unlock_irqrestore(shost->host_lock, flags);
  205 +
  206 + scmd->eh_eflags |= SCSI_EH_ABORT_SCHEDULED;
  207 + SCSI_LOG_ERROR_RECOVERY(3,
  208 + scmd_printk(KERN_INFO, scmd,
  209 + "scmd %p abort scheduled\n", scmd));
  210 + queue_delayed_work(shost->tmf_work_q, &scmd->abort_work, HZ / 100);
  211 + return SUCCESS;
  212 +}
  213 +
  214 +/**
103 215 * scsi_eh_scmd_add - add scsi cmd to error handling.
104 216 * @scmd: scmd to run eh on.
105 217 * @eh_flag: optional SCSI_EH flag.
... ... @@ -125,6 +237,8 @@
125 237 shost->last_reset = jiffies;
126 238  
127 239 ret = 1;
  240 + if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED)
  241 + eh_flag &= ~SCSI_EH_CANCEL_CMD;
128 242 scmd->eh_eflags |= eh_flag;
129 243 list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q);
130 244 shost->host_failed++;
... ... @@ -161,6 +275,10 @@
161 275 else if (host->hostt->eh_timed_out)
162 276 rtn = host->hostt->eh_timed_out(scmd);
163 277  
  278 + if (rtn == BLK_EH_NOT_HANDLED && !host->hostt->no_async_abort)
  279 + if (scsi_abort_command(scmd) == SUCCESS)
  280 + return BLK_EH_NOT_HANDLED;
  281 +
164 282 scmd->result |= DID_TIME_OUT << 16;
165 283  
166 284 if (unlikely(rtn == BLK_EH_NOT_HANDLED &&
... ... @@ -1577,7 +1695,7 @@
1577 1695 }
1578 1696  
1579 1697 /**
1580   - * scsi_noretry_cmd - determinte if command should be failed fast
  1698 + * scsi_noretry_cmd - determine if command should be failed fast
1581 1699 * @scmd: SCSI cmd to examine.
1582 1700 */
1583 1701 int scsi_noretry_cmd(struct scsi_cmnd *scmd)
... ... @@ -1585,6 +1703,8 @@
1585 1703 switch (host_byte(scmd->result)) {
1586 1704 case DID_OK:
1587 1705 break;
  1706 + case DID_TIME_OUT:
  1707 + goto check_type;
1588 1708 case DID_BUS_BUSY:
1589 1709 return (scmd->request->cmd_flags & REQ_FAILFAST_TRANSPORT);
1590 1710 case DID_PARITY:
1591 1711  
... ... @@ -1598,18 +1718,19 @@
1598 1718 return (scmd->request->cmd_flags & REQ_FAILFAST_DRIVER);
1599 1719 }
1600 1720  
1601   - switch (status_byte(scmd->result)) {
1602   - case CHECK_CONDITION:
1603   - /*
1604   - * assume caller has checked sense and determinted
1605   - * the check condition was retryable.
1606   - */
1607   - if (scmd->request->cmd_flags & REQ_FAILFAST_DEV ||
1608   - scmd->request->cmd_type == REQ_TYPE_BLOCK_PC)
1609   - return 1;
1610   - }
  1721 + if (status_byte(scmd->result) != CHECK_CONDITION)
  1722 + return 0;
1611 1723  
1612   - return 0;
  1724 +check_type:
  1725 + /*
  1726 + * assume caller has checked sense and determined
  1727 + * the check condition was retryable.
  1728 + */
  1729 + if (scmd->request->cmd_flags & REQ_FAILFAST_DEV ||
  1730 + scmd->request->cmd_type == REQ_TYPE_BLOCK_PC)
  1731 + return 1;
  1732 + else
  1733 + return 0;
1613 1734 }
1614 1735  
1615 1736 /**
1616 1737  
... ... @@ -1659,9 +1780,13 @@
1659 1780 * looks good. drop through, and check the next byte.
1660 1781 */
1661 1782 break;
  1783 + case DID_ABORT:
  1784 + if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) {
  1785 + scmd->result |= DID_TIME_OUT << 16;
  1786 + return SUCCESS;
  1787 + }
1662 1788 case DID_NO_CONNECT:
1663 1789 case DID_BAD_TARGET:
1664   - case DID_ABORT:
1665 1790 /*
1666 1791 * note - this means that we just report the status back
1667 1792 * to the top level driver, not that we actually think
drivers/scsi/scsi_priv.h
... ... @@ -19,6 +19,7 @@
19 19 * Scsi Error Handler Flags
20 20 */
21 21 #define SCSI_EH_CANCEL_CMD 0x0001 /* Cancel this cmd */
  22 +#define SCSI_EH_ABORT_SCHEDULED 0x0002 /* Abort has been scheduled */
22 23  
23 24 #define SCSI_SENSE_VALID(scmd) \
24 25 (((scmd)->sense_buffer[0] & 0x70) == 0x70)
... ... @@ -66,6 +67,7 @@
66 67 extern void scsi_exit_devinfo(void);
67 68  
68 69 /* scsi_error.c */
  70 +extern void scmd_eh_abort_handler(struct work_struct *work);
69 71 extern enum blk_eh_timer_return scsi_times_out(struct request *req);
70 72 extern int scsi_error_handler(void *host);
71 73 extern int scsi_decide_disposition(struct scsi_cmnd *cmd);
include/scsi/scsi_cmnd.h
... ... @@ -55,6 +55,7 @@
55 55 struct scsi_device *device;
56 56 struct list_head list; /* scsi_cmnd participates in queue lists */
57 57 struct list_head eh_entry; /* entry for the host eh_cmd_q */
  58 + struct delayed_work abort_work;
58 59 int eh_eflags; /* Used by error handlr */
59 60  
60 61 /*
include/scsi/scsi_host.h
... ... @@ -479,6 +479,11 @@
479 479 unsigned no_write_same:1;
480 480  
481 481 /*
  482 + * True if asynchronous aborts are not supported
  483 + */
  484 + unsigned no_async_abort:1;
  485 +
  486 + /*
482 487 * Countdown for host blocking with no commands outstanding.
483 488 */
484 489 unsigned int max_host_blocked;
... ... @@ -688,6 +693,11 @@
688 693 */
689 694 char work_q_name[20];
690 695 struct workqueue_struct *work_q;
  696 +
  697 + /*
  698 + * Task management function work queue
  699 + */
  700 + struct workqueue_struct *tmf_work_q;
691 701  
692 702 /*
693 703 * Host has rejected a command because it was busy.