cciss: fix SCSI device reset handler
Stephen M. Cameron [Tue, 2 Jun 2009 12:48:11 +0000 (14:48 +0200)]
Fix the SCSI reset error handler to send a working, properly addressed
reset message to the target device and add code to wait for the target
device to become ready by polling it with Test Unit Ready.

The existing reset code was broken in that it didn't bother to set the
8-byte LUN address to anything besides zero, so the command was addressed
to the controller, which pretended to the driver that the command
succeeded, while doing nothing.  Ages ago I tested this code, but
unbeknownst to me, my test was flawed, and what I thought was a tape drive
getting reset was actually nothing of the sort.  Unfortunately, there is
still a lot of Smartarray firmware that doesn't handle target resets
correctly, and this code won't help in those cases, but it also shouldn't
make things worse in those cases than they already are.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Cc: Mike Miller <mikem@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

drivers/block/cciss.c
drivers/block/cciss_scsi.c

index 8d0f893..cb43fb3 100644 (file)
@@ -1974,6 +1974,13 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
                        c->Request.CDB[0] = BMIC_WRITE;
                        c->Request.CDB[6] = BMIC_CACHE_FLUSH;
                        break;
+               case TEST_UNIT_READY:
+                       memcpy(c->Header. LUN.LunAddrBytes, scsi3addr, 8);
+                       c->Request.CDBLen = 6;
+                       c->Request.Type.Attribute = ATTR_SIMPLE;
+                       c->Request.Type.Direction = XFER_NONE;
+                       c->Request.Timeout = 0;
+                       break;
                default:
                        printk(KERN_WARNING
                               "cciss%d:  Unknown Command 0x%c\n", ctlr, cmd);
@@ -1992,13 +1999,14 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
                        memcpy(&c->Request.CDB[4], buff, 8);
                        break;
                case 1: /* RESET message */
-                       c->Request.CDBLen = 12;
+                       memcpy(c->Header.LUN.LunAddrBytes, scsi3addr, 8);
+                       c->Request.CDBLen = 16;
                        c->Request.Type.Attribute = ATTR_SIMPLE;
-                       c->Request.Type.Direction = XFER_WRITE;
+                       c->Request.Type.Direction = XFER_NONE;
                        c->Request.Timeout = 0;
                        memset(&c->Request.CDB[0], 0, sizeof(c->Request.CDB));
                        c->Request.CDB[0] = cmd;        /* reset */
-                       c->Request.CDB[1] = 0x04;       /* reset a LUN */
+                       c->Request.CDB[1] = 0x03;       /* reset a target */
                        break;
                case 3: /* No-Op message */
                        c->Request.CDBLen = 1;
index a3fd87b..8575c48 100644 (file)
@@ -58,6 +58,18 @@ static int sendcmd(
        unsigned char *scsi3addr,
        int cmd_type);
 
+static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
+       size_t size,
+       unsigned int use_unit_num, /* 0: address the controller,
+                                     1: address logical volume log_unit,
+                                     2: periph device address is scsi3addr */
+       unsigned int log_unit, __u8 page_code, unsigned char *scsi3addr,
+       int cmd_type);
+
+static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c);
+
+static CommandList_struct *cmd_alloc(ctlr_info_t *h, int get_from_pool);
+static void cmd_free(ctlr_info_t *h, CommandList_struct *c, int got_from_pool);
 
 static int cciss_scsi_proc_info(
                struct Scsi_Host *sh,
@@ -1575,6 +1587,68 @@ cciss_seq_tape_report(struct seq_file *seq, int ctlr)
        CPQ_TAPE_UNLOCK(ctlr, flags);
 }
 
+static int wait_for_device_to_become_ready(ctlr_info_t *h,
+       unsigned char lunaddr[])
+{
+       int rc; /* status of the latest step; 0 at loop exit means the device answered as ready */
+       int count = 0; /* number of polling attempts made so far */
+       int waittime = HZ; /* jiffies to sleep before the next poll; starts at one second */
+       CommandList_struct *c;
+       /* Poll the device at lunaddr with Test Unit Ready; return 0 once ready, nonzero (IO_ERROR if no command could be allocated) otherwise. */
+       c = cmd_alloc(h, 1);
+       if (!c) {
+               printk(KERN_WARNING "cciss%d: out of memory in "
+                       "wait_for_device_to_become_ready.\n", h->ctlr);
+               return IO_ERROR;
+       }
+
+       /* Send Test Unit Ready until the device reports ready, or give up after 20 tries. */
+       while (count < 20) {
+
+               /* Sleep before polling, even on the first pass: a TUR sent
+                * right after the reset would just be aborted by the reset.
+                */
+               set_current_state(TASK_INTERRUPTIBLE);
+               schedule_timeout(waittime);
+               count++;
+
+               /* Double the wait for the next try; doubling stops once waittime reaches HZ * 30 or more. */
+               if (waittime < (HZ * 30))
+                       waittime = waittime * 2;
+
+               /* Build the Test Unit Ready addressed to lunaddr, reusing the same command buffer. */
+               rc = fill_cmd(c, TEST_UNIT_READY, h->ctlr, NULL, 0, 0, 0, 0,
+                       lunaddr, TYPE_CMD);
+               if (rc == 0) {
+                       rc = sendcmd_core(h, c);
+                       /* sendcmd turned off interrupts, turn 'em back on. */
+                       h->access.set_intr_mask(h, CCISS_INTR_ON);
+               }
+               /* Ready: the command was sent and completed with CMD_SUCCESS. */
+               if (rc == 0 && c->err_info->CommandStatus == CMD_SUCCESS)
+                       break;
+               /* Also treated as ready: check condition whose SenseInfo[2] is NO_SENSE or UNIT_ATTENTION. */
+               if (rc == 0 &&
+                       c->err_info->CommandStatus == CMD_TARGET_STATUS &&
+                       c->err_info->ScsiStatus == SAM_STAT_CHECK_CONDITION &&
+                       (c->err_info->SenseInfo[2] == NO_SENSE ||
+                       c->err_info->SenseInfo[2] == UNIT_ATTENTION))
+                       break;
+               /* Not ready. waittime was already updated above, so the value logged is the upcoming sleep. */
+               printk(KERN_WARNING "cciss%d: Waiting %d secs "
+                       "for device to become ready.\n",
+                       h->ctlr, waittime / HZ);
+               rc = 1; /* device not ready; stays nonzero if the loop runs out of tries. */
+       }
+       /* Both break paths above require rc == 0, so a nonzero rc here means every attempt failed. */
+       if (rc)
+               printk("cciss%d: giving up on device.\n", h->ctlr);
+       else
+               printk(KERN_WARNING "cciss%d: device is ready.\n", h->ctlr);
+
+       cmd_free(h, c, 1);
+       return rc;
+}
 
 /* Need at least one of these error handlers to keep ../scsi/hosts.c from 
  * complaining.  Doing a host- or bus-reset can't do anything good here. 
@@ -1591,6 +1665,7 @@ static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd)
 {
        int rc;
        CommandList_struct *cmd_in_trouble;
+       unsigned char lunaddr[8];
        ctlr_info_t **c;
        int ctlr;
 
@@ -1600,19 +1675,17 @@ static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd)
                return FAILED;
        ctlr = (*c)->ctlr;
        printk(KERN_WARNING "cciss%d: resetting tape drive or medium changer.\n", ctlr);
-
        /* find the command that's giving us trouble */
        cmd_in_trouble = (CommandList_struct *) scsicmd->host_scribble;
-       if (cmd_in_trouble == NULL) { /* paranoia */
+       if (cmd_in_trouble == NULL) /* paranoia */
                return FAILED;
-       }
+       memcpy(lunaddr, &cmd_in_trouble->Header.LUN.LunAddrBytes[0], 8);
        /* send a reset to the SCSI LUN which the command was sent to */
-       rc = sendcmd(CCISS_RESET_MSG, ctlr, NULL, 0, 2, 0, 0, 
-               (unsigned char *) &cmd_in_trouble->Header.LUN.LunAddrBytes[0], 
+       rc = sendcmd(CCISS_RESET_MSG, ctlr, NULL, 0, 2, 0, 0, lunaddr,
                TYPE_MSG);
        /* sendcmd turned off interrupts on the board, turn 'em back on. */
        (*c)->access.set_intr_mask(*c, CCISS_INTR_ON);
-       if (rc == 0)
+       if (rc == 0 && wait_for_device_to_become_ready(*c, lunaddr) == 0)
                return SUCCESS;
        printk(KERN_WARNING "cciss%d: resetting device failed.\n", ctlr);
        return FAILED;