[PATCH] libata-eh-fw: update ata_scsi_error() for new EH

Update ata_scsi_error() for new EH. ata_scsi_error() is responsible for claiming timed out qcs and invoking ->error_handler in safe and synchronized manner. As the state of the controller is unknown if a qc has timed out, the port is frozen in such cases. Note that ata_scsi_timed_out() isn't used for new EH. This is because a timed out qc cannot be claimed by EH without freezing the port and freezing the port in ata_scsi_timed_out() results in unnecessary abortion of other active qcs. ata_scsi_timed_out() can be removed once all drivers are converted to new EH. While at it, add 'TODO: kill' comments to old EH functions. Signed-off-by: Tejun Heo <htejun@gmail.com>
2024-11-24 13:11:40 +00:00 · 2006-05-15 20:58:12 +09:00 · 2006-05-15 20:58:12 +09:00 · ad9e276244
commit ad9e276244
parent dafadcde8d
2 changed files with 134 additions and 5 deletions
--- a/drivers/scsi/libata-eh.c
+++ b/drivers/scsi/libata-eh.c
@ -44,6 +44,8 @@

 #include "libata.h"

+static void __ata_port_freeze(struct ata_port *ap);
+
 /**
 *	ata_scsi_timed_out - SCSI layer time out callback
 *	@cmd: timed out SCSI command
@ -55,6 +57,8 @@
 *	from finishing it by setting EH_SCHEDULED and return
 *	EH_NOT_HANDLED.
 *
+ *	TODO: kill this function once old EH is gone.
+ *
 *	LOCKING:
 *	Called from timer context
 *
@ -67,10 +71,16 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
 	struct ata_port *ap = ata_shost_to_port(host);
 	unsigned long flags;
 	struct ata_queued_cmd *qc;
-	enum scsi_eh_timer_return ret = EH_HANDLED;
+	enum scsi_eh_timer_return ret;

 	DPRINTK("ENTER\n");

+	if (ap->ops->error_handler) {
+		ret = EH_NOT_HANDLED;
+		goto out;
+	}
+
+	ret = EH_HANDLED;
 	spin_lock_irqsave(&ap->host_set->lock, flags);
 	qc = ata_qc_from_tag(ap, ap->active_tag);
 	if (qc) {
@ -81,6 +91,7 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
 	}
 	spin_unlock_irqrestore(&ap->host_set->lock, flags);

+ out:
 	DPRINTK("EXIT, ret=%d\n", ret);
 	return ret;
 }
@ -100,21 +111,132 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
 void ata_scsi_error(struct Scsi_Host *host)
 {
 	struct ata_port *ap = ata_shost_to_port(host);
+	spinlock_t *hs_lock = &ap->host_set->lock;
+	int i, repeat_cnt = ATA_EH_MAX_REPEAT;
+	unsigned long flags;

 	DPRINTK("ENTER\n");

-	/* synchronize with IRQ handler and port task */
-	spin_unlock_wait(&ap->host_set->lock);
+	/* synchronize with port task */
 	ata_port_flush_task(ap);

-	WARN_ON(ata_qc_from_tag(ap, ap->active_tag) == NULL);
+	/* synchronize with host_set lock and sort out timeouts */

-	ap->ops->eng_timeout(ap);
+	/* For new EH, all qcs are finished in one of three ways -
+	 * normal completion, error completion, and SCSI timeout.
+	 * Both completions can race against SCSI timeout.  When normal
+	 * completion wins, the qc never reaches EH.  When error
+	 * completion wins, the qc has ATA_QCFLAG_FAILED set.
+	 *
+	 * When SCSI timeout wins, things are a bit more complex.
+	 * Normal or error completion can occur after the timeout but
+	 * before this point.  In such cases, both types of
+	 * completions are honored.  A scmd is determined to have
+	 * timed out iff its associated qc is active and not failed.
+	 */
+	if (ap->ops->error_handler) {
+		struct scsi_cmnd *scmd, *tmp;
+		int nr_timedout = 0;

+		spin_lock_irqsave(hs_lock, flags);
+
+		list_for_each_entry_safe(scmd, tmp, &host->eh_cmd_q, eh_entry) {
+			struct ata_queued_cmd *qc;
+
+			for (i = 0; i < ATA_MAX_QUEUE; i++) {
+				qc = __ata_qc_from_tag(ap, i);
+				if (qc->flags & ATA_QCFLAG_ACTIVE &&
+				    qc->scsicmd == scmd)
+					break;
+			}
+
+			if (i < ATA_MAX_QUEUE) {
+				/* the scmd has an associated qc */
+				if (!(qc->flags & ATA_QCFLAG_FAILED)) {
+					/* which hasn't failed yet, timeout */
+					qc->err_mask |= AC_ERR_TIMEOUT;
+					qc->flags |= ATA_QCFLAG_FAILED;
+					nr_timedout++;
+				}
+			} else {
+				/* Normal completion occurred after
+				 * SCSI timeout but before this point.
+				 * Successfully complete it.
+				 */
+				scmd->retries = scmd->allowed;
+				scsi_eh_finish_cmd(scmd, &ap->eh_done_q);
+			}
+		}
+
+		/* If we have timed out qcs, they belong to EH from
+		 * this point but the state of the controller is
+		 * unknown.  Freeze the port to make sure the IRQ
+		 * handler doesn't diddle with those qcs.  This must
+		 * be done atomically w.r.t. setting QCFLAG_FAILED.
+		 */
+		if (nr_timedout)
+			__ata_port_freeze(ap);
+
+		spin_unlock_irqrestore(hs_lock, flags);
+	} else
+		spin_unlock_wait(hs_lock);
+
+ repeat:
+	/* invoke error handler */
+	if (ap->ops->error_handler) {
+		/* clear EH pending */
+		spin_lock_irqsave(hs_lock, flags);
+		ap->flags &= ~ATA_FLAG_EH_PENDING;
+		spin_unlock_irqrestore(hs_lock, flags);
+
+		/* invoke EH */
+		ap->ops->error_handler(ap);
+
+		/* Exception might have happened after ->error_handler
+		 * recovered the port but before this point.  Repeat
+		 * EH in such case.
+		 */
+		spin_lock_irqsave(hs_lock, flags);
+
+		if (ap->flags & ATA_FLAG_EH_PENDING) {
+			if (--repeat_cnt) {
+				ata_port_printk(ap, KERN_INFO,
+					"EH pending after completion, "
+					"repeating EH (cnt=%d)\n", repeat_cnt);
+				spin_unlock_irqrestore(hs_lock, flags);
+				goto repeat;
+			}
+			ata_port_printk(ap, KERN_ERR, "EH pending after %d "
+					"tries, giving up\n", ATA_EH_MAX_REPEAT);
+		}
+
+		/* Clear host_eh_scheduled while holding hs_lock such
+		 * that if exception occurs after this point but
+		 * before EH completion, SCSI midlayer will
+		 * re-initiate EH.
+		 */
+		host->host_eh_scheduled = 0;
+
+		spin_unlock_irqrestore(hs_lock, flags);
+	} else {
+		WARN_ON(ata_qc_from_tag(ap, ap->active_tag) == NULL);
+		ap->ops->eng_timeout(ap);
+	}
+
+	/* finish or retry handled scmd's and clean up */
 	WARN_ON(host->host_failed || !list_empty(&host->eh_cmd_q));

 	scsi_eh_flush_done_q(&ap->eh_done_q);

+	/* clean up */
+	spin_lock_irqsave(hs_lock, flags);
+
+	if (ap->flags & ATA_FLAG_RECOVERED)
+		ata_port_printk(ap, KERN_INFO, "EH complete\n");
+	ap->flags &= ~ATA_FLAG_RECOVERED;
+
+	spin_unlock_irqrestore(hs_lock, flags);
+
 	DPRINTK("EXIT\n");
 }

@ -133,6 +255,8 @@ void ata_scsi_error(struct Scsi_Host *host)
 *	an interrupt was not delivered to the driver, even though the
 *	transaction completed successfully.
 *
+ *	TODO: kill this function once old EH is gone.
+ *
 *	LOCKING:
 *	Inherited from SCSI layer (none, can sleep)
 */
@ -198,6 +322,8 @@ static void ata_qc_timeout(struct ata_queued_cmd *qc)
 *	an interrupt was not delivered to the driver, even though the
 *	transaction completed successfully.
 *
+ *	TODO: kill this function once old EH is gone.
+ *
 *	LOCKING:
 *	Inherited from SCSI layer (none, can sleep)
 */
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@ -225,6 +225,9 @@ enum {
 	ATA_PORT_PRIMARY	= (1 << 0),
 	ATA_PORT_SECONDARY	= (1 << 1),

+	/* max repeat if error condition is still set after ->error_handler */
+	ATA_EH_MAX_REPEAT	= 5,
+
 	/* how hard are we gonna try to probe/recover devices */
 	ATA_PROBE_MAX_TRIES	= 3,
 };