drm/i915: Don't count semaphore waits towards a stuck ring

If we detect a ring is in a valid wait for another, just let it be. Eventually it will either begin to progress again, or the entire system will come grinding to a halt and then hangcheck will fire as soon as the deadlock is detected. This error was foretold by Ben in commit 05407ff889 Author: Mika Kuoppala <mika.kuoppala@linux.intel.com> Date: Thu May 30 09:04:29 2013 +0300 drm/i915: detect hang using per ring hangcheck_score "If ring B is waiting on ring A via semaphore, and ring A is making progress, albeit slowly - the hangcheck will fire. The check will determine that A is moving, however ring B will appear hung because the ACTHD doesn't move. I honestly can't say if that's actually a realistic problem to hit it probably implies the timeout value is too low." v2: Make sure we don't even incur the KICK cost whilst waiting. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=65394 Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> Cc: Ben Widawsky <ben@bwidawsk.net> Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2024-12-29 06:12:08 +00:00 · 2013-06-10 11:20:21 +01:00 · 2013-06-10 11:20:21 +01:00 · 6274f2126a
commit 6274f2126a
parent 9107e9d227
2 changed files with 88 additions and 30 deletions
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@ -2321,21 +2321,21 @@ ring_idle(struct intel_ring_buffer *ring, u32 seqno)
 		i915_seqno_passed(seqno, ring_last_seqno(ring)));
 }

-static bool semaphore_passed(struct intel_ring_buffer *ring)
+static struct intel_ring_buffer *
+semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno)
 {
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
-	u32 acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
-	struct intel_ring_buffer *signaller;
-	u32 cmd, ipehr, acthd_min;
+	u32 cmd, ipehr, acthd, acthd_min;

 	ipehr = I915_READ(RING_IPEHR(ring->mmio_base));
 	if ((ipehr & ~(0x3 << 16)) !=
 	    (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER))
-		return false;
+		return NULL;

 	/* ACTHD is likely pointing to the dword after the actual command,
 	 * so scan backwards until we find the MBOX.
 	 */
+	acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
 	acthd_min = max((int)acthd - 3 * 4, 0);
 	do {
 		cmd = ioread32(ring->virtual_start + acthd);
@ -2344,22 +2344,53 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)

 		acthd -= 4;
 		if (acthd < acthd_min)
-			return false;
+			return NULL;
 	} while (1);

-	signaller = &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
-	return i915_seqno_passed(signaller->get_seqno(signaller, false),
-				 ioread32(ring->virtual_start+acthd+4)+1);
+	*seqno = ioread32(ring->virtual_start+acthd+4)+1;
+	return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
 }

-static bool ring_hung(struct intel_ring_buffer *ring)
+static int semaphore_passed(struct intel_ring_buffer *ring)
+{
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	struct intel_ring_buffer *signaller;
+	u32 seqno, ctl;
+
+	ring->hangcheck.deadlock = true;
+
+	signaller = semaphore_waits_for(ring, &seqno);
+	if (signaller == NULL || signaller->hangcheck.deadlock)
+		return -1;
+
+	/* cursory check for an unkickable deadlock */
+	ctl = I915_READ_CTL(signaller);
+	if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0)
+		return -1;
+
+	return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
+}
+
+static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
+{
+	struct intel_ring_buffer *ring;
+	int i;
+
+	for_each_ring(ring, dev_priv, i)
+		ring->hangcheck.deadlock = false;
+}
+
+static enum { wait, active, kick, hung } ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
 {
 	struct drm_device *dev = ring->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 tmp;

+	if (ring->hangcheck.acthd != acthd)
+		return active;
+
 	if (IS_GEN2(dev))
-		return true;
+		return hung;

 	/* Is the chip hanging on a WAIT_FOR_EVENT?
 	 * If so we can simply poke the RB_WAIT bit
@ -2371,19 +2402,24 @@ static bool ring_hung(struct intel_ring_buffer *ring)
 		DRM_ERROR("Kicking stuck wait on %s\n",
 			  ring->name);
 		I915_WRITE_CTL(ring, tmp);
-		return false;
+		return kick;
 	}

-	if (INTEL_INFO(dev)->gen >= 6 &&
-	    tmp & RING_WAIT_SEMAPHORE &&
-	    semaphore_passed(ring)) {
-		DRM_ERROR("Kicking stuck semaphore on %s\n",
-			  ring->name);
-		I915_WRITE_CTL(ring, tmp);
-		return false;
+	if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
+		switch (semaphore_passed(ring)) {
+		default:
+			return hung;
+		case 1:
+			DRM_ERROR("Kicking stuck semaphore on %s\n",
+				  ring->name);
+			I915_WRITE_CTL(ring, tmp);
+			return kick;
+		case 0:
+			return wait;
+		}
 	}

-	return true;
+	return hung;
 }

 /**
@ -2414,6 +2450,8 @@ void i915_hangcheck_elapsed(unsigned long data)
 		u32 seqno, acthd;
 		bool busy = true;

+		semaphore_clear_deadlocks(dev_priv);
+
 		seqno = ring->get_seqno(ring, false);
 		acthd = intel_ring_get_active_head(ring);

@ -2430,17 +2468,36 @@ void i915_hangcheck_elapsed(unsigned long data)
 			} else {
 				int score;

-				stuck[i] = ring->hangcheck.acthd == acthd;
-				if (stuck[i]) {
-					/* Every time we kick the ring, add a
-					 * small increment to the hangcheck
-					 * score so that we can catch a
-					 * batch that is repeatedly kicked.
-					 */
-					score = ring_hung(ring) ? HUNG : KICK;
-				} else
+				/* We always increment the hangcheck score
+				 * if the ring is busy and still processing
+				 * the same request, so that no single request
+				 * can run indefinitely (such as a chain of
+				 * batches). The only time we do not increment
+				 * the hangcheck score on this ring, if this
+				 * ring is in a legitimate wait for another
+				 * ring. In that case the waiting ring is a
+				 * victim and we want to be sure we catch the
+				 * right culprit. Then every time we do kick
+				 * the ring, add a small increment to the
+				 * score so that we can catch a batch that is
+				 * being repeatedly kicked and so responsible
+				 * for stalling the machine.
+				 */
+				switch (ring_stuck(ring, acthd)) {
+				case wait:
+					score = 0;
+					break;
+				case active:
 					score = BUSY;
-
+					break;
+				case kick:
+					score = KICK;
+					break;
+				case hung:
+					score = HUNG;
+					stuck[i] = true;
+					break;
+				}
 				ring->hangcheck.score += score;
 			}
 		} else {
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@ -38,6 +38,7 @@ struct  intel_hw_status_page {
 #define I915_READ_SYNC_1(ring) I915_READ(RING_SYNC_1((ring)->mmio_base))

 struct intel_ring_hangcheck {
+	bool deadlock;
 	u32 seqno;
 	u32 acthd;
 	int score;