From ef80c1a1d21b2845e3a60dcb936264b123ebf41c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 25 Sep 2020 16:16:48 +0300 Subject: [PATCH 01/15] drm/i915: Fix state checker hw.active/hw.enable readout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously intel_dump_pipe_config() used to dump the full crtc state whether or not the crtc was logically enabled or not. As that meant occasionally dumping confusing stale garbage I changed it to check whether the crtc is logically enabled or not. However I did not realize that the state checker readout code does not populate crtc_state.hw.{active,enabled}. Hence the state checker dump would only give us a full dump of the sw state but not the hw state. Fix that by populating those bits of the hw state as well. Reviewed-by: Uma Shankar Fixes: 10d75f5428fd ("drm/i915: Fix plane state dumps") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20200925131656.10022-2-ville.syrjala@linux.intel.com (cherry picked from commit 504c7bd85c6f9b14b6c7f03cb5885c0818e805ad) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_display.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index 5a9d933e425a..d64e46acfbbd 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -14304,7 +14304,6 @@ verify_crtc_state(struct intel_crtc *crtc, struct intel_encoder *encoder; struct intel_crtc_state *pipe_config = old_crtc_state; struct drm_atomic_state *state = old_crtc_state->uapi.state; - bool active; __drm_atomic_helper_crtc_destroy_state(&old_crtc_state->uapi); intel_crtc_free_hw_state(old_crtc_state); @@ -14314,16 +14313,19 @@ verify_crtc_state(struct intel_crtc *crtc, drm_dbg_kms(&dev_priv->drm, "[CRTC:%d:%s]\n", crtc->base.base.id, crtc->base.name); - active = dev_priv->display.get_pipe_config(crtc, pipe_config); + pipe_config->hw.enable = new_crtc_state->hw.enable; + + pipe_config->hw.active = + dev_priv->display.get_pipe_config(crtc, pipe_config); /* we keep both pipes enabled on 830 */ - if (IS_I830(dev_priv)) - active = new_crtc_state->hw.active; + if (IS_I830(dev_priv) && pipe_config->hw.active) + pipe_config->hw.active = new_crtc_state->hw.active; - I915_STATE_WARN(new_crtc_state->hw.active != active, + I915_STATE_WARN(new_crtc_state->hw.active != pipe_config->hw.active, "crtc active state doesn't match with hw state " "(expected %i, found %i)\n", - new_crtc_state->hw.active, active); + new_crtc_state->hw.active, pipe_config->hw.active); I915_STATE_WARN(crtc->active != new_crtc_state->hw.active, "transitional active state does not match atomic hw state " @@ -14332,6 +14334,7 @@ verify_crtc_state(struct intel_crtc *crtc, for_each_encoder_on_crtc(dev, &crtc->base, encoder) { enum pipe pipe; + bool active; active = encoder->get_hw_state(encoder, &pipe); I915_STATE_WARN(active != new_crtc_state->hw.active, From 4caf017ee93703ba1c4504f3d73b50e6bbd4249e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 15 Sep 2020 10:14:15 +0100 Subject: [PATCH 02/15] drm/i915/gem: Avoid implicit vmap for highmem on x86-32 On 32b, highmem using a finite set of indirect PTE (i.e. vmap) to provide virtual mappings of the high pages. As these are finite, map_new_virtual() must wait for some other kmap() to finish when it runs out. 
If we map a large number of objects, there is no method for it to tell us to release the mappings, and we deadlock. However, if we make an explicit vmap of the page, that uses a larger vmalloc arena, and also has the ability to tell us to release unwanted mappings. Most importantly, it will fail and propagate an error instead of waiting forever. Fixes: fb8621d3bee8 ("drm/i915: Avoid allocating a vmap arena for a single page") #x86-32 References: e87666b52f00 ("drm/i915/shrinker: Hook up vmap allocation failure notifier") Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Cc: # v4.7+ Link: https://patchwork.freedesktop.org/patch/msgid/20200915091417.4086-1-chris@chris-wilson.co.uk (cherry picked from commit 060bb115c2d664f04db9c7613a104dfaef3fdd98) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gem/i915_gem_pages.c | 26 +++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c index e8a083743e09..49ce5777c511 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c @@ -255,8 +255,30 @@ static void *i915_gem_object_map(struct drm_i915_gem_object *obj, return NULL; /* A single page can always be kmapped */ - if (n_pte == 1 && type == I915_MAP_WB) - return kmap(sg_page(sgt->sgl)); + if (n_pte == 1 && type == I915_MAP_WB) { + struct page *page = sg_page(sgt->sgl); + + /* + * On 32b, highmem using a finite set of indirect PTE (i.e. + * vmap) to provide virtual mappings of the high pages. + * As these are finite, map_new_virtual() must wait for some + * other kmap() to finish when it runs out. If we map a large + * number of objects, there is no method for it to tell us + * to release the mappings, and we deadlock. + * + * However, if we make an explicit vmap of the page, that + * uses a larger vmalloc arena, and also has the ability + * to tell us to release unwanted mappings. Most importantly, + * it will fail and propagate an error instead of waiting + * forever. + * + * So if the page is beyond the 32b boundary, make an explicit + * vmap. On 64b, this check will be optimised away as we can + * directly kmap any page on the system. + */ + if (!PageHighMem(page)) + return kmap(page); + } mem = stack; if (n_pte > ARRAY_SIZE(stack)) { From ba2ebf605d5f32a9be0f7b05d3174bbc501b83fe Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 15 Sep 2020 10:14:16 +0100 Subject: [PATCH 03/15] drm/i915/gem: Prevent using pgprot_writecombine() if PAT is not supported Let's not try and use PAT attributes for I915_MAP_WC if the CPU doesn't support PAT. 
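For reference, the guard added below can be sketched in isolation: without PAT the CPU cannot express write-combining in its per-page attributes, so pgprot_writecombine() cannot deliver the semantics I915_MAP_WC promises, and the mapping request is better refused outright. A minimal sketch, assuming the I915_MAP_* types used in the hunk below (this helper is illustrative, not the driver function):

/*
 * Illustrative sketch only: choose page protections for a kernel vmap
 * of object pages, refusing I915_MAP_WC when the CPU lacks PAT rather
 * than handing back a mapping with the wrong caching attributes.
 */
static pgprot_t map_type_to_pgprot(enum i915_map_type type, int *err)
{
	*err = 0;

	switch (type) {
	case I915_MAP_WB:
		return PAGE_KERNEL;
	case I915_MAP_WC:
		if (static_cpu_has(X86_FEATURE_PAT))
			return pgprot_writecombine(PAGE_KERNEL_IO);
		*err = -ENODEV;	/* no PAT, no write-combining */
		return PAGE_KERNEL;
	default:
		*err = -EINVAL;
		return PAGE_KERNEL;
	}
}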
Fixes: 6056e50033d9 ("drm/i915/gem: Support discontiguous lmem object maps") Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Cc: # v5.6+ Link: https://patchwork.freedesktop.org/patch/msgid/20200915091417.4086-2-chris@chris-wilson.co.uk (cherry picked from commit 121ba69ffddc60df11da56f6d5b29bdb45c8eb80) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gem/i915_gem_pages.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c index 49ce5777c511..d6eeefab3d01 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c @@ -254,6 +254,10 @@ static void *i915_gem_object_map(struct drm_i915_gem_object *obj, if (!i915_gem_object_has_struct_page(obj) && type != I915_MAP_WC) return NULL; + if (GEM_WARN_ON(type == I915_MAP_WC && + !static_cpu_has(X86_FEATURE_PAT))) + return NULL; + /* A single page can always be kmapped */ if (n_pte == 1 && type == I915_MAP_WB) { struct page *page = sg_page(sgt->sgl); From 922d369b2922b3bb8c72aadddbd7445e6dc1183a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 15 Sep 2020 10:14:17 +0100 Subject: [PATCH 04/15] drm/i915/gt: Clear the buffer pool age before use If we create a new node, it is possible for the slab allocator to return us a recently freed node. If that node was just retired, it will retain the current jiffy as its node->age. There is then a miniscule window, where as that node is retired, it will appear on the free list with an incorrect age and be eligible for reuse by one thread, and then by a second thread as the correct node->age is written. Fixes: 06b73c2d0b65 ("drm/i915/gt: Delay taking the spinlock for grabbing from the buffer pool") Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20200915091417.4086-3-chris@chris-wilson.co.uk (cherry picked from commit 9bb34ff25c458a2a48fb61409df42f465ede37f8) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c index 4b7671ac5dca..104cb30e8c13 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c @@ -134,6 +134,7 @@ static void pool_retire(struct i915_active *ref) /* Return this object to the shrinker pool */ i915_gem_object_make_purgeable(node->obj); + GEM_BUG_ON(node->age); spin_lock_irqsave(&pool->lock, flags); list_add_rcu(&node->link, list); WRITE_ONCE(node->age, jiffies ?: 1); /* 0 reserved for active nodes */ @@ -155,6 +156,7 @@ node_create(struct intel_gt_buffer_pool *pool, size_t sz) if (!node) return ERR_PTR(-ENOMEM); + node->age = 0; node->pool = pool; i915_active_init(&node->active, pool_active, pool_retire); From eb2a27086a821a7993c5a3230022c85cc7ccfbda Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 11 Sep 2020 10:52:43 +0300 Subject: [PATCH 05/15] drm/i915: Fix an error code i915_gem_object_copy_blt() This code should use "vma[1]" instead of "vma". The "vma" variable is a valid pointer. 
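The bug class is worth spelling out: vma[0] has already been checked, so PTR_ERR(vma), i.e. PTR_ERR(vma[0]), decodes an "error" out of a perfectly valid pointer and the caller receives a meaningless value instead of the real failure from vma[1]. A condensed sketch of the corrected pattern (hypothetical wrapper, not the function being patched):

/*
 * Hypothetical wrapper, illustrating the corrected pattern: when two
 * lookups share an array, the error code must come from the element
 * that actually failed.
 */
static int lookup_copy_vmas(struct drm_i915_gem_object *src,
			    struct drm_i915_gem_object *dst,
			    struct i915_address_space *vm,
			    struct i915_vma *vma[2])
{
	vma[0] = i915_vma_instance(src, vm, NULL);
	if (IS_ERR(vma[0]))
		return PTR_ERR(vma[0]);

	vma[1] = i915_vma_instance(dst, vm, NULL);
	if (IS_ERR(vma[1]))
		return PTR_ERR(vma[1]);	/* not vma[0]: that one is valid here */

	return 0;
}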
Fixes: 6b05030496f7 ("drm/i915: Convert i915_gem_object/client_blt.c to use ww locking as well, v2.") Signed-off-by: Dan Carpenter Reviewed-by: Mika Kuoppala Signed-off-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20200911075243.GG12635@kadam (cherry picked from commit 68ba71e3ae6dd86a23486655e33c5f8c9bd90777) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gem/i915_gem_object_blt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c index d93eb36160c9..aee7ad3cc3c6 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c @@ -364,7 +364,7 @@ int i915_gem_object_copy_blt(struct drm_i915_gem_object *src, vma[1] = i915_vma_instance(dst, vm, NULL); if (IS_ERR(vma[1])) - return PTR_ERR(vma); + return PTR_ERR(vma[1]); i915_gem_ww_ctx_init(&ww, true); intel_engine_pm_get(ce->engine); From 7d5553147613b50149238ac1385c60e5c7cacb34 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 16 Sep 2020 10:00:58 +0100 Subject: [PATCH 06/15] drm/i915: Break up error capture compression loops with cond_resched() As the error capture will compress user buffers as directed to by the user, it can take an arbitrary amount of time and space. Break up the compression loops with a call to cond_resched(), that will allow other processes to schedule (avoiding the soft lockups) and also serve as a warning should we try to make this loop atomic in the future. Testcase: igt/gem_exec_capture/many-* Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: stable@vger.kernel.org Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200916090059.3189-2-chris@chris-wilson.co.uk (cherry picked from commit 293f43c80c0027ff9299036c24218ac705ce584e) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_gpu_error.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 3e6cbb0d1150..a635ec8d0b94 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -311,6 +311,8 @@ static int compress_page(struct i915_vma_compress *c, if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK) return -EIO; + + cond_resched(); } while (zstream->avail_in); /* Fallback to uncompressed if we increase size? */ @@ -397,6 +399,7 @@ static int compress_page(struct i915_vma_compress *c, if (!(wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE))) memcpy(ptr, src, PAGE_SIZE); dst->pages[dst->page_count++] = ptr; + cond_resched(); return 0; } From 159ace7ffe690bcdc7afbd9bab81e05dac2afeac Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Fri, 18 Sep 2020 13:12:08 +0200 Subject: [PATCH 07/15] drm/i915: Fix uninitialised variable in intel_context_create_request. In case backoff fails with an error, we return an undefined rq, assign err to rq correctly. 
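Condensing the function makes the hole easy to see. The sketch below mirrors the fixed control flow from the hunk further down (simplified, with locking details trimmed): every exit from the -EDEADLK branch has to leave rq holding either a request or an ERR_PTR.

/*
 * Simplified sketch of the fixed flow: on a failed backoff, fold err
 * into the returned pointer, otherwise rq goes back uninitialised.
 */
static struct i915_request *create_request_sketch(struct intel_context *ce)
{
	struct i915_gem_ww_ctx ww;
	struct i915_request *rq;
	int err;

	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = intel_context_pin_ww(ce, &ww);
	if (!err) {
		rq = i915_request_create(ce);
		intel_context_unpin(ce);
	} else if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
		rq = ERR_PTR(err);	/* the assignment this patch adds */
	} else {
		rq = ERR_PTR(err);
	}
	i915_gem_ww_ctx_fini(&ww);

	return rq;
}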
Fixes: 8a929c9eb1c2 ("drm/i915: Use ww pinning for intel_context_create_request()") Signed-off-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20200918111208.1392128-1-maarten.lankhorst@linux.intel.com Reviewed-by: Rodrigo Vivi (cherry picked from commit 4316b19dee27cc5cd34a95fdbc0a3a5237507701) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_context.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index d301dda1b261..92a3f25c4006 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -472,6 +472,7 @@ retry: err = i915_gem_ww_ctx_backoff(&ww); if (!err) goto retry; + rq = ERR_PTR(err); } else { rq = ERR_PTR(err); } From cef8ce5528bf61a91d3116f327fc6b78310facfa Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Mon, 21 Sep 2020 17:08:44 +0100 Subject: [PATCH 08/15] drm/i915: check i915_vm_alloc_pt_stash for errors If we are really unlucky and encounter an error during i915_vm_alloc_pt_stash, we end up passing an empty pt/pd stash all the way down into the low-level ppgtt alloc code, leading to explosions, since it expects at least the required number of pt/pd for the va range. [ 211.981418] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 211.981421] #PF: supervisor read access in kernel mode [ 211.981422] #PF: error_code(0x0000) - not-present page [ 211.981424] PGD 80000008439cb067 P4D 80000008439cb067 PUD 84a37f067 PMD 0 [ 211.981427] Oops: 0000 [#1] SMP PTI [ 211.981428] CPU: 1 PID: 1301 Comm: i915_selftest Tainted: G U I 5.9.0-rc5+ #3 [ 211.981430] Hardware name: /NUC6i7KYB, BIOS KYSKLi70.86A.0050.2017.0831.1924 08/31/2017 [ 211.981521] RIP: 0010:__gen8_ppgtt_alloc+0x1ed/0x3c0 [i915] [ 211.981523] Code: c1 48 c7 c7 5d 5d fe c0 65 ff 0d ee 1d 03 3f e8 d9 91 1f e2 8b 55 c4 31 c0 48 8b 75 b8 85 d2 0f 95 c0 48 8b 1c c6 48 89 45 98 <48> 8b 03 48 8b 90 58 02 00 00 48 85 d2 0f 84 07 ea 15 00 48 81 fa [ 211.981526] RSP: 0018:ffffba2cc0eb3970 EFLAGS: 00010202 [ 211.981527] RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000004 [ 211.981529] RDX: 0000000000000002 RSI: ffff9be998bdb8c0 RDI: ffff9be99c844300 [ 211.981530] RBP: ffffba2cc0eb39d8 R08: 0000000000000640 R09: ffff9be97cdfd000 [ 211.981531] R10: ffff9be97cdfd614 R11: 0000000000000000 R12: 0000000000000000 [ 211.981532] R13: ffff9be98607ba20 R14: ffff9be995a0b400 R15: ffffba2cc0eb39e8 [ 211.981534] FS: 00007f0f10b31000(0000) GS:ffff9be99fc40000(0000) knlGS:0000000000000000 [ 211.981536] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 211.981538] CR2: 0000000000000000 CR3: 000000084d74e006 CR4: 00000000003706e0 [ 211.981539] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 211.981541] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 211.981542] Call Trace: [ 211.981609] gen8_ppgtt_alloc+0x79/0x90 [i915] [ 211.981678] ppgtt_bind_vma+0x36/0x80 [i915] [ 211.981756] __vma_bind+0x39/0x40 [i915] [ 211.981818] fence_work+0x21/0x98 [i915] [ 211.981879] fence_notify+0x8d/0x128 [i915] [ 211.981939] __i915_sw_fence_complete+0x62/0x240 [i915] [ 211.982018] i915_vma_pin_ww+0x1ee/0x9c0 [i915] Fixes: cd0452aa2a0d ("drm/i915: Preallocate stashes for vma page-directories") Signed-off-by: Matthew Auld Cc: Chris Wilson Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20200921160844.73186-1-matthew.auld@intel.com (cherry picked from commit 
1604cb2aa7fafd83e11f9257f765a5f5dd7c19d3) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_vma.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 495d28f6d160..ffb5287e055a 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -892,9 +892,11 @@ int i915_vma_pin_ww(struct i915_vma *vma, struct i915_gem_ww_ctx *ww, /* Allocate enough page directories to used PTE */ if (vma->vm->allocate_va_range) { - i915_vm_alloc_pt_stash(vma->vm, - &work->stash, - vma->size); + err = i915_vm_alloc_pt_stash(vma->vm, + &work->stash, + vma->size); + if (err) + goto err_fence; err = i915_vm_pin_pt_stash(vma->vm, &work->stash); From 4fe9af8e881d946bf60790eeb37a7c4f96e28382 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 23 Jul 2020 18:21:19 +0100 Subject: [PATCH 09/15] drm/i915/gem: Serialise debugfs i915_gem_objects with ctx->mutex Since the debugfs may peek into the GEM contexts as the corresponding client/fd is being closed, we may try and follow a dangling pointer. However, the context closure itself is serialised with the ctx->mutex, so if we hold that mutex as we inspect the state coupled in the context, we know the pointers within the context are stable and will remain valid as we inspect their tables. Signed-off-by: Chris Wilson Cc: CQ Tang Cc: Daniel Vetter Cc: stable@vger.kernel.org Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200723172119.17649-3-chris@chris-wilson.co.uk (cherry picked from commit 102f5aa491f262c818e607fc4fee08a724a76c69) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_debugfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 784219962193..ea469168cd44 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -326,6 +326,7 @@ static void print_context_stats(struct seq_file *m, } i915_gem_context_unlock_engines(ctx); + mutex_lock(&ctx->mutex); if (!IS_ERR_OR_NULL(ctx->file_priv)) { struct file_stats stats = { .vm = rcu_access_pointer(ctx->vm), @@ -346,6 +347,7 @@ static void print_context_stats(struct seq_file *m, print_file_stats(m, name, stats); } + mutex_unlock(&ctx->mutex); spin_lock(&i915->gem.contexts.lock); list_safe_reset_next(ctx, cn, link); From 5701a66edb6b3e09a7bdba3d0d64020a70562f87 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 25 Sep 2020 11:11:06 +0100 Subject: [PATCH 10/15] drm/i915: Redo "Remove i915_request.lock requirement for execution callbacks" The reordering and rebasing of commit 2e4c6c1a9db5 ("drm/i915: Remove i915_request.lock requirement for execution callbacks") caused it to revert an earlier correction. 
Let us restore commit 99f0a640d464 ("drm/i915: Remove requirement for holding i915_request.lock for breadcrumbs") Fixes: 2e4c6c1a9db5 ("drm/i915: Remove i915_request.lock requirement for execution callbacks") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: Rodrigo Vivi Cc: Joonas Lahtinen Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200925101107.27869-1-chris@chris-wilson.co.uk (cherry picked from commit 35faeb7de9ef83da510a048f2016061f1e31d5fc) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_request.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 11e272422fb7..436ce368ddaa 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -593,16 +593,8 @@ xfer: __notify_execute_cb_irq(request); /* We may be recursing from the signal callback of another i915 fence */ - if (!i915_request_signaled(request)) { - spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING); - - if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, - &request->fence.flags) && - !i915_request_enable_breadcrumb(request)) - intel_engine_signal_breadcrumbs(engine); - - spin_unlock(&request->lock); - } + if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags)) + i915_request_enable_breadcrumb(request); return result; } From 3cfea8c97c93321578823c745e2ca8c8f03993ff Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 25 Sep 2020 11:11:07 +0100 Subject: [PATCH 11/15] drm/i915/gem: Hold request reference for canceling an active context We have to be very careful while walking the timeline->requests list under the RCU guard, as the requests (and so rq->link) use SLAB_TYPESAFE_BY_RCU and so the requests may be reallocated within an rcu grace period. As the requests are reallocated, they are removed from one list and placed on another, and if we are iterating over that request at that moment, the list iteration jumps from one list to the next and promptly gets confused. Verify we hold the request reference to ensure that the request is not added to a new list behind our backs. <4> [582.745252] general protection fault, probably for non-canonical address 0xcccccccccccccd5c: 0000 [#1] PREEMPT SMP PTI <4> [582.745297] CPU: 0 PID: 1475 Comm: gem_ctx_persist Not tainted 5.9.0-rc1-CI-CI_DRM_8908+ #1 <4> [582.745304] Hardware name: Intel Corporation NUC7CJYH/NUC7JYB, BIOS JYGLKCPX.86A.0027.2018.0125.1347 01/25/2018 <4> [582.745317] RIP: 0010:__lock_acquire+0x2c3/0x1f40 <4> [582.745323] Code: 00 65 8b 05 c7 8a ef 7e 85 c0 0f 85 b4 07 00 00 44 8b 9d c4 08 00 00 45 85 db 0f 84 0f 01 00 00 ba 05 00 00 00 e9 c8 06 00 00 <48> 81 3f c0 89 c7 82 b8 00 00 00 00 41 0f 45 c0 83 fe 01 41 89 c3 <4> [582.745334] RSP: 0018:ffffc9000461bc40 EFLAGS: 00010002 <4> [582.745340] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000 <4> [582.745345] RDX: 0000000000000000 RSI: 0000000000000000 RDI: cccccccccccccd5c <4> [582.745350] RBP: ffff8881ec4a2880 R08: 0000000000000001 R09: 0000000000000001 <4> [582.745356] R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000 <4> [582.745361] R13: 0000000000000000 R14: 0000000000000000 R15: cccccccccccccd5c <4> [582.745367] FS: 00007fb44da78e40(0000) GS:ffff888278000000(0000) knlGS:0000000000000000 <4> [582.745373] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 <4> [582.745378] CR2: 00007fb44daad040 CR3: 0000000268428000 CR4: 0000000000350ef0 <4> [582.745383] Call Trace: <4> [582.745390] ? 
__lock_acquire+0x913/0x1f40 <4> [582.745397] lock_acquire+0xb5/0x3c0 <4> [582.745526] ? kill_engines+0x19a/0x4b0 [i915] <4> [582.745533] ? find_held_lock+0x2d/0x90 <4> [582.745541] _raw_spin_lock_irq+0x30/0x40 <4> [582.745635] ? kill_engines+0x19a/0x4b0 [i915] <4> [582.745727] kill_engines+0x19a/0x4b0 [i915] <4> [582.745820] context_close+0x195/0x410 [i915] <4> [582.745912] i915_gem_context_close+0x5b/0x160 [i915] <4> [582.745994] i915_driver_postclose+0x14/0x40 [i915] <4> [582.746003] drm_file_free.part.13+0x240/0x290 <4> [582.746009] drm_release_noglobal+0x16/0x50 <4> [582.746016] __fput+0xa5/0x250 <4> [582.746021] task_work_run+0x6e/0xb0 <4> [582.746028] exit_to_user_mode_prepare+0x178/0x180 <4> [582.746034] syscall_exit_to_user_mode+0x36/0x220 <4> [582.746040] entry_SYSCALL_64_after_hwframe+0x44/0xa9 <4> [582.746045] RIP: 0033:0x7fb44d1dc421 <4> [582.746050] Code: f7 d8 64 89 02 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 8b 05 ea cf 20 00 85 c0 75 16 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 3f f3 c3 0f 1f 44 00 00 53 89 fb 48 83 ec 10 <4> [582.746062] RSP: 002b:00007ffed2e83818 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 <4> [582.746069] RAX: 0000000000000000 RBX: 0000556410bfe840 RCX: 00007fb44d1dc421 <4> [582.746075] RDX: 000000000000000a RSI: 00000000c0406469 RDI: 0000000000000008 <4> [582.746080] RBP: 0000000000000008 R08: 00007fb44d1c51cc R09: 00007fb44d1c5240 <4> [582.746086] R10: 0000000000000001 R11: 0000000000000246 R12: 00000000fffffffb <4> [582.746091] R13: 0000000000000006 R14: 0000000000000000 R15: 000000000000000a <4> [582.746099] Modules linked in: vgem mei_hdcp snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic ledtrig_audio btusb btrtl btbcm btintel x86_pkg_temp_thermal coretemp crct10dif_pclmul crc32_pclmul bluetooth ghash_clmulni_intel ecdh_generic ecc i915 r8169 realtek mei_me mei snd_hda_intel i2c_hid snd_intel_dspcfg snd_hda_codec snd_hwdep snd_hda_core snd_pcm pinctrl_geminilake pinctrl_intel prime_numbers [last unloaded: test_drm_mm] Fixes: 736e785f9b28 ("drm/i915/gem: Reduce context termination list iteration guard to RCU") Signed-off-by: Chris Wilson Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200925101107.27869-2-chris@chris-wilson.co.uk (cherry picked from commit badef44deff1fae8d21c5c1cfc4dde95fb5bf993) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gem/i915_gem_context.c | 25 ++++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index cf5ecbde9e06..a548626fa8bc 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -460,8 +460,8 @@ __active_engine(struct i915_request *rq, struct intel_engine_cs **active) spin_lock(&locked->active.lock); } - if (!i915_request_completed(rq)) { - if (i915_request_is_active(rq) && rq->fence.error != -EIO) + if (i915_request_is_active(rq)) { + if (!i915_request_completed(rq)) *active = locked; ret = true; } @@ -479,13 +479,26 @@ static struct intel_engine_cs *active_engine(struct intel_context *ce) if (!ce->timeline) return NULL; + /* + * rq->link is only SLAB_TYPESAFE_BY_RCU, we need to hold a reference + * to the request to prevent it being transferred to a new timeline + * (and onto a new timeline->requests list). 
+ */ rcu_read_lock(); - list_for_each_entry_rcu(rq, &ce->timeline->requests, link) { - if (i915_request_is_active(rq) && i915_request_completed(rq)) - continue; + list_for_each_entry_reverse(rq, &ce->timeline->requests, link) { + bool found; + + /* timeline is already completed upto this point? */ + if (!i915_request_get_rcu(rq)) + break; /* Check with the backend if the request is inflight */ - if (__active_engine(rq, &engine)) + found = true; + if (likely(rcu_access_pointer(rq->timeline) == ce->timeline)) + found = __active_engine(rq, &engine); + + i915_request_put(rq); + if (found) break; } rcu_read_unlock(); From 7d442ea7c504adcc9798b07cd8f6a0d235fca2da Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 28 Sep 2020 23:15:08 +0100 Subject: [PATCH 12/15] drm/i915: Cancel outstanding work after disabling heartbeats on an engine We only allow persistent requests to remain on the GPU past the closure of their containing context (and process) so long as they are continuously checked for hangs or allow other requests to preempt them, as we need to ensure forward progress of the system. If we allow persistent contexts to remain on the system after the the hangcheck mechanism is disabled, the system may grind to a halt. On disabling the mechanism, we sent a pulse along the engine to remove all executing contexts from the engine which would check for hung contexts -- but we did not prevent those contexts from being resubmitted if they survived the final hangcheck. Fixes: 9a40bddd47ca ("drm/i915/gt: Expose heartbeat interval via sysfs") Testcase: igt/gem_ctx_persistence/heartbeat-stop Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: # v5.7+ Reviewed-by: Tvrtko Ursulin Acked-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20200928221510.26044-1-chris@chris-wilson.co.uk (cherry picked from commit 7a991cd3e3da9a56d5616b62d425db000a3242f2) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_engine.h | 9 +++++++++ drivers/gpu/drm/i915/i915_request.c | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h index 08e2c000dcc3..7c3a1012e702 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine.h +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -337,4 +337,13 @@ intel_engine_has_preempt_reset(const struct intel_engine_cs *engine) return intel_engine_has_preemption(engine); } +static inline bool +intel_engine_has_heartbeat(const struct intel_engine_cs *engine) +{ + if (!IS_ACTIVE(CONFIG_DRM_I915_HEARTBEAT_INTERVAL)) + return false; + + return READ_ONCE(engine->props.heartbeat_interval_ms); +} + #endif /* _INTEL_RINGBUFFER_H_ */ diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 436ce368ddaa..0e813819b041 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -542,8 +542,13 @@ bool __i915_request_submit(struct i915_request *request) if (i915_request_completed(request)) goto xfer; + if (unlikely(intel_context_is_closed(request->context) && + !intel_engine_has_heartbeat(engine))) + intel_context_set_banned(request->context); + if (unlikely(intel_context_is_banned(request->context))) i915_request_set_error_once(request, -EIO); + if (unlikely(fatal_error(request->fence.error))) __i915_request_skip(request); From ca65fc0d8e01dca8fc82f0ccf433725469256c71 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 28 Sep 2020 23:15:09 +0100 Subject: [PATCH 13/15] drm/i915/gt: Always send a pulse down the engine after 
disabling heartbeat Currently, we check we can send a pulse prior to disabling the heartbeat to verify that we can change the heartbeat, but since we may re-evaluate execution upon changing the heartbeat interval we need another pulse afterwards to refresh execution. v2: Tvrtko asked if we could reduce the double pulse to a single, which opened up a discussion of how we should handle the pulse-error after attempting to change the property, and the desire to serialise adjustment of the property with its validating pulse, and unwind upon failure. Fixes: 9a40bddd47ca ("drm/i915/gt: Expose heartbeat interval via sysfs") Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Tvrtko Ursulin Cc: # v5.7+ Reviewed-by: Tvrtko Ursulin Acked-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20200928221510.26044-2-chris@chris-wilson.co.uk (cherry picked from commit 3dd66a94de59d7792e7917eb3075342e70f06f44) Signed-off-by: Rodrigo Vivi --- .../gpu/drm/i915/gt/intel_engine_heartbeat.c | 124 +++++++++++------- 1 file changed, 76 insertions(+), 48 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c index 8ffdf676c0a0..5067d0524d4b 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c @@ -177,36 +177,82 @@ void intel_engine_init_heartbeat(struct intel_engine_cs *engine) INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat); } -int intel_engine_set_heartbeat(struct intel_engine_cs *engine, - unsigned long delay) -{ - int err; - - /* Send one last pulse before to cleanup persistent hogs */ - if (!delay && IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT)) { - err = intel_engine_pulse(engine); - if (err) - return err; - } - - WRITE_ONCE(engine->props.heartbeat_interval_ms, delay); - - if (intel_engine_pm_get_if_awake(engine)) { - if (delay) - intel_engine_unpark_heartbeat(engine); - else - intel_engine_park_heartbeat(engine); - intel_engine_pm_put(engine); - } - - return 0; -} - -int intel_engine_pulse(struct intel_engine_cs *engine) +static int __intel_engine_pulse(struct intel_engine_cs *engine) { struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER }; struct intel_context *ce = engine->kernel_context; struct i915_request *rq; + + lockdep_assert_held(&ce->timeline->mutex); + GEM_BUG_ON(!intel_engine_has_preemption(engine)); + GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); + + intel_context_enter(ce); + rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN); + intel_context_exit(ce); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + __set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags); + idle_pulse(engine, rq); + + __i915_request_commit(rq); + __i915_request_queue(rq, &attr); + GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER); + + return 0; +} + +static unsigned long set_heartbeat(struct intel_engine_cs *engine, + unsigned long delay) +{ + unsigned long old; + + old = xchg(&engine->props.heartbeat_interval_ms, delay); + if (delay) + intel_engine_unpark_heartbeat(engine); + else + intel_engine_park_heartbeat(engine); + + return old; +} + +int intel_engine_set_heartbeat(struct intel_engine_cs *engine, + unsigned long delay) +{ + struct intel_context *ce = engine->kernel_context; + int err = 0; + + if (!delay && !intel_engine_has_preempt_reset(engine)) + return -ENODEV; + + intel_engine_pm_get(engine); + + err = mutex_lock_interruptible(&ce->timeline->mutex); + if (err) + goto out_rpm; + + if (delay != engine->props.heartbeat_interval_ms) { + 
unsigned long saved = set_heartbeat(engine, delay); + + /* recheck current execution */ + if (intel_engine_has_preemption(engine)) { + err = __intel_engine_pulse(engine); + if (err) + set_heartbeat(engine, saved); + } + } + + mutex_unlock(&ce->timeline->mutex); + +out_rpm: + intel_engine_pm_put(engine); + return err; +} + +int intel_engine_pulse(struct intel_engine_cs *engine) +{ + struct intel_context *ce = engine->kernel_context; int err; if (!intel_engine_has_preemption(engine)) @@ -215,30 +261,12 @@ int intel_engine_pulse(struct intel_engine_cs *engine) if (!intel_engine_pm_get_if_awake(engine)) return 0; - if (mutex_lock_interruptible(&ce->timeline->mutex)) { - err = -EINTR; - goto out_rpm; + err = -EINTR; + if (!mutex_lock_interruptible(&ce->timeline->mutex)) { + err = __intel_engine_pulse(engine); + mutex_unlock(&ce->timeline->mutex); } - intel_context_enter(ce); - rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN); - intel_context_exit(ce); - if (IS_ERR(rq)) { - err = PTR_ERR(rq); - goto out_unlock; - } - - __set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags); - idle_pulse(engine, rq); - - __i915_request_commit(rq); - __i915_request_queue(rq, &attr); - GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER); - err = 0; - -out_unlock: - mutex_unlock(&ce->timeline->mutex); -out_rpm: intel_engine_pm_put(engine); return err; } From 651dabe27f9638f569f6a794f9d3cc1889cd315e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 28 Sep 2020 23:15:10 +0100 Subject: [PATCH 14/15] drm/i915/gem: Always test execution status on closing the context Verify that if a context is active at the time it is closed, that it is either persistent and preemptible (with hangcheck running) or it shall be removed from execution. Fixes: 9a40bddd47ca ("drm/i915/gt: Expose heartbeat interval via sysfs") Testcase: igt/gem_ctx_persistence/heartbeat-close Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: # v5.7+ Reviewed-by: Tvrtko Ursulin Acked-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20200928221510.26044-3-chris@chris-wilson.co.uk (cherry picked from commit d3bb2f9b5ee66d5e000293edd6b6575e59d11db9) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gem/i915_gem_context.c | 48 +++++---------------- 1 file changed, 10 insertions(+), 38 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index a548626fa8bc..4fd38101bb56 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -390,24 +390,6 @@ __context_engines_static(const struct i915_gem_context *ctx) return rcu_dereference_protected(ctx->engines, true); } -static bool __reset_engine(struct intel_engine_cs *engine) -{ - struct intel_gt *gt = engine->gt; - bool success = false; - - if (!intel_has_reset_engine(gt)) - return false; - - if (!test_and_set_bit(I915_RESET_ENGINE + engine->id, - >->reset.flags)) { - success = intel_engine_reset(engine, NULL) == 0; - clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id, - >->reset.flags); - } - - return success; -} - static void __reset_context(struct i915_gem_context *ctx, struct intel_engine_cs *engine) { @@ -431,12 +413,7 @@ static bool __cancel_engine(struct intel_engine_cs *engine) * kill the banned context, we fallback to doing a local reset * instead. */ - if (IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT) && - !intel_engine_pulse(engine)) - return true; - - /* If we are unable to send a pulse, try resetting this engine. 
*/ - return __reset_engine(engine); + return intel_engine_pulse(engine) == 0; } static bool @@ -506,7 +483,7 @@ static struct intel_engine_cs *active_engine(struct intel_context *ce) return engine; } -static void kill_engines(struct i915_gem_engines *engines) +static void kill_engines(struct i915_gem_engines *engines, bool ban) { struct i915_gem_engines_iter it; struct intel_context *ce; @@ -521,7 +498,7 @@ static void kill_engines(struct i915_gem_engines *engines) for_each_gem_engine(ce, engines, it) { struct intel_engine_cs *engine; - if (intel_context_set_banned(ce)) + if (ban && intel_context_set_banned(ce)) continue; /* @@ -534,7 +511,7 @@ static void kill_engines(struct i915_gem_engines *engines) engine = active_engine(ce); /* First attempt to gracefully cancel the context */ - if (engine && !__cancel_engine(engine)) + if (engine && !__cancel_engine(engine) && ban) /* * If we are unable to send a preemptive pulse to bump * the context from the GPU, we have to resort to a full @@ -544,8 +521,10 @@ static void kill_engines(struct i915_gem_engines *engines) } } -static void kill_stale_engines(struct i915_gem_context *ctx) +static void kill_context(struct i915_gem_context *ctx) { + bool ban = (!i915_gem_context_is_persistent(ctx) || + !ctx->i915->params.enable_hangcheck); struct i915_gem_engines *pos, *next; spin_lock_irq(&ctx->stale.lock); @@ -558,7 +537,7 @@ static void kill_stale_engines(struct i915_gem_context *ctx) spin_unlock_irq(&ctx->stale.lock); - kill_engines(pos); + kill_engines(pos, ban); spin_lock_irq(&ctx->stale.lock); GEM_BUG_ON(i915_sw_fence_signaled(&pos->fence)); @@ -570,11 +549,6 @@ static void kill_stale_engines(struct i915_gem_context *ctx) spin_unlock_irq(&ctx->stale.lock); } -static void kill_context(struct i915_gem_context *ctx) -{ - kill_stale_engines(ctx); -} - static void engines_idle_release(struct i915_gem_context *ctx, struct i915_gem_engines *engines) { @@ -609,7 +583,7 @@ static void engines_idle_release(struct i915_gem_context *ctx, kill: if (list_empty(&engines->link)) /* raced, already closed */ - kill_engines(engines); + kill_engines(engines, true); i915_sw_fence_commit(&engines->fence); } @@ -667,9 +641,7 @@ static void context_close(struct i915_gem_context *ctx) * case we opt to forcibly kill off all remaining requests on * context close. */ - if (!i915_gem_context_is_persistent(ctx) || - !ctx->i915->params.enable_hangcheck) - kill_context(ctx); + kill_context(ctx); i915_gem_context_put(ctx); } From c60b93cd4862d108214a14e655358ea714d7a12a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 28 Sep 2020 22:59:42 +0100 Subject: [PATCH 15/15] drm/i915: Avoid mixing integer types during batch copies Be consistent and use unsigned long throughout the chunk copies to avoid the inherent clumsiness of mixing integer types of different widths and signs. Failing to take acount of a wider unsigned type when using min_t can lead to treating it as a negative, only for it flip back to a large unsigned value after passing a boundary check. 
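A worked example of that failure mode (illustrative fragment with made-up values, 64-bit kernel): a length just past INT_MAX fed through min_t(int, ...) turns negative, sails past a signed bounds check, and then flips back to an enormous size_t when it reaches the copy.

	/* Illustrative fragment only, not driver code. */
	static char dst[PAGE_SIZE], src[PAGE_SIZE];
	unsigned long length = 0x80000000UL;	/* 2 GiB, just past INT_MAX */
	unsigned long safe;
	int remain = PAGE_SIZE;
	int len;

	len = min_t(int, length, PAGE_SIZE);	/* (int)length == INT_MIN, so len < 0 */
	if (len <= remain)			/* signed comparison: passes */
		memcpy(dst, src, len);		/* len widens to a ~16 EiB size_t: overrun */

	/* Keeping everything unsigned long, as this patch does, min() is safe: */
	safe = min(length, (unsigned long)PAGE_SIZE);	/* == PAGE_SIZE */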
Fixes: ed13033f0287 ("drm/i915/cmdparser: Only cache the dst vmap") Testcase: igt/gen9_exec_parse/bb-large Reported-by: "Candelaria, Jared" Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Joonas Lahtinen Cc: "Candelaria, Jared" Cc: "Bloomfield, Jon" Cc: # v4.9+ Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200928215942.31917-1-chris@chris-wilson.co.uk (cherry picked from commit b7eeb2b4132ccf1a7d38f434cde7043913d1ed3c) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 7 +++++-- drivers/gpu/drm/i915/i915_cmd_parser.c | 10 +++++----- drivers/gpu/drm/i915/i915_drv.h | 4 ++-- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 5509946f1a1d..4b09bcd70cf4 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -2267,8 +2267,8 @@ struct eb_parse_work { struct i915_vma *batch; struct i915_vma *shadow; struct i915_vma *trampoline; - unsigned int batch_offset; - unsigned int batch_length; + unsigned long batch_offset; + unsigned long batch_length; }; static int __eb_parse(struct dma_fence_work *work) @@ -2338,6 +2338,9 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb, struct eb_parse_work *pw; int err; + GEM_BUG_ON(overflows_type(eb->batch_start_offset, pw->batch_offset)); + GEM_BUG_ON(overflows_type(eb->batch_len, pw->batch_length)); + pw = kzalloc(sizeof(*pw), GFP_KERNEL); if (!pw) return -ENOMEM; diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 5ac4a999f05a..e88970256e8e 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -1136,7 +1136,7 @@ find_reg(const struct intel_engine_cs *engine, u32 addr) /* Returns a vmap'd pointer to dst_obj, which the caller must unmap */ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, struct drm_i915_gem_object *src_obj, - u32 offset, u32 length) + unsigned long offset, unsigned long length) { bool needs_clflush; void *dst, *src; @@ -1166,8 +1166,8 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, } } if (IS_ERR(src)) { + unsigned long x, n; void *ptr; - int x, n; /* * We can avoid clflushing partial cachelines before the write @@ -1184,7 +1184,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, ptr = dst; x = offset_in_page(offset); for (n = offset >> PAGE_SHIFT; length; n++) { - int len = min_t(int, length, PAGE_SIZE - x); + int len = min(length, PAGE_SIZE - x); src = kmap_atomic(i915_gem_object_get_page(src_obj, n)); if (needs_clflush) @@ -1414,8 +1414,8 @@ static bool shadow_needs_clflush(struct drm_i915_gem_object *obj) */ int intel_engine_cmd_parser(struct intel_engine_cs *engine, struct i915_vma *batch, - u32 batch_offset, - u32 batch_length, + unsigned long batch_offset, + unsigned long batch_length, struct i915_vma *shadow, bool trampoline) { diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 72a9449b674e..eef9a821c49c 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1949,8 +1949,8 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine); void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine); int intel_engine_cmd_parser(struct intel_engine_cs *engine, struct i915_vma *batch, - u32 batch_offset, - u32 batch_length, + unsigned long batch_offset, + unsigned long batch_length, struct 
i915_vma *shadow, bool trampoline); #define I915_CMD_PARSER_TRAMPOLINE_SIZE 8
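A closing observation on the execbuffer hunk above: the GEM_BUG_ON(overflows_type(...)) assertions capture the complementary defence, namely validating at the point where a wide value enters narrower storage instead of letting a truncating min_t() hide the loss deep inside a copy loop. A hedged sketch of that pattern (the struct and helper below are invented for illustration; overflows_type() and GEM_BUG_ON() are the i915 helpers used above):

/*
 * Hypothetical example: reject a 64-bit window that does not fit the
 * narrower bookkeeping fields, rather than letting it wrap later.
 */
struct parse_window {
	u32 offset;
	u32 length;
};

static int set_parse_window(struct parse_window *w, u64 offset, u64 length)
{
	if (overflows_type(offset, w->offset) ||
	    overflows_type(length, w->length))
		return -E2BIG;

	w->offset = offset;
	w->length = length;
	return 0;
}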