linux/drivers/gpu/drm/i915/i915_gem_request.c

/*
 * Copyright © 2008-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */
#include <linux/prefetch.h>
#include <linux/dma-fence-array.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/sched/signal.h>
#include "i915_drv.h"
static const char *i915_fence_get_driver_name(struct dma_fence *fence)
{
	return "i915";
}

static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
{
	/* The timeline struct (as part of the ppgtt underneath a context)
	 * may be freed when the request is no longer in use by the GPU.
	 * We could extend the life of a context to beyond that of all
	 * fences, possibly keeping the hw resource around indefinitely,
	 * or we just give them a false name. Since
	 * dma_fence_ops.get_timeline_name is a debug feature, the occasional
	 * lie seems justifiable.
	 */
	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
		return "signaled";

	return to_request(fence)->timeline->common->name;
}

static bool i915_fence_signaled(struct dma_fence *fence)
{
	return i915_gem_request_completed(to_request(fence));
}

static bool i915_fence_enable_signaling(struct dma_fence *fence)
{
	if (i915_fence_signaled(fence))
		return false;

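	/* Arm the breadcrumb interrupt for this request, then check again:
	 * the request may have completed while signaling was being enabled,
	 * in which case report it as already signaled so the dma-fence core
	 * signals it immediately rather than waiting for an interrupt.
	 */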
	intel_engine_enable_signaling(to_request(fence), true);
	return !i915_fence_signaled(fence);
}

static signed long i915_fence_wait(struct dma_fence *fence,
				   bool interruptible,
				   signed long timeout)
{
	return i915_wait_request(to_request(fence), interruptible, timeout);
}

static void i915_fence_release(struct dma_fence *fence)
{
	struct drm_i915_gem_request *req = to_request(fence);

	/* The request is put onto an RCU freelist (i.e. the address
	 * is immediately reused), so mark the fences as being freed now.
	 * Otherwise the debugobjects for the fences are only marked as
	 * freed when the slab cache itself is freed, and so we would get
	 * caught trying to reuse dead objects.
	 */
	i915_sw_fence_fini(&req->submit);

	kmem_cache_free(req->i915->requests, req);
}

const struct dma_fence_ops i915_fence_ops = {
	.get_driver_name = i915_fence_get_driver_name,
	.get_timeline_name = i915_fence_get_timeline_name,
	.enable_signaling = i915_fence_enable_signaling,
	.signaled = i915_fence_signaled,
	.wait = i915_fence_wait,
	.release = i915_fence_release,
};

static inline void
i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
{
	struct drm_i915_file_private *file_priv;

	file_priv = request->file_priv;
	if (!file_priv)
		return;

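	/* request->file_priv may be cleared from underneath us (e.g. when the
	 * client's file is closed), so take the client's mm.lock and re-check
	 * it before unlinking the request from the client list.
	 */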
	spin_lock(&file_priv->mm.lock);
	if (request->file_priv) {
		list_del(&request->client_link);
		request->file_priv = NULL;
	}
	spin_unlock(&file_priv->mm.lock);
}

static struct i915_dependency *
i915_dependency_alloc(struct drm_i915_private *i915)
{
	return kmem_cache_alloc(i915->dependencies, GFP_KERNEL);
}

static void
i915_dependency_free(struct drm_i915_private *i915,
		     struct i915_dependency *dep)
{
	kmem_cache_free(i915->dependencies, dep);
}

static void
__i915_priotree_add_dependency(struct i915_priotree *pt,
			       struct i915_priotree *signal,
			       struct i915_dependency *dep,
			       unsigned long flags)
{
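	/* dfs_link is only used transiently by the scheduler's iterative
	 * depth-first priority walk, so it must start out empty.
	 */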
	INIT_LIST_HEAD(&dep->dfs_link);
	list_add(&dep->wait_link, &signal->waiters_list);
	list_add(&dep->signal_link, &pt->signalers_list);
	dep->signaler = signal;
	dep->flags = flags;
}

static int
i915_priotree_add_dependency(struct drm_i915_private *i915,
			     struct i915_priotree *pt,
			     struct i915_priotree *signal)
{
	struct i915_dependency *dep;

	dep = i915_dependency_alloc(i915);
	if (!dep)
		return -ENOMEM;

	__i915_priotree_add_dependency(pt, signal, dep, I915_DEPENDENCY_ALLOC);
	return 0;
}

static void
i915_priotree_fini(struct drm_i915_private *i915, struct i915_priotree *pt)
{
	struct i915_dependency *dep, *next;

	GEM_BUG_ON(!list_empty(&pt->link));

	/*
	 * Everyone we depended upon (the fences we wait to be signaled)
	 * should retire before us and remove themselves from our list.
	 * However, retirement is run independently on each timeline and
	 * so we may be called out-of-order.
	 */
	list_for_each_entry_safe(dep, next, &pt->signalers_list, signal_link) {
		GEM_BUG_ON(!i915_priotree_signaled(dep->signaler));
		GEM_BUG_ON(!list_empty(&dep->dfs_link));

		list_del(&dep->wait_link);
		if (dep->flags & I915_DEPENDENCY_ALLOC)
			i915_dependency_free(i915, dep);
	}

	/* Remove ourselves from everyone who depends upon us */
	list_for_each_entry_safe(dep, next, &pt->waiters_list, wait_link) {
		GEM_BUG_ON(dep->signaler != pt);
		GEM_BUG_ON(!list_empty(&dep->dfs_link));

		list_del(&dep->signal_link);
		if (dep->flags & I915_DEPENDENCY_ALLOC)
			i915_dependency_free(i915, dep);
	}
}

static void
i915_priotree_init(struct i915_priotree *pt)
{
	INIT_LIST_HEAD(&pt->signalers_list);
	INIT_LIST_HEAD(&pt->waiters_list);
	INIT_LIST_HEAD(&pt->link);
	pt->priority = I915_PRIORITY_INVALID;
}

static int reset_all_global_seqno(struct drm_i915_private *i915, u32 seqno)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int ret;

	/* Carefully retire all requests without writing to the rings */
	ret = i915_gem_wait_for_idle(i915,
				     I915_WAIT_INTERRUPTIBLE |
				     I915_WAIT_LOCKED);
	if (ret)
		return ret;

	/* If the seqno wraps around, we need to clear the breadcrumb rbtree */
	for_each_engine(engine, i915, id) {
		struct i915_gem_timeline *timeline;
		struct intel_timeline *tl = engine->timeline;

		if (!i915_seqno_passed(seqno, tl->seqno)) {
			/* spin until threads are complete */
			while (intel_breadcrumbs_busy(engine))
				cond_resched();
		}

		/* Check we are idle before we fiddle with hw state! */
		GEM_BUG_ON(!intel_engine_is_idle(engine));
		GEM_BUG_ON(i915_gem_active_isset(&engine->timeline->last_request));

		/* Finally reset hw state */
		intel_engine_init_global_seqno(engine, seqno);
		tl->seqno = seqno;

		list_for_each_entry(timeline, &i915->gt.timelines, link)
			memset(timeline->engine[id].global_sync, 0,
			       sizeof(timeline->engine[id].global_sync));
	}

	return 0;
}

int i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno)
{
	struct drm_i915_private *dev_priv = to_i915(dev);

	lockdep_assert_held(&dev_priv->drm.struct_mutex);

	if (seqno == 0)
		return -EINVAL;

	/* HWS page needs to be set less than what we
	 * will inject to ring
	 */
	return reset_all_global_seqno(dev_priv, seqno - 1);
}

static void mark_busy(struct drm_i915_private *i915)
{
	if (i915->gt.awake)
		return;

	GEM_BUG_ON(!i915->gt.active_requests);

	intel_runtime_pm_get_noresume(i915);

	/*
	 * It seems that the DMC likes to transition between the DC states a
	 * lot when there are no connected displays (no active power domains)
	 * during command submission.
	 *
	 * This activity has a negative impact on the performance of the chip
	 * with huge latencies observed in the interrupt handler and elsewhere.
	 *
	 * Work around it by grabbing a GT IRQ power domain whilst there is any
	 * GT activity, preventing any DC state transitions.
	 */
	intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);

	i915->gt.awake = true;
	if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
		i915->gt.epoch = 1;

	intel_enable_gt_powersave(i915);
	i915_update_gfx_val(i915);
	if (INTEL_GEN(i915) >= 6)
		gen6_rps_busy(i915);
	i915_pmu_gt_unparked(i915);

	intel_engines_unpark(i915);

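	/* Now that the GT is marked busy, kick off the periodic hangcheck
	 * and schedule the delayed retire worker.
	 */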
	i915_queue_hangcheck(i915);

	queue_delayed_work(i915->wq,
			   &i915->gt.retire_work,
			   round_jiffies_up_relative(HZ));
}

static int reserve_engine(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = engine->i915;
	u32 active = ++engine->timeline->inflight_seqnos;
	u32 seqno = engine->timeline->seqno;
	int ret;

	/* Reservation is fine until we need to wrap around */
	if (unlikely(add_overflows(seqno, active))) {
		ret = reset_all_global_seqno(i915, 0);
		if (ret) {
			engine->timeline->inflight_seqnos--;
			return ret;
		}
	}

	if (!i915->gt.active_requests++)
		mark_busy(i915);

	return 0;
}

static void unreserve_engine(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = engine->i915;

	if (!--i915->gt.active_requests) {
		/* Cancel the mark_busy() from our reserve_engine() */
		GEM_BUG_ON(!i915->gt.awake);
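		/* Defer idling (parking) the GPU by ~100ms so that quick
		 * back-to-back submissions do not bounce it in and out of
		 * its power saving state.
		 */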
		mod_delayed_work(i915->wq,
				 &i915->gt.idle_work,
				 msecs_to_jiffies(100));
	}

	GEM_BUG_ON(!engine->timeline->inflight_seqnos);
	engine->timeline->inflight_seqnos--;
}

void i915_gem_retire_noop(struct i915_gem_active *active,
			  struct drm_i915_gem_request *request)
{
	/* Space left intentionally blank */
}

static void advance_ring(struct drm_i915_gem_request *request)
{
	unsigned int tail;

	/* We know the GPU must have read the request to have
	 * sent us the seqno + interrupt, so use the position
	 * of the tail of the request to update the last known position
	 * of the GPU head.
	 *
	 * Note this requires that we are always called in request
	 * completion order.
	 */
	if (list_is_last(&request->ring_link, &request->ring->request_list)) {
		/* We may race here with execlists resubmitting this request
		 * as we retire it. The resubmission will move the ring->tail
		 * forwards (to request->wa_tail). We either read the
		 * current value that was written to hw, or the value that
		 * is just about to be. Either works, if we miss the last two
		 * noops - they are safe to be replayed on a reset.
		 */
		tail = READ_ONCE(request->ring->tail);
	} else {
		tail = request->postfix;
	}
	list_del(&request->ring_link);

	request->ring->head = tail;
}

static void free_capture_list(struct drm_i915_gem_request *request)
{
	struct i915_gem_capture_list *capture;

	capture = request->capture_list;
	while (capture) {
		struct i915_gem_capture_list *next = capture->next;

		kfree(capture);
		capture = next;
	}
}

static void i915_gem_request_retire(struct drm_i915_gem_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	struct i915_gem_active *active, *next;

	lockdep_assert_held(&request->i915->drm.struct_mutex);
	GEM_BUG_ON(!i915_sw_fence_signaled(&request->submit));
	GEM_BUG_ON(!i915_gem_request_completed(request));
	GEM_BUG_ON(!request->i915->gt.active_requests);

	trace_i915_gem_request_retire(request);
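	/* Unlink the request from the engine's execution timeline; the
	 * timeline lock is taken with irqs off as the list is also
	 * manipulated from the submission (interrupt/tasklet) path.
	 */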
	spin_lock_irq(&engine->timeline->lock);
	list_del_init(&request->link);
	spin_unlock_irq(&engine->timeline->lock);

	unreserve_engine(request->engine);
	advance_ring(request);

	free_capture_list(request);
	/* Walk through the active list, calling retire on each. This allows
	 * objects to track their GPU activity and mark themselves as idle
	 * when their *last* active request is completed (updating state
	 * tracking lists for eviction, active references for GEM, etc).
	 *
	 * As the ->retire() may free the node, we decouple it first and
	 * pass along the auxiliary information (to avoid dereferencing
	 * the node after the callback).
	 */
	list_for_each_entry_safe(active, next, &request->active_list, link) {
		/* In microbenchmarks or focusing upon time inside the kernel,
		 * we may spend an inordinate amount of time simply handling
		 * the retirement of requests and processing their callbacks.
		 * Of which, this loop itself is particularly hot due to the
		 * cache misses when jumping around the list of i915_gem_active.
		 * So we try to keep this loop as streamlined as possible and
		 * also prefetch the next i915_gem_active to try and hide
		 * the likely cache miss.
		 */
		prefetchw(next);

		INIT_LIST_HEAD(&active->link);
RCU_INIT_POINTER(active->request, NULL);
active->retire(active, request);
}
i915_gem_request_remove_from_client(request);
/* Retirement decays the ban score as it is a sign of ctx progress */
atomic_dec_if_positive(&request->ctx->ban_score);
	/* The hardware is only finished with the backing object for a context
	 * after it has switched to the *next* context. Therefore we cannot
	 * retire the previous context until the next context has already
	 * started running. However, since we cannot take the required locks
	 * at i915_gem_request_submit(), we defer the unpinning of the active
	 * context to now, the retirement of the subsequent request.
*/
if (engine->last_retired_context)
engine->context_unpin(engine, engine->last_retired_context);
engine->last_retired_context = request->ctx;
spin_lock_irq(&request->lock);
if (request->waitboost)
atomic_dec(&request->i915->gt_pm.rps.num_waiters);
if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags))
dma_fence_signal_locked(&request->fence);
if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
intel_engine_cancel_signaling(request);
spin_unlock_irq(&request->lock);
i915_priotree_fini(request->i915, &request->priotree);
i915_gem_request_put(request);
}
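/**
 * i915_gem_request_retire_upto - retire all requests up to and including @req
 * @req: the last request to retire on its engine timeline
 *
 * Walks the engine timeline from its oldest request, retiring each one in
 * turn until @req itself has been retired. The caller must hold struct_mutex
 * and @req must already have completed.
 */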
void i915_gem_request_retire_upto(struct drm_i915_gem_request *req)
{
struct intel_engine_cs *engine = req->engine;
struct drm_i915_gem_request *tmp;
lockdep_assert_held(&req->i915->drm.struct_mutex);
GEM_BUG_ON(!i915_gem_request_completed(req));
if (list_empty(&req->link))
return;
do {
tmp = list_first_entry(&engine->timeline->requests,
typeof(*tmp), link);
i915_gem_request_retire(tmp);
} while (tmp != req);
}
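/* Claim the next seqno on the given timeline; access to the timeline is
 * expected to be serialised by the caller (e.g. under the timeline lock).
 */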
static u32 timeline_get_seqno(struct intel_timeline *tl)
{
return ++tl->seqno;
}
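/* Move the request from its per-context timeline onto the engine's global
 * timeline, assign it a global seqno and emit the breadcrumb into the ring.
 * Callers must hold the engine timeline lock with interrupts disabled;
 * i915_gem_request_submit() below provides the irq-safe wrapper.
 */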
void __i915_gem_request_submit(struct drm_i915_gem_request *request)
{
struct intel_engine_cs *engine = request->engine;
struct intel_timeline *timeline;
u32 seqno;
GEM_BUG_ON(!irqs_disabled());
lockdep_assert_held(&engine->timeline->lock);
trace_i915_gem_request_execute(request);
/* Transfer from per-context onto the global per-engine timeline */
timeline = engine->timeline;
GEM_BUG_ON(timeline == request->timeline);
GEM_BUG_ON(request->global_seqno);
seqno = timeline_get_seqno(timeline);
GEM_BUG_ON(!seqno);
GEM_BUG_ON(i915_seqno_passed(intel_engine_get_seqno(engine), seqno));
/* We may be recursing from the signal callback of another i915 fence */
spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
request->global_seqno = seqno;
if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
intel_engine_enable_signaling(request, false);
spin_unlock(&request->lock);
engine->emit_breadcrumb(request,
request->ring->vaddr + request->postfix);
spin_lock(&request->timeline->lock);
list_move_tail(&request->link, &timeline->requests);
spin_unlock(&request->timeline->lock);
wake_up_all(&request->execute);
}
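/* Irq-safe wrapper that takes the engine timeline lock around
 * __i915_gem_request_submit().
 */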
void i915_gem_request_submit(struct drm_i915_gem_request *request)
{
struct intel_engine_cs *engine = request->engine;
unsigned long flags;
/* Will be called from irq-context when using foreign fences. */
spin_lock_irqsave(&engine->timeline->lock, flags);
__i915_gem_request_submit(request);
spin_unlock_irqrestore(&engine->timeline->lock, flags);
}
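/* Reverse of __i915_gem_request_submit(): withdraw the request from the
 * engine's global timeline back onto its per-context timeline and clear
 * its global seqno. Callers must hold the engine timeline lock with
 * interrupts disabled.
 */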
void __i915_gem_request_unsubmit(struct drm_i915_gem_request *request)
{
struct intel_engine_cs *engine = request->engine;
struct intel_timeline *timeline;
GEM_BUG_ON(!irqs_disabled());
lockdep_assert_held(&engine->timeline->lock);
/* Only unwind in reverse order, required so that the per-context list
* is kept in seqno/ring order.
*/
GEM_BUG_ON(!request->global_seqno);
GEM_BUG_ON(request->global_seqno != engine->timeline->seqno);
GEM_BUG_ON(i915_seqno_passed(intel_engine_get_seqno(engine),
request->global_seqno));
engine->timeline->seqno--;
/* We may be recursing from the signal callback of another i915 fence */
spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
request->global_seqno = 0;
if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
intel_engine_cancel_signaling(request);
spin_unlock(&request->lock);
/* Transfer back from the global per-engine timeline to per-context */
timeline = request->timeline;
GEM_BUG_ON(timeline == engine->timeline);
spin_lock(&timeline->lock);
list_move(&request->link, &timeline->requests);
spin_unlock(&timeline->lock);
/* We don't need to wake_up any waiters on request->execute, they
* will get woken by any other event or us re-adding this request
* to the engine timeline (__i915_gem_request_submit()). The waiters
	 * should be quite adept at finding that the request now has a new
	 * global_seqno compared to the one they went to sleep on.
*/
}
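/* Irq-safe wrapper that takes the engine timeline lock around
 * __i915_gem_request_unsubmit().
 */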
void i915_gem_request_unsubmit(struct drm_i915_gem_request *request)
{
struct intel_engine_cs *engine = request->engine;
unsigned long flags;
/* Will be called from irq-context when using foreign fences. */
spin_lock_irqsave(&engine->timeline->lock, flags);
__i915_gem_request_unsubmit(request);
spin_unlock_irqrestore(&engine->timeline->lock, flags);
}
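/* i915_sw_fence callback for request->submit: on FENCE_COMPLETE (all submit
 * dependencies signalled) the request is handed to the engine's
 * submit_request() backend; on FENCE_FREE the reference taken for the fence
 * chain is dropped.
 */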
static int __i915_sw_fence_call
submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
{
struct drm_i915_gem_request *request =
container_of(fence, typeof(*request), submit);
switch (state) {
case FENCE_COMPLETE:
trace_i915_gem_request_submit(request);
/*
* We need to serialize use of the submit_request() callback with its
* hotplugging performed during an emergency i915_gem_set_wedged().
* We use the RCU mechanism to mark the critical section in order to
* force i915_gem_set_wedged() to wait until the submit_request() is
* completed before proceeding.
*/
rcu_read_lock();
request->engine->submit_request(request);
rcu_read_unlock();
break;
case FENCE_FREE:
i915_gem_request_put(request);
break;
}
return NOTIFY_DONE;
}
/**
* i915_gem_request_alloc - allocate a request structure
*
* @engine: engine that we wish to issue the request on.
* @ctx: context that the request will be associated with.
*
* Returns a pointer to the allocated request if successful,
* or an error code if not.
*/
struct drm_i915_gem_request *
i915_gem_request_alloc(struct intel_engine_cs *engine,
struct i915_gem_context *ctx)
{
struct drm_i915_private *dev_priv = engine->i915;
struct drm_i915_gem_request *req;
struct intel_ring *ring;
int ret;
lockdep_assert_held(&dev_priv->drm.struct_mutex);
/*
* Preempt contexts are reserved for exclusive use to inject a
* preemption context switch. They are never to be used for any trivial
* request!
*/
GEM_BUG_ON(ctx == dev_priv->preempt_context);
/* ABI: Before userspace accesses the GPU (e.g. execbuffer), report
* EIO if the GPU is already wedged.
*/
if (i915_terminally_wedged(&dev_priv->gpu_error))
return ERR_PTR(-EIO);
/* Pinning the contexts may generate requests in order to acquire
* GGTT space, so do this first before we reserve a seqno for
* ourselves.
*/
ring = engine->context_pin(engine, ctx);
if (IS_ERR(ring))
return ERR_CAST(ring);
GEM_BUG_ON(!ring);
ret = reserve_engine(engine);
if (ret)
goto err_unpin;
ret = intel_ring_wait_for_space(ring, MIN_SPACE_FOR_ADD_REQUEST);
if (ret)
goto err_unreserve;
/* Move the oldest request to the slab-cache (if not in use!) */
req = list_first_entry_or_null(&engine->timeline->requests,
typeof(*req), link);
if (req && i915_gem_request_completed(req))
i915_gem_request_retire(req);
/* Beware: Dragons be flying overhead.
*
* We use RCU to look up requests in flight. The lookups may
* race with the request being allocated from the slab freelist.
	 * That is, the request we are writing to here may be in the process
* of being read by __i915_gem_active_get_rcu(). As such,
* we have to be very careful when overwriting the contents. During
	 * the RCU lookup, we chase the request->engine pointer,
* read the request->global_seqno and increment the reference count.
*
* The reference count is incremented atomically. If it is zero,
* the lookup knows the request is unallocated and complete. Otherwise,
* it is either still in use, or has been reallocated and reset
* with dma_fence_init(). This increment is safe for release as we
	 * check that the request we have a reference to matches the active
* request.
*
* Before we increment the refcount, we chase the request->engine
* pointer. We must not call kmem_cache_zalloc() or else we set
* that pointer to NULL and cause a crash during the lookup. If
* we see the request is completed (based on the value of the
* old engine and seqno), the lookup is complete and reports NULL.
* If we decide the request is not completed (new engine or seqno),
	 * then we grab a reference and double check that it is still the
	 * active request - which it will not be if it has been reallocated,
	 * in which case we restart the lookup.
*
* Do not use kmem_cache_zalloc() here!
*/
req = kmem_cache_alloc(dev_priv->requests,
GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
if (unlikely(!req)) {
/* Ratelimit ourselves to prevent oom from malicious clients */
ret = i915_gem_wait_for_idle(dev_priv,
I915_WAIT_LOCKED |
I915_WAIT_INTERRUPTIBLE);
if (ret)
goto err_unreserve;
/*
* We've forced the client to stall and catch up with whatever
* backlog there might have been. As we are assuming that we
* caused the mempressure, now is an opportune time to
* recover as much memory from the request pool as is possible.
* Having already penalized the client to stall, we spend
* a little extra time to re-optimise page allocation.
*/
kmem_cache_shrink(dev_priv->requests);
rcu_barrier(); /* Recover the TYPESAFE_BY_RCU pages */
req = kmem_cache_alloc(dev_priv->requests, GFP_KERNEL);
if (!req) {
ret = -ENOMEM;
goto err_unreserve;
}
}
req->timeline = i915_gem_context_lookup_timeline(ctx, engine);
GEM_BUG_ON(req->timeline == engine->timeline);
spin_lock_init(&req->lock);
dma_fence_init(&req->fence,
&i915_fence_ops,
&req->lock,
req->timeline->fence_context,
timeline_get_seqno(req->timeline));
/* We bump the ref for the fence chain */
i915_sw_fence_init(&i915_gem_request_get(req)->submit, submit_notify);
init_waitqueue_head(&req->execute);
i915_priotree_init(&req->priotree);
INIT_LIST_HEAD(&req->active_list);
req->i915 = dev_priv;
req->engine = engine;
req->ctx = ctx;
req->ring = ring;
/* No zalloc, must clear what we need by hand */
req->global_seqno = 0;
req->signaling.wait.seqno = 0;
req->file_priv = NULL;
req->batch = NULL;
req->capture_list = NULL;
req->waitboost = false;
/*
* Reserve space in the ring buffer for all the commands required to
* eventually emit this request. This is to guarantee that the
* i915_add_request() call can't fail. Note that the reserve may need
* to be redone if the request is not actually submitted straight
* away, e.g. because a GPU scheduler has deferred it.
*/
req->reserved_space = MIN_SPACE_FOR_ADD_REQUEST;
GEM_BUG_ON(req->reserved_space < engine->emit_breadcrumb_sz);
/*
* Record the position of the start of the request so that
* should we detect the updated seqno part-way through the
* GPU processing the request, we never over-estimate the
* position of the head.
*/
req->head = req->ring->emit;
/* Unconditionally invalidate GPU caches and TLBs. */
ret = engine->emit_flush(req, EMIT_INVALIDATE);
if (ret)
goto err_unwind;
ret = engine->request_alloc(req);
if (ret)
goto err_unwind;
/* Check that we didn't interrupt ourselves with a new request */
GEM_BUG_ON(req->timeline->seqno != req->fence.seqno);
return req;
err_unwind:
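/* Rewind the ring back to the start of this request, discarding any
* commands (e.g. the cache invalidation) emitted before the failure.
*/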
req->ring->emit = req->head;
/* Make sure we didn't add ourselves to external state before freeing */
GEM_BUG_ON(!list_empty(&req->active_list));
GEM_BUG_ON(!list_empty(&req->priotree.signalers_list));
GEM_BUG_ON(!list_empty(&req->priotree.waiters_list));
kmem_cache_free(dev_priv->requests, req);
err_unreserve:
unreserve_engine(engine);
err_unpin:
engine->context_unpin(engine, ctx);
return ERR_PTR(ret);
}
static int
i915_gem_request_await_request(struct drm_i915_gem_request *to,
struct drm_i915_gem_request *from)
{
int ret;
GEM_BUG_ON(to == from);
GEM_BUG_ON(to->timeline == from->timeline);
if (i915_gem_request_completed(from))
return 0;
if (to->engine->schedule) {
ret = i915_priotree_add_dependency(to->i915,
&to->priotree,
&from->priotree);
if (ret < 0)
return ret;
}
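/* Requests submitted to the same engine are executed in submission
* order, so ordering our submission after the dependency's submission
* is sufficient; no completion wait or semaphore is needed here.
*/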
if (to->engine == from->engine) {
ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
&from->submit,
I915_FENCE_GFP);
return ret < 0 ? ret : 0;
}
if (to->engine->semaphore.sync_to) {
u32 seqno;
GEM_BUG_ON(!from->engine->semaphore.signal);
seqno = i915_gem_request_global_seqno(from);
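/* If the dependency has not yet been assigned a global seqno, we
* cannot emit a semaphore wait for it and instead fall back to a
* software await on its completion fence.
*/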
if (!seqno)
goto await_dma_fence;
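/* Skip the semaphore if we have already synchronised this timeline
* against the same or a later seqno from that engine; otherwise emit
* the wait and record the new sync point below.
*/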
if (seqno <= to->timeline->global_sync[from->engine->id])
return 0;
trace_i915_gem_ring_sync_to(to, from);
ret = to->engine->semaphore.sync_to(to, from);
if (ret)
return ret;
to->timeline->global_sync[from->engine->id] = seqno;
return 0;
}
await_dma_fence:
ret = i915_sw_fence_await_dma_fence(&to->submit,
&from->fence, 0,
I915_FENCE_GFP);
return ret < 0 ? ret : 0;
}
int
i915_gem_request_await_dma_fence(struct drm_i915_gem_request *req,
struct dma_fence *fence)
{
struct dma_fence **child = &fence;
unsigned int nchild = 1;
int ret;
/* Note that if the fence-array was created in signal-on-any mode,
* we should *not* decompose it into its individual fences. However,
* we don't currently store which mode the fence-array is operating
* in. Fortunately, the only user of signal-on-any is private to
* amdgpu and we should not see any incoming fence-array from
* sync-file being in signal-on-any mode.
*/
if (dma_fence_is_array(fence)) {
struct dma_fence_array *array = to_dma_fence_array(fence);
child = array->fences;
nchild = array->num_fences;
GEM_BUG_ON(!nchild);
}
do {
fence = *child++;
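/* A fence that has already signaled imposes no ordering constraint
* and can be skipped.
*/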
if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
continue;
/*
* Requests on the same timeline are explicitly ordered, along
* with their dependencies, by i915_add_request() which ensures
* that requests are submitted in-order through each ring.
*/
if (fence->context == req->fence.context)
continue;
/* Squash repeated waits to the same timelines */
if (fence->context != req->i915->mm.unordered_timeline &&
intel_timeline_sync_is_later(req->timeline, fence))
continue;
if (dma_fence_is_i915(fence))
ret = i915_gem_request_await_request(req,
to_request(fence));
else
ret = i915_sw_fence_await_dma_fence(&req->submit, fence,
I915_FENCE_TIMEOUT,
I915_FENCE_GFP);
if (ret < 0)
return ret;
/* Record the latest fence used against each timeline */
if (fence->context != req->i915->mm.unordered_timeline)
intel_timeline_sync_set(req->timeline, fence);
} while (--nchild);
return 0;
}
/**
* i915_gem_request_await_object - set this request to (async) wait upon a bo
*
* @to: request we are wishing to use
* @obj: object which may be in use on another ring.
*
* This code is meant to abstract object synchronization with the GPU.
* Conceptually we serialise writes between engines inside the GPU.
* We only allow one engine to write into a buffer at any time, but allow
* multiple readers. To ensure each has a coherent view of memory, we must:
*
* - If there is an outstanding write request to the object, the new
* request must wait for it to complete (either CPU or in hw, requests
* on the same ring will be naturally ordered).
*
* - If we are a write request (@write is set), the new
* request must wait for outstanding read requests to complete.
*
* Returns 0 if successful, else propagates up the lower layer error.
*/
int
i915_gem_request_await_object(struct drm_i915_gem_request *to,
struct drm_i915_gem_object *obj,
bool write)
{
struct dma_fence *excl;
int ret = 0;
if (write) {
struct dma_fence **shared;
unsigned int count, i;
ret = reservation_object_get_fences_rcu(obj->resv,
&excl, &count, &shared);
if (ret)
return ret;
for (i = 0; i < count; i++) {
ret = i915_gem_request_await_dma_fence(to, shared[i]);
if (ret)
break;
dma_fence_put(shared[i]);
}
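/* On early exit from the loop above, release the references to the
* fences we did not consume.
*/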
for (; i < count; i++)
dma_fence_put(shared[i]);
kfree(shared);
} else {
excl = reservation_object_get_excl_rcu(obj->resv);
}
if (excl) {
if (ret == 0)
ret = i915_gem_request_await_dma_fence(to, excl);
dma_fence_put(excl);
}
return ret;
}
/*
* NB: This function is not allowed to fail. Doing so would mean the
* request is not being tracked for completion but the work itself is
* going to happen on the hardware. This would be a Bad Thing(tm).
*/
void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
{
struct intel_engine_cs *engine = request->engine;
struct intel_ring *ring = request->ring;
struct intel_timeline *timeline = request->timeline;
struct drm_i915_gem_request *prev;
u32 *cs;
int err;
lockdep_assert_held(&request->i915->drm.struct_mutex);
trace_i915_gem_request_add(request);
/* Make sure that no request gazumped us - if it was allocated after
* our i915_gem_request_alloc() and called __i915_add_request() before
* us, the timeline will hold its seqno which is later than ours.
*/
GEM_BUG_ON(timeline->seqno != request->fence.seqno);
/*
* To ensure that this call will not fail, space for its emissions
* should already have been reserved in the ring buffer. Let the ring
* know that it is time to use that space up.
*/
request->reserved_space = 0;
/*
* Emit any outstanding flushes - execbuf can fail to emit the flush
* after having emitted the batchbuffer command. Hence we need to fix
* things up similar to emitting the lazy request. The difference here
* is that the flush _must_ happen before the next request, no matter
* what.
*/
if (flush_caches) {
err = engine->emit_flush(request, EMIT_FLUSH);
/* Not allowed to fail! */
WARN(err, "engine->emit_flush() failed: %d!\n", err);
}
/* Record the position of the start of the breadcrumb so that
* should we detect the updated seqno part-way through the
* GPU processing the request, we never over-estimate the
* position of the ring's HEAD.
*/
cs = intel_ring_begin(request, engine->emit_breadcrumb_sz);
GEM_BUG_ON(IS_ERR(cs));
request->postfix = intel_ring_offset(request, cs);
/* Seal the request and mark it as pending execution. Note that
* we may inspect this state, without holding any locks, during
* hangcheck. Hence we apply the barrier to ensure that we do not
* see a more recent value in the hws than we are tracking.
*/
prev = i915_gem_active_raw(&timeline->last_request,
&request->i915->drm.struct_mutex);
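/* Chain our submission onto the previous request on this timeline so
* that requests are submitted to the ring in order, and record the
* dependency for the scheduler (if one is active).
*/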
if (prev) {
i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
&request->submitq);
if (engine->schedule)
__i915_priotree_add_dependency(&request->priotree,
&prev->priotree,
&request->dep,
0);
}
spin_lock_irq(&timeline->lock);
list_add_tail(&request->link, &timeline->requests);
spin_unlock_irq(&timeline->lock);
GEM_BUG_ON(timeline->seqno != request->fence.seqno);
i915_gem_active_set(&timeline->last_request, request);
list_add_tail(&request->ring_link, &ring->request_list);
request->emitted_jiffies = jiffies;
/* Let the backend know a new request has arrived that may need
* to adjust the existing execution schedule due to a high priority
* request - i.e. we may want to preempt the current request in order
* to run a high priority dependency chain *before* we can execute this
* request.
*
* This is called before the request is ready to run so that we can
* decide whether to preempt the entire chain so that it is ready to
* run at the earliest possible convenience.
*/
if (engine->schedule)
engine->schedule(request, request->ctx->priority);
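/* Commit the request for submission with bottom halves disabled so
* that a tasklet scheduled by the submit fence (e.g. for execlists
* submission) is kicked as soon as we re-enable them below.
*/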
local_bh_disable();
i915_sw_fence_commit(&request->submit);
local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
}
static unsigned long local_clock_us(unsigned int *cpu)
{
unsigned long t;
/* Cheaply and approximately convert from nanoseconds to microseconds.
* The result and subsequent calculations are also defined in the same
* approximate microseconds units. The principal source of timing
* error here is from the simple truncation.
*
* Note that local_clock() is only defined with respect to the current CPU;
* the comparisons are no longer valid if we switch CPUs. Instead of
* blocking preemption for the entire busywait, we can detect the CPU
* switch and use that as indicator of system load and a reason to
* stop busywaiting, see busywait_stop().
*/
*cpu = get_cpu();
t = local_clock() >> 10;
put_cpu();
return t;
}
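/*
* Worked example for the conversion above (added note): shifting right
* by 10 divides by 1024 rather than 1000, so 10,000ns reads as 9us
* instead of 10us, a systematic error of roughly 2.4% on top of the
* truncation, which is acceptable for this heuristic.
*/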
static bool busywait_stop(unsigned long timeout, unsigned int cpu)
{
unsigned int this_cpu;
if (time_after(local_clock_us(&this_cpu), timeout))
return true;
return this_cpu != cpu;
}
static bool __i915_spin_request(const struct drm_i915_gem_request *req,
u32 seqno, int state, unsigned long timeout_us)
{
struct intel_engine_cs *engine = req->engine;
unsigned int irq, cpu;
GEM_BUG_ON(!seqno);
/*
* Only wait for the request if we know it is likely to complete.
*
* We don't track the timestamps around requests, nor the average
* request length, so we do not have a good indicator that this
* request will complete within the timeout. What we do know is the
* order in which requests are executed by the engine and so we can
* tell if the request has started. If the request hasn't started yet,
* it is a fair assumption that it will not complete within our
* relatively short timeout.
*/
if (!i915_seqno_passed(intel_engine_get_seqno(engine), seqno - 1))
return false;
/* When waiting for high frequency requests, e.g. during synchronous
* rendering split between the CPU and GPU, the finite amount of time
* required to set up the irq and wait upon it limits the response
* rate. By busywaiting on the request completion for a short while we
* can service the high frequency waits as quickly as possible. However,
* if it is a slow request, we want to sleep as quickly as possible.
* The tradeoff between waiting and sleeping is roughly the time it
* takes to sleep on a request, on the order of a microsecond.
*/
irq = atomic_read(&engine->irq_count);
timeout_us += local_clock_us(&cpu);
do {
if (i915_seqno_passed(intel_engine_get_seqno(engine), seqno))
return seqno == i915_gem_request_global_seqno(req);
/* Seqnos are meant to be ordered *before* the interrupt. If
* we see an interrupt without a corresponding seqno advance,
* assume we won't see one in the near future but require
* the engine->seqno_barrier() to fix up coherency.
*/
if (atomic_read(&engine->irq_count) != irq)
break;
if (signal_pending_state(state, current))
break;
if (busywait_stop(timeout_us, cpu))
break;
cpu_relax();
} while (!need_resched());
return false;
}
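/*
* Usage note (added): the waiters below spin with a budget of ~5us
* before arming the interrupt and ~2us after each wakeup, matching the
* "order of a microsecond" sleep cost described above.
*/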
static bool __i915_wait_request_check_and_reset(struct drm_i915_gem_request *request)
{
if (likely(!i915_reset_handoff(&request->i915->gpu_error)))
return false;
__set_current_state(TASK_RUNNING);
i915_reset(request->i915, 0);
return true;
}
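/*
* Sketch of the handoff (added paraphrase, not authoritative): when the
* error handler cannot perform the reset itself, it hands it over to a
* waiter holding struct_mutex; that waiter drops out of its sleep here
* and calls i915_reset() on the error handler's behalf.
*/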
/**
* i915_wait_request - wait until execution of request has finished
* @req: the request to wait upon
* @flags: how to wait
* @timeout: how long to wait in jiffies
*
* i915_wait_request() waits for the request to be completed, for a
* maximum of @timeout jiffies (with MAX_SCHEDULE_TIMEOUT implying an
* unbounded wait).
*
* If the caller holds the struct_mutex, the caller must pass I915_WAIT_LOCKED
* in via the flags; conversely, if the struct_mutex is not held, the caller
* must not specify that the wait is locked.
*
* Returns the remaining time (in jiffies) if the request completed, which may
* be zero, or -ETIME if the request is unfinished after the timeout expires.
* May return -EINTR if called with I915_WAIT_INTERRUPTIBLE and a signal is
* pending before the request completes.
*/
long i915_wait_request(struct drm_i915_gem_request *req,
unsigned int flags,
long timeout)
{
const int state = flags & I915_WAIT_INTERRUPTIBLE ?
TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
wait_queue_head_t *errq = &req->i915->gpu_error.wait_queue;
DEFINE_WAIT_FUNC(reset, default_wake_function);
DEFINE_WAIT_FUNC(exec, default_wake_function);
struct intel_wait wait;
might_sleep();
#if IS_ENABLED(CONFIG_LOCKDEP)
GEM_BUG_ON(debug_locks &&
!!lockdep_is_held(&req->i915->drm.struct_mutex) !=
!!(flags & I915_WAIT_LOCKED));
#endif
GEM_BUG_ON(timeout < 0);
if (i915_gem_request_completed(req))
return timeout;
if (!timeout)
return -ETIME;
trace_i915_gem_request_wait_begin(req, flags);
add_wait_queue(&req->execute, &exec);
if (flags & I915_WAIT_LOCKED)
add_wait_queue(errq, &reset);
intel_wait_init(&wait, req);
restart:
do {
set_current_state(state);
if (intel_wait_update_request(&wait, req))
break;
if (flags & I915_WAIT_LOCKED &&
__i915_wait_request_check_and_reset(req))
continue;
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
goto complete;
}
if (!timeout) {
timeout = -ETIME;
goto complete;
}
timeout = io_schedule_timeout(timeout);
} while (1);
GEM_BUG_ON(!intel_wait_has_seqno(&wait));
GEM_BUG_ON(!i915_sw_fence_signaled(&req->submit));
/* Optimistic short spin before touching IRQs */
if (__i915_spin_request(req, wait.seqno, state, 5))
goto complete;
set_current_state(state);
if (intel_engine_add_wait(req->engine, &wait))
/* In order to check that we haven't missed the interrupt
* as we enabled it, we need to kick ourselves to do a
* coherent check on the seqno before we sleep.
*/
goto wakeup;
if (flags & I915_WAIT_LOCKED)
__i915_wait_request_check_and_reset(req);
for (;;) {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
if (!timeout) {
timeout = -ETIME;
break;
}
timeout = io_schedule_timeout(timeout);
if (intel_wait_complete(&wait) &&
intel_wait_check_request(&wait, req))
break;
set_current_state(state);
wakeup:
/* Carefully check if the request is complete, giving time
* for the seqno to be visible following the interrupt.
* We also have to check in case we are kicked by the GPU
* reset in order to drop the struct_mutex.
*/
if (__i915_request_irq_complete(req))
break;
/* If the GPU is hung, and we hold the lock, reset the GPU
* and then check for completion. On a full reset, the engine's
* HW seqno will be advanced past us and we are complete.
* If we do a partial reset, we have to wait for the GPU to
* resume and update the breadcrumb.
*
* If we don't hold the mutex, we can just wait for the worker
* to come along and update the breadcrumb (either directly
* itself, or indirectly by recovering the GPU).
*/
if (flags & I915_WAIT_LOCKED &&
__i915_wait_request_check_and_reset(req))
continue;
/* Only spin if we know the GPU is processing this request */
if (__i915_spin_request(req, wait.seqno, state, 2))
break;
if (!intel_wait_check_request(&wait, req)) {
intel_engine_remove_wait(req->engine, &wait);
goto restart;
}
}
intel_engine_remove_wait(req->engine, &wait);
complete:
__set_current_state(TASK_RUNNING);
if (flags & I915_WAIT_LOCKED)
remove_wait_queue(errq, &reset);
remove_wait_queue(&req->execute, &exec);
trace_i915_gem_request_wait_end(req);
return timeout;
}
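/*
* Example (hypothetical caller, for illustration only): a blocking wait
* under the struct_mutex might look like
*
*	err = i915_wait_request(rq,
*				I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
*				MAX_SCHEDULE_TIMEOUT);
*	if (err < 0)
*		return err;
*
* where a negative value is -ETIME or -ERESTARTSYS and a non-negative
* value is the remaining timeout in jiffies.
*/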
static void engine_retire_requests(struct intel_engine_cs *engine)
{
struct drm_i915_gem_request *request, *next;
u32 seqno = intel_engine_get_seqno(engine);
LIST_HEAD(retire);
spin_lock_irq(&engine->timeline->lock);
list_for_each_entry_safe(request, next,
&engine->timeline->requests, link) {
if (!i915_seqno_passed(seqno, request->global_seqno))
break;
list_move_tail(&request->link, &retire);
}
spin_unlock_irq(&engine->timeline->lock);
list_for_each_entry_safe(request, next, &retire, link)
i915_gem_request_retire(request);
}
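/*
* Design note (added): requests are first moved onto a local list under
* the timeline spinlock and only retired afterwards, because
* i915_gem_request_retire() does work that cannot be done with the
* irq-off spinlock held.
*/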
void i915_gem_retire_requests(struct drm_i915_private *dev_priv)
{
struct intel_engine_cs *engine;
enum intel_engine_id id;
lockdep_assert_held(&dev_priv->drm.struct_mutex);
if (!dev_priv->gt.active_requests)
return;
for_each_engine(engine, dev_priv, id)
engine_retire_requests(engine);
}
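/*
* Added note: the active_requests check above is a fast path so that an
* idle GPU does not pay for walking every engine on each retire call.
*/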
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftests/mock_request.c"
#include "selftests/i915_gem_request.c"
#endif