2016-07-13 15:03:40 +00:00
|
|
|
/*
|
|
|
|
* Copyright © 2016 Intel Corporation
|
|
|
|
*
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
*
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
* Software.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2017-10-09 11:02:57 +00:00
|
|
|
#include <drm/drm_print.h>
|
|
|
|
|
2019-05-28 09:29:49 +00:00
|
|
|
#include "gem/i915_gem_context.h"
|
|
|
|
|
2016-07-13 15:03:40 +00:00
|
|
|
#include "i915_drv.h"
|
2019-04-24 17:48:39 +00:00
|
|
|
|
2019-06-21 07:07:44 +00:00
|
|
|
#include "gt/intel_gt.h"
|
|
|
|
|
2019-04-24 17:48:39 +00:00
|
|
|
#include "intel_engine.h"
|
drm/i915: Invert the GEM wakeref hierarchy
In the current scheme, on submitting a request we take a single global
GEM wakeref, which trickles down to wake up all GT power domains. This
is undesirable as we would like to be able to localise our power
management to the available power domains and to remove the global GEM
operations from the heart of the driver. (The intent there is to push
global GEM decisions to the boundary as used by the GEM user interface.)
Now during request construction, each request is responsible via its
logical context to acquire a wakeref on each power domain it intends to
utilize. Currently, each request takes a wakeref on the engine(s) and
the engines themselves take a chipset wakeref. This gives us a
transition on each engine which we can extend if we want to insert more
power management control (such as soft rc6). The global GEM operations
that currently require a struct_mutex are reduced to listening to pm
events from the chipset GT wakeref. As we reduce the struct_mutex
requirement, these listeners should evaporate.
Perhaps the biggest immediate change is that this removes the
struct_mutex requirement around GT power management, allowing us greater
flexibility in request construction. Another important knock-on effect,
is that by tracking engine usage, we can insert a switch back to the
kernel context on that engine immediately, avoiding any extra delay or
inserting global synchronisation barriers. This makes tracking when an
engine and its associated contexts are idle much easier -- important for
when we forgo our assumed execution ordering and need idle barriers to
unpin used contexts. In the process, it means we remove a large chunk of
code whose only purpose was to switch back to the kernel context.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Imre Deak <imre.deak@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190424200717.1686-5-chris@chris-wilson.co.uk
2019-04-24 20:07:17 +00:00
|
|
|
#include "intel_engine_pm.h"
|
2019-08-04 12:48:26 +00:00
|
|
|
#include "intel_engine_pool.h"
|
2019-08-06 12:43:00 +00:00
|
|
|
#include "intel_engine_user.h"
|
2019-05-28 09:29:49 +00:00
|
|
|
#include "intel_context.h"
|
2016-07-13 15:03:40 +00:00
|
|
|
#include "intel_lrc.h"
|
2019-04-24 17:48:39 +00:00
|
|
|
#include "intel_reset.h"
|
2016-07-13 15:03:40 +00:00
|
|
|
|
2017-04-28 07:53:36 +00:00
|
|
|
/*
 * Haswell does have the CXT_SIZE register, however it does not appear to be
 * valid. Instead, the docs describe (in dwords) what is in the context
 * object. The full size is 70720 bytes, however, the power context and
 * execlist context will never be saved (the power context is stored
 * elsewhere, and execlists don't work on HSW) - so the final size, including
 * the extra state required for the Resource Streamer, is 66944 bytes, which
 * rounds up to 17 pages.
 */
#define HSW_CXT_TOTAL_SIZE		(17 * PAGE_SIZE)

/*
 * Render class context image sizes, selected per gen by
 * intel_engine_context_size(). The DEFAULT value is what is returned for a
 * gen that has no explicit case there.
 */
#define DEFAULT_LR_CONTEXT_RENDER_SIZE	(22 * PAGE_SIZE)
#define GEN8_LR_CONTEXT_RENDER_SIZE	(20 * PAGE_SIZE)
#define GEN9_LR_CONTEXT_RENDER_SIZE	(22 * PAGE_SIZE)
#define GEN10_LR_CONTEXT_RENDER_SIZE	(18 * PAGE_SIZE)
#define GEN11_LR_CONTEXT_RENDER_SIZE	(14 * PAGE_SIZE)

/* Context image size used for every non-render engine class on gen8+. */
#define GEN8_LR_CONTEXT_OTHER_SIZE	(2 * PAGE_SIZE)
|
|
|
|
|
2018-03-14 18:26:50 +00:00
|
|
|
#define MAX_MMIO_BASES 3
|
2017-04-10 14:34:32 +00:00
|
|
|
struct engine_info {
|
2017-03-01 20:26:15 +00:00
|
|
|
unsigned int hw_id;
|
2017-04-10 14:34:29 +00:00
|
|
|
u8 class;
|
|
|
|
u8 instance;
|
2018-03-14 18:26:50 +00:00
|
|
|
/* mmio bases table *must* be sorted in reverse gen order */
|
|
|
|
struct engine_mmio_base {
|
|
|
|
u32 gen : 8;
|
|
|
|
u32 base : 24;
|
|
|
|
} mmio_bases[MAX_MMIO_BASES];
|
2017-04-10 14:34:32 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
static const struct engine_info intel_engines[] = {
|
2019-03-05 18:03:30 +00:00
|
|
|
[RCS0] = {
|
|
|
|
.hw_id = RCS0_HW,
|
2017-04-10 14:34:29 +00:00
|
|
|
.class = RENDER_CLASS,
|
|
|
|
.instance = 0,
|
2018-03-14 18:26:50 +00:00
|
|
|
.mmio_bases = {
|
|
|
|
{ .gen = 1, .base = RENDER_RING_BASE }
|
|
|
|
},
|
2016-07-13 15:03:40 +00:00
|
|
|
},
|
2019-03-05 18:03:30 +00:00
|
|
|
[BCS0] = {
|
|
|
|
.hw_id = BCS0_HW,
|
2017-04-10 14:34:29 +00:00
|
|
|
.class = COPY_ENGINE_CLASS,
|
|
|
|
.instance = 0,
|
2018-03-14 18:26:50 +00:00
|
|
|
.mmio_bases = {
|
|
|
|
{ .gen = 6, .base = BLT_RING_BASE }
|
|
|
|
},
|
2016-07-13 15:03:40 +00:00
|
|
|
},
|
2019-03-05 18:03:30 +00:00
|
|
|
[VCS0] = {
|
|
|
|
.hw_id = VCS0_HW,
|
2017-04-10 14:34:29 +00:00
|
|
|
.class = VIDEO_DECODE_CLASS,
|
|
|
|
.instance = 0,
|
2018-03-14 18:26:50 +00:00
|
|
|
.mmio_bases = {
|
|
|
|
{ .gen = 11, .base = GEN11_BSD_RING_BASE },
|
|
|
|
{ .gen = 6, .base = GEN6_BSD_RING_BASE },
|
|
|
|
{ .gen = 4, .base = BSD_RING_BASE }
|
|
|
|
},
|
2016-07-13 15:03:40 +00:00
|
|
|
},
|
2019-03-05 18:03:30 +00:00
|
|
|
[VCS1] = {
|
|
|
|
.hw_id = VCS1_HW,
|
2017-04-10 14:34:29 +00:00
|
|
|
.class = VIDEO_DECODE_CLASS,
|
|
|
|
.instance = 1,
|
2018-03-14 18:26:50 +00:00
|
|
|
.mmio_bases = {
|
|
|
|
{ .gen = 11, .base = GEN11_BSD2_RING_BASE },
|
|
|
|
{ .gen = 8, .base = GEN8_BSD2_RING_BASE }
|
|
|
|
},
|
2016-07-13 15:03:40 +00:00
|
|
|
},
|
2019-03-05 18:03:30 +00:00
|
|
|
[VCS2] = {
|
|
|
|
.hw_id = VCS2_HW,
|
2018-03-02 16:14:57 +00:00
|
|
|
.class = VIDEO_DECODE_CLASS,
|
|
|
|
.instance = 2,
|
2018-03-14 18:26:50 +00:00
|
|
|
.mmio_bases = {
|
|
|
|
{ .gen = 11, .base = GEN11_BSD3_RING_BASE }
|
|
|
|
},
|
2018-03-02 16:14:57 +00:00
|
|
|
},
|
2019-03-05 18:03:30 +00:00
|
|
|
[VCS3] = {
|
|
|
|
.hw_id = VCS3_HW,
|
2018-03-02 16:14:57 +00:00
|
|
|
.class = VIDEO_DECODE_CLASS,
|
|
|
|
.instance = 3,
|
2018-03-14 18:26:50 +00:00
|
|
|
.mmio_bases = {
|
|
|
|
{ .gen = 11, .base = GEN11_BSD4_RING_BASE }
|
|
|
|
},
|
2018-03-02 16:14:57 +00:00
|
|
|
},
|
2019-03-05 18:03:30 +00:00
|
|
|
[VECS0] = {
|
|
|
|
.hw_id = VECS0_HW,
|
2017-04-10 14:34:29 +00:00
|
|
|
.class = VIDEO_ENHANCEMENT_CLASS,
|
|
|
|
.instance = 0,
|
2018-03-14 18:26:50 +00:00
|
|
|
.mmio_bases = {
|
|
|
|
{ .gen = 11, .base = GEN11_VEBOX_RING_BASE },
|
|
|
|
{ .gen = 7, .base = VEBOX_RING_BASE }
|
|
|
|
},
|
2016-07-13 15:03:40 +00:00
|
|
|
},
|
2019-03-05 18:03:30 +00:00
|
|
|
[VECS1] = {
|
|
|
|
.hw_id = VECS1_HW,
|
2018-03-02 16:14:57 +00:00
|
|
|
.class = VIDEO_ENHANCEMENT_CLASS,
|
|
|
|
.instance = 1,
|
2018-03-14 18:26:50 +00:00
|
|
|
.mmio_bases = {
|
|
|
|
{ .gen = 11, .base = GEN11_VEBOX2_RING_BASE }
|
|
|
|
},
|
2018-03-02 16:14:57 +00:00
|
|
|
},
|
2016-07-13 15:03:40 +00:00
|
|
|
};
|
|
|
|
|
2017-04-28 07:53:36 +00:00
|
|
|
/**
|
2019-05-27 18:35:59 +00:00
|
|
|
* intel_engine_context_size() - return the size of the context for an engine
|
2017-04-28 07:53:36 +00:00
|
|
|
* @dev_priv: i915 device private
|
|
|
|
* @class: engine class
|
|
|
|
*
|
|
|
|
* Each engine class may require a different amount of space for a context
|
|
|
|
* image.
|
|
|
|
*
|
|
|
|
* Return: size (in bytes) of an engine class specific context image
|
|
|
|
*
|
|
|
|
* Note: this size includes the HWSP, which is part of the context image
|
|
|
|
* in LRC mode, but does not include the "shared data page" used with
|
|
|
|
* GuC submission. The caller should account for this if using the GuC.
|
|
|
|
*/
|
2019-05-27 18:35:59 +00:00
|
|
|
u32 intel_engine_context_size(struct drm_i915_private *dev_priv, u8 class)
|
2017-04-28 07:53:36 +00:00
|
|
|
{
|
|
|
|
u32 cxt_size;
|
|
|
|
|
|
|
|
BUILD_BUG_ON(I915_GTT_PAGE_SIZE != PAGE_SIZE);
|
|
|
|
|
|
|
|
switch (class) {
|
|
|
|
case RENDER_CLASS:
|
|
|
|
switch (INTEL_GEN(dev_priv)) {
|
|
|
|
default:
|
|
|
|
MISSING_CASE(INTEL_GEN(dev_priv));
|
2018-01-11 22:55:06 +00:00
|
|
|
return DEFAULT_LR_CONTEXT_RENDER_SIZE;
|
2019-08-17 09:38:48 +00:00
|
|
|
case 12:
|
2018-01-11 22:55:07 +00:00
|
|
|
case 11:
|
|
|
|
return GEN11_LR_CONTEXT_RENDER_SIZE;
|
2017-07-06 21:06:24 +00:00
|
|
|
case 10:
|
2017-09-21 23:19:49 +00:00
|
|
|
return GEN10_LR_CONTEXT_RENDER_SIZE;
|
2017-04-28 07:53:36 +00:00
|
|
|
case 9:
|
|
|
|
return GEN9_LR_CONTEXT_RENDER_SIZE;
|
|
|
|
case 8:
|
2017-11-20 20:55:00 +00:00
|
|
|
return GEN8_LR_CONTEXT_RENDER_SIZE;
|
2017-04-28 07:53:36 +00:00
|
|
|
case 7:
|
|
|
|
if (IS_HASWELL(dev_priv))
|
|
|
|
return HSW_CXT_TOTAL_SIZE;
|
|
|
|
|
|
|
|
cxt_size = I915_READ(GEN7_CXT_SIZE);
|
|
|
|
return round_up(GEN7_CXT_TOTAL_SIZE(cxt_size) * 64,
|
|
|
|
PAGE_SIZE);
|
|
|
|
case 6:
|
|
|
|
cxt_size = I915_READ(CXT_SIZE);
|
|
|
|
return round_up(GEN6_CXT_TOTAL_SIZE(cxt_size) * 64,
|
|
|
|
PAGE_SIZE);
|
|
|
|
case 5:
|
drm/i915: Enable render context support for gen4 (Broadwater to Cantiga)
Broadwater and the rest of gen4 do support being able to saving and
reloading context specific registers between contexts, providing isolation
of the basic GPU state (as programmable by userspace). This allows
userspace to assume that the GPU retains their state from one batch to the
next, minimising the amount of state it needs to reload and manually save
across batches.
v2: CONSTANT_BUFFER woes
Running through piglit turned up an interesting issue, a GPU hang inside
the context load. The context image includes the CONSTANT_BUFFER command
that loads an address into a on-gpu buffer, and the context load was
executing that immediately. However, since it was reading from the GTT
there is no guarantee that the GTT retains the same configuration as
when the context was saved, resulting in stray reads and a GPU hang.
Having tried issuing a CONSTANT_BUFFER (to disable the command) from the
ring before saving the context to no avail, we resort to patching out
the instruction inside the context image before loading.
This does impose that gen4 always reissues CONSTANT_BUFFER commands on
each batch, but due to the use of a shared GTT that was and will remain
a requirement.
v3: ECOSKPD to the rescue
Ville found the magic bit in the ECOSKPD to disable saving and restoring
the CONSTANT_BUFFER from the context image, thereby completely avoiding
the GPU hangs from chasing invalid pointers. This appears to be the
default behaviour for gen5, and so we just need to tweak gen4 to match.
v4: Fix spelling of ECOSKPD and discover it already exists
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190419172720.5462-1-chris@chris-wilson.co.uk
2019-04-19 17:27:20 +00:00
|
|
|
case 4:
|
2019-04-19 11:17:48 +00:00
|
|
|
/*
|
|
|
|
* There is a discrepancy here between the size reported
|
|
|
|
* by the register and the size of the context layout
|
|
|
|
* in the docs. Both are described as authorative!
|
|
|
|
*
|
|
|
|
* The discrepancy is on the order of a few cachelines,
|
|
|
|
* but the total is under one page (4k), which is our
|
|
|
|
* minimum allocation anyway so it should all come
|
|
|
|
* out in the wash.
|
|
|
|
*/
|
|
|
|
cxt_size = I915_READ(CXT_SIZE) + 1;
|
|
|
|
DRM_DEBUG_DRIVER("gen%d CXT_SIZE = %d bytes [0x%08x]\n",
|
|
|
|
INTEL_GEN(dev_priv),
|
|
|
|
cxt_size * 64,
|
|
|
|
cxt_size - 1);
|
|
|
|
return round_up(cxt_size * 64, PAGE_SIZE);
|
2017-04-28 07:53:36 +00:00
|
|
|
case 3:
|
|
|
|
case 2:
|
|
|
|
/* For the special day when i810 gets merged. */
|
|
|
|
case 1:
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
MISSING_CASE(class);
|
2018-06-28 22:35:41 +00:00
|
|
|
/* fall through */
|
2017-04-28 07:53:36 +00:00
|
|
|
case VIDEO_DECODE_CLASS:
|
|
|
|
case VIDEO_ENHANCEMENT_CLASS:
|
|
|
|
case COPY_ENGINE_CLASS:
|
|
|
|
if (INTEL_GEN(dev_priv) < 8)
|
|
|
|
return 0;
|
|
|
|
return GEN8_LR_CONTEXT_OTHER_SIZE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-03-14 18:26:50 +00:00
|
|
|
static u32 __engine_mmio_base(struct drm_i915_private *i915,
|
|
|
|
const struct engine_mmio_base *bases)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < MAX_MMIO_BASES; i++)
|
|
|
|
if (INTEL_GEN(i915) >= bases[i].gen)
|
|
|
|
break;
|
|
|
|
|
|
|
|
GEM_BUG_ON(i == MAX_MMIO_BASES);
|
|
|
|
GEM_BUG_ON(!bases[i].base);
|
|
|
|
|
|
|
|
return bases[i].base;
|
|
|
|
}
|
|
|
|
|
2019-08-07 11:04:31 +00:00
|
|
|
static void __sprint_engine_name(struct intel_engine_cs *engine)
|
2018-03-14 18:26:51 +00:00
|
|
|
{
|
2019-08-07 11:04:31 +00:00
|
|
|
/*
|
|
|
|
* Before we know what the uABI name for this engine will be,
|
|
|
|
* we still would like to keep track of this engine in the debug logs.
|
|
|
|
* We throw in a ' here as a reminder that this isn't its final name.
|
|
|
|
*/
|
|
|
|
GEM_WARN_ON(snprintf(engine->name, sizeof(engine->name), "%s'%u",
|
|
|
|
intel_engine_class_repr(engine->class),
|
|
|
|
engine->instance) >= sizeof(engine->name));
|
2018-03-14 18:26:51 +00:00
|
|
|
}
|
|
|
|
|
2018-12-18 10:27:12 +00:00
|
|
|
/*
 * Write @mask into the engine's RING_HWSTAM register.
 *
 * Nothing is written for non-render engines before gen6. Before gen3 the
 * register is written with a 16-bit access, otherwise with the default
 * ENGINE_WRITE() access.
 */
void intel_engine_set_hwsp_writemask(struct intel_engine_cs *engine, u32 mask)
{
	/*
	 * Though they added more rings on g4x/ilk, they did not add
	 * per-engine HWSTAM until gen6.
	 */
	if (INTEL_GEN(engine->i915) < 6 && engine->class != RENDER_CLASS)
		return;

	if (INTEL_GEN(engine->i915) >= 3)
		ENGINE_WRITE(engine, RING_HWSTAM, mask);
	else
		ENGINE_WRITE16(engine, RING_HWSTAM, mask);
}
|
|
|
|
|
|
|
|
/* Set the engine's HWSP write mask to all ones. */
static void intel_engine_sanitize_mmio(struct intel_engine_cs *engine)
{
	/* Mask off all writes into the unknown HWSP */
	intel_engine_set_hwsp_writemask(engine, ~0u);
}
|
|
|
|
|
2019-08-06 12:43:00 +00:00
|
|
|
static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id)
|
2016-07-13 15:03:40 +00:00
|
|
|
{
|
|
|
|
const struct engine_info *info = &intel_engines[id];
|
drm/i915: Allocate intel_engine_cs structure only for the enabled engines
With the possibility of addition of many more number of rings in future,
the drm_i915_private structure could bloat as an array, of type
intel_engine_cs, is embedded inside it.
struct intel_engine_cs engine[I915_NUM_ENGINES];
Though this is still fine as generally there is only a single instance of
drm_i915_private structure used, but not all of the possible rings would be
enabled or active on most of the platforms. Some memory can be saved by
allocating intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.
There is a text size reduction of 928 bytes, from 1028200 to 1027272, for
i915.o file (but for i915.ko file text size remain same as 1193131 bytes).
v2:
- Remove the engine iterator field added in drm_i915_private structure,
instead pass a local iterator variable to the for_each_engine**
macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
NULL pointer check on engine pointer. (Chris)
v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
engine specific init is done later in Driver load sequence.
v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().
v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
allocation of intel_engine_cs structure. (Chris)
v6:
- Rebase.
v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.
v8: Rebase.
v9: Rebase.
v10:
- For index calculation use engine ID instead of pointer based arithmetic in
intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
check for NULL engine pointer in cleanup() routines. (Joonas)
v11: Rebase.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
2016-10-13 17:14:48 +00:00
|
|
|
struct intel_engine_cs *engine;
|
|
|
|
|
2018-03-02 16:14:58 +00:00
|
|
|
BUILD_BUG_ON(MAX_ENGINE_CLASS >= BIT(GEN11_ENGINE_CLASS_WIDTH));
|
|
|
|
BUILD_BUG_ON(MAX_ENGINE_INSTANCE >= BIT(GEN11_ENGINE_INSTANCE_WIDTH));
|
|
|
|
|
2018-10-12 06:31:42 +00:00
|
|
|
if (GEM_DEBUG_WARN_ON(info->class > MAX_ENGINE_CLASS))
|
drm/i915/pmu: Expose a PMU interface for perf queries
From: Chris Wilson <chris@chris-wilson.co.uk>
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
From: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
The first goal is to be able to measure GPU (and individual ring) busyness
without having to poll registers from userspace. (Which not only incurs
holding the forcewake lock indefinitely, perturbing the system, but also
runs the risk of hanging the machine.) As an alternative we can use the
perf event counter interface to sample the ring registers periodically
and send those results to userspace.
Functionality we are exporting to userspace is via the existing perf PMU
API and can be exercised via the existing tools. For example:
perf stat -a -e i915/rcs0-busy/ -I 1000
Will print the render engine busyness once per second. All the performance
counters can be enumerated (perf list) and have their unit of measure
correctly reported in sysfs.
v1-v2 (Chris Wilson):
v2: Use a common timer for the ring sampling.
v3: (Tvrtko Ursulin)
* Decouple uAPI from i915 engine ids.
* Complete uAPI defines.
* Refactor some code to helpers for clarity.
* Skip sampling disabled engines.
* Expose counters in sysfs.
* Pass in fake regs to avoid null ptr deref in perf core.
* Convert to class/instance uAPI.
* Use shared driver code for rc6 residency, power and frequency.
v4: (Dmitry Rogozhkin)
* Register PMU with .task_ctx_nr=perf_invalid_context
* Expose cpumask for the PMU with the single CPU in the mask
* Properly support pmu->stop(): it should call pmu->read()
* Properly support pmu->del(): it should call stop(event, PERF_EF_UPDATE)
* Introduce refcounting of event subscriptions.
* Make pmu.busy_stats a refcounter to avoid busy stats going away
with some deleted event.
* Expose cpumask for i915 PMU to avoid multiple events creation of
the same type followed by counter aggregation by perf-stat.
* Track CPUs getting online/offline to migrate perf context. If (likely)
cpumask will initially set CPU0, CONFIG_BOOTPARAM_HOTPLUG_CPU0 will be
needed to see effect of CPU status tracking.
* End result is that only global events are supported and perf stat
works correctly.
* Deny perf driver level sampling - it is prohibited for uncore PMU.
v5: (Tvrtko Ursulin)
* Don't hardcode number of engine samplers.
* Rewrite event ref-counting for correctness and simplicity.
* Store initial counter value when starting already enabled events
to correctly report values to all listeners.
* Fix RC6 residency readout.
* Comments, GPL header.
v6:
* Add missing entry to v4 changelog.
* Fix accounting in CPU hotplug case by copying the approach from
arch/x86/events/intel/cstate.c. (Dmitry Rogozhkin)
v7:
* Log failure message only on failure.
* Remove CPU hotplug notification state on unregister.
v8:
* Fix error unwind on failed registration.
* Checkpatch cleanup.
v9:
* Drop the energy metric, it is available via intel_rapl_perf.
(Ville Syrjälä)
* Use HAS_RC6(p). (Chris Wilson)
* Handle unsupported non-engine events. (Dmitry Rogozhkin)
* Rebase for intel_rc6_residency_ns needing caller managed
runtime pm.
* Drop HAS_RC6 checks from the read callback since creating those
events will be rejected at init time already.
* Add counter units to sysfs so perf stat output is nicer.
* Cleanup the attribute tables for brevity and readability.
v10:
* Fixed queued accounting.
v11:
* Move intel_engine_lookup_user to intel_engine_cs.c
* Commit update. (Joonas Lahtinen)
v12:
* More accurate sampling. (Chris Wilson)
* Store and report frequency in MHz for better usability from
perf stat.
* Removed metrics: queued, interrupts, rc6 counters.
* Sample engine busyness based on seqno difference only
for less MMIO (and forcewake) on all platforms. (Chris Wilson)
v13:
* Comment spelling, use mul_u32_u32 to work around potential GCC
issue and some code alignment changes. (Chris Wilson)
v14:
* Rebase.
v15:
* Rebase for RPS refactoring.
v16:
* Use the dynamic slot in the CPU hotplug state machine so that we are
free to setup our state as multi-instance. Previously we were re-using
the CPUHP_AP_PERF_X86_UNCORE_ONLINE slot which is neither used as
multi-instance, nor owned by our driver to start with.
* Register the CPU hotplug handlers after the PMU, otherwise the callback
will get called before the PMU is initialized which can end up in
perf_pmu_migrate_context with an un-initialized base.
* Added workaround for a probable bug in cpuhp core.
v17:
* Remove workaround for the cpuhp bug.
v18:
* Rebase for drm_i915_gem_engine_class getting upstream before us.
v19:
* Rebase. (trivial)
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171121181852.16128-2-tvrtko.ursulin@linux.intel.com
2017-11-21 18:18:45 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
2018-10-12 06:31:42 +00:00
|
|
|
if (GEM_DEBUG_WARN_ON(info->instance > MAX_ENGINE_INSTANCE))
|
drm/i915/pmu: Expose a PMU interface for perf queries
From: Chris Wilson <chris@chris-wilson.co.uk>
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
From: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
The first goal is to be able to measure GPU (and individual ring) busyness
without having to poll registers from userspace. (Which not only incurs
holding the forcewake lock indefinitely, perturbing the system, but also
runs the risk of hanging the machine.) As an alternative we can use the
perf event counter interface to sample the ring registers periodically
and send those results to userspace.
Functionality we are exporting to userspace is via the existing perf PMU
API and can be exercised via the existing tools. For example:
perf stat -a -e i915/rcs0-busy/ -I 1000
Will print the render engine busyness once per second. All the performance
counters can be enumerated (perf list) and have their unit of measure
correctly reported in sysfs.
v1-v2 (Chris Wilson):
v2: Use a common timer for the ring sampling.
v3: (Tvrtko Ursulin)
* Decouple uAPI from i915 engine ids.
* Complete uAPI defines.
* Refactor some code to helpers for clarity.
* Skip sampling disabled engines.
* Expose counters in sysfs.
* Pass in fake regs to avoid null ptr deref in perf core.
* Convert to class/instance uAPI.
* Use shared driver code for rc6 residency, power and frequency.
v4: (Dmitry Rogozhkin)
* Register PMU with .task_ctx_nr=perf_invalid_context
* Expose cpumask for the PMU with the single CPU in the mask
* Properly support pmu->stop(): it should call pmu->read()
* Properly support pmu->del(): it should call stop(event, PERF_EF_UPDATE)
* Introduce refcounting of event subscriptions.
* Make pmu.busy_stats a refcounter to avoid busy stats going away
with some deleted event.
* Expose cpumask for i915 PMU to avoid multiple events creation of
the same type followed by counter aggregation by perf-stat.
* Track CPUs getting online/offline to migrate perf context. If (likely)
cpumask will initially set CPU0, CONFIG_BOOTPARAM_HOTPLUG_CPU0 will be
needed to see effect of CPU status tracking.
* End result is that only global events are supported and perf stat
works correctly.
* Deny perf driver level sampling - it is prohibited for uncore PMU.
v5: (Tvrtko Ursulin)
* Don't hardcode number of engine samplers.
* Rewrite event ref-counting for correctness and simplicity.
* Store initial counter value when starting already enabled events
to correctly report values to all listeners.
* Fix RC6 residency readout.
* Comments, GPL header.
v6:
* Add missing entry to v4 changelog.
* Fix accounting in CPU hotplug case by copying the approach from
arch/x86/events/intel/cstate.c. (Dmitry Rogozhkin)
v7:
* Log failure message only on failure.
* Remove CPU hotplug notification state on unregister.
v8:
* Fix error unwind on failed registration.
* Checkpatch cleanup.
v9:
* Drop the energy metric, it is available via intel_rapl_perf.
(Ville Syrjälä)
* Use HAS_RC6(p). (Chris Wilson)
* Handle unsupported non-engine events. (Dmitry Rogozhkin)
* Rebase for intel_rc6_residency_ns needing caller managed
runtime pm.
* Drop HAS_RC6 checks from the read callback since creating those
events will be rejected at init time already.
* Add counter units to sysfs so perf stat output is nicer.
* Cleanup the attribute tables for brevity and readability.
v10:
* Fixed queued accounting.
v11:
* Move intel_engine_lookup_user to intel_engine_cs.c
* Commit update. (Joonas Lahtinen)
v12:
* More accurate sampling. (Chris Wilson)
* Store and report frequency in MHz for better usability from
perf stat.
* Removed metrics: queued, interrupts, rc6 counters.
* Sample engine busyness based on seqno difference only
for less MMIO (and forcewake) on all platforms. (Chris Wilson)
v13:
* Comment spelling, use mul_u32_u32 to work around potential GCC
issue and some code alignment changes. (Chris Wilson)
v14:
* Rebase.
v15:
* Rebase for RPS refactoring.
v16:
* Use the dynamic slot in the CPU hotplug state machine so that we are
free to setup our state as multi-instance. Previously we were re-using
the CPUHP_AP_PERF_X86_UNCORE_ONLINE slot which is neither used as
multi-instance, nor owned by our driver to start with.
* Register the CPU hotplug handlers after the PMU, otherwise the callback
will get called before the PMU is initialized which can end up in
perf_pmu_migrate_context with an un-initialized base.
* Added workaround for a probable bug in cpuhp core.
v17:
* Remove workaround for the cpuhp bug.
v18:
* Rebase for drm_i915_gem_engine_class getting upstream before us.
v19:
* Rebase. (trivial)
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171121181852.16128-2-tvrtko.ursulin@linux.intel.com
2017-11-21 18:18:45 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
2019-08-06 12:43:00 +00:00
|
|
|
if (GEM_DEBUG_WARN_ON(gt->engine_class[info->class][info->instance]))
|
drm/i915/pmu: Expose a PMU interface for perf queries
From: Chris Wilson <chris@chris-wilson.co.uk>
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
From: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
The first goal is to be able to measure GPU (and individual ring) busyness
without having to poll registers from userspace. (Which not only incurs
holding the forcewake lock indefinitely, perturbing the system, but also
runs the risk of hanging the machine.) As an alternative we can use the
perf event counter interface to sample the ring registers periodically
and send those results to userspace.
Functionality we are exporting to userspace is via the existing perf PMU
API and can be exercised via the existing tools. For example:
perf stat -a -e i915/rcs0-busy/ -I 1000
Will print the render engine busyness once per second. All the performance
counters can be enumerated (perf list) and have their unit of measure
correctly reported in sysfs.
v1-v2 (Chris Wilson):
v2: Use a common timer for the ring sampling.
v3: (Tvrtko Ursulin)
* Decouple uAPI from i915 engine ids.
* Complete uAPI defines.
* Refactor some code to helpers for clarity.
* Skip sampling disabled engines.
* Expose counters in sysfs.
* Pass in fake regs to avoid null ptr deref in perf core.
* Convert to class/instance uAPI.
* Use shared driver code for rc6 residency, power and frequency.
v4: (Dmitry Rogozhkin)
* Register PMU with .task_ctx_nr=perf_invalid_context
* Expose cpumask for the PMU with the single CPU in the mask
* Properly support pmu->stop(): it should call pmu->read()
* Properly support pmu->del(): it should call stop(event, PERF_EF_UPDATE)
* Introduce refcounting of event subscriptions.
* Make pmu.busy_stats a refcounter to avoid busy stats going away
with some deleted event.
* Expose cpumask for i915 PMU to avoid multiple events creation of
the same type followed by counter aggregation by perf-stat.
* Track CPUs getting online/offline to migrate perf context. If (likely)
cpumask will initially set CPU0, CONFIG_BOOTPARAM_HOTPLUG_CPU0 will be
needed to see effect of CPU status tracking.
* End result is that only global events are supported and perf stat
works correctly.
* Deny perf driver level sampling - it is prohibited for uncore PMU.
v5: (Tvrtko Ursulin)
* Don't hardcode number of engine samplers.
* Rewrite event ref-counting for correctness and simplicity.
* Store initial counter value when starting already enabled events
to correctly report values to all listeners.
* Fix RC6 residency readout.
* Comments, GPL header.
v6:
* Add missing entry to v4 changelog.
* Fix accounting in CPU hotplug case by copying the approach from
arch/x86/events/intel/cstate.c. (Dmitry Rogozhkin)
v7:
* Log failure message only on failure.
* Remove CPU hotplug notification state on unregister.
v8:
* Fix error unwind on failed registration.
* Checkpatch cleanup.
v9:
* Drop the energy metric, it is available via intel_rapl_perf.
(Ville Syrjälä)
* Use HAS_RC6(p). (Chris Wilson)
* Handle unsupported non-engine events. (Dmitry Rogozhkin)
* Rebase for intel_rc6_residency_ns needing caller managed
runtime pm.
* Drop HAS_RC6 checks from the read callback since creating those
events will be rejected at init time already.
* Add counter units to sysfs so perf stat output is nicer.
* Cleanup the attribute tables for brevity and readability.
v10:
* Fixed queued accounting.
v11:
* Move intel_engine_lookup_user to intel_engine_cs.c
* Commit update. (Joonas Lahtinen)
v12:
* More accurate sampling. (Chris Wilson)
* Store and report frequency in MHz for better usability from
perf stat.
* Removed metrics: queued, interrupts, rc6 counters.
* Sample engine busyness based on seqno difference only
for less MMIO (and forcewake) on all platforms. (Chris Wilson)
v13:
* Comment spelling, use mul_u32_u32 to work around potential GCC
issue and some code alignment changes. (Chris Wilson)
v14:
* Rebase.
v15:
* Rebase for RPS refactoring.
v16:
* Use the dynamic slot in the CPU hotplug state machine so that we are
free to setup our state as multi-instance. Previously we were re-using
the CPUHP_AP_PERF_X86_UNCORE_ONLINE slot which is neither used as
multi-instance, nor owned by our driver to start with.
* Register the CPU hotplug handlers after the PMU, otherwise the callback
will get called before the PMU is initialized which can end up in
perf_pmu_migrate_context with an un-initialized base.
* Added workaround for a probable bug in cpuhp core.
v17:
* Remove workaround for the cpuhp bug.
v18:
* Rebase for drm_i915_gem_engine_class getting upstream before us.
v19:
* Rebase. (trivial)
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171121181852.16128-2-tvrtko.ursulin@linux.intel.com
2017-11-21 18:18:45 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
drm/i915: Allocate intel_engine_cs structure only for the enabled engines
With the possibility of addition of many more number of rings in future,
the drm_i915_private structure could bloat as an array, of type
intel_engine_cs, is embedded inside it.
struct intel_engine_cs engine[I915_NUM_ENGINES];
Though this is still fine as generally there is only a single instance of
drm_i915_private structure used, but not all of the possible rings would be
enabled or active on most of the platforms. Some memory can be saved by
allocating intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.
There is a text size reduction of 928 bytes, from 1028200 to 1027272, for
i915.o file (but for i915.ko file text size remain same as 1193131 bytes).
v2:
- Remove the engine iterator field added in drm_i915_private structure,
instead pass a local iterator variable to the for_each_engine**
macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
NULL pointer check on engine pointer. (Chris)
v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
engine specific init is done later in Driver load sequence.
v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().
v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
allocation of intel_engine_cs structure. (Chris)
v6:
- Rebase.
v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.
v8: Rebase.
v9: Rebase.
v10:
- For index calculation use engine ID instead of pointer based arithmetic in
intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
check for NULL engine pointer in cleanup() routines. (Joonas)
v11: Rebase.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
2016-10-13 17:14:48 +00:00
|
|
|
engine = kzalloc(sizeof(*engine), GFP_KERNEL);
|
|
|
|
if (!engine)
|
|
|
|
return -ENOMEM;
|
2016-07-13 15:03:40 +00:00
|
|
|
|
2019-03-05 18:03:30 +00:00
|
|
|
BUILD_BUG_ON(BITS_PER_TYPE(engine->mask) < I915_NUM_ENGINES);
|
|
|
|
|
2016-07-13 15:03:40 +00:00
|
|
|
engine->id = id;
|
2019-03-05 18:03:30 +00:00
|
|
|
engine->mask = BIT(id);
|
2019-08-06 12:43:00 +00:00
|
|
|
engine->i915 = gt->i915;
|
|
|
|
engine->gt = gt;
|
|
|
|
engine->uncore = gt->uncore;
|
2016-08-16 16:04:20 +00:00
|
|
|
engine->hw_id = engine->guc_id = info->hw_id;
|
2019-08-06 12:43:00 +00:00
|
|
|
engine->mmio_base = __engine_mmio_base(gt->i915, info->mmio_bases);
|
2019-08-07 11:04:31 +00:00
|
|
|
|
2017-04-10 14:34:29 +00:00
|
|
|
engine->class = info->class;
|
|
|
|
engine->instance = info->instance;
|
2019-08-07 11:04:31 +00:00
|
|
|
__sprint_engine_name(engine);
|
2016-07-13 15:03:40 +00:00
|
|
|
|
2019-05-01 10:32:04 +00:00
|
|
|
/*
|
|
|
|
* To be overridden by the backend on setup. However to facilitate
|
|
|
|
* cleanup on error during setup, we always provide the destroy vfunc.
|
|
|
|
*/
|
|
|
|
engine->destroy = (typeof(engine->destroy))kfree;
|
|
|
|
|
2019-08-06 12:43:00 +00:00
|
|
|
engine->context_size = intel_engine_context_size(gt->i915,
|
2019-05-27 18:35:59 +00:00
|
|
|
engine->class);
|
2017-04-28 07:53:36 +00:00
|
|
|
if (WARN_ON(engine->context_size > BIT(20)))
|
|
|
|
engine->context_size = 0;
|
2018-07-06 10:14:41 +00:00
|
|
|
if (engine->context_size)
|
2019-08-06 12:43:00 +00:00
|
|
|
DRIVER_CAPS(gt->i915)->has_logical_contexts = true;
|
2017-04-28 07:53:36 +00:00
|
|
|
|
2016-11-14 20:41:01 +00:00
|
|
|
/* Nothing to do here, execute in order of dependencies */
|
|
|
|
engine->schedule = NULL;
|
|
|
|
|
2018-04-26 07:47:16 +00:00
|
|
|
seqlock_init(&engine->stats.lock);
|
2017-11-21 18:18:48 +00:00
|
|
|
|
2017-03-13 02:47:11 +00:00
|
|
|
ATOMIC_INIT_NOTIFIER_HEAD(&engine->context_status_notifier);
|
|
|
|
|
2018-12-18 10:27:12 +00:00
|
|
|
/* Scrub mmio state on takeover */
|
|
|
|
intel_engine_sanitize_mmio(engine);
|
|
|
|
|
2019-08-06 12:43:00 +00:00
|
|
|
gt->engine_class[info->class][info->instance] = engine;
|
|
|
|
|
|
|
|
intel_engine_add_user(engine);
|
|
|
|
gt->i915->engine[id] = engine;
|
|
|
|
|
drm/i915: Allocate intel_engine_cs structure only for the enabled engines
With the possibility of addition of many more number of rings in future,
the drm_i915_private structure could bloat as an array, of type
intel_engine_cs, is embedded inside it.
struct intel_engine_cs engine[I915_NUM_ENGINES];
Though this is still fine as generally there is only a single instance of
drm_i915_private structure used, but not all of the possible rings would be
enabled or active on most of the platforms. Some memory can be saved by
allocating intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.
There is a text size reduction of 928 bytes, from 1028200 to 1027272, for
i915.o file (but for i915.ko file text size remain same as 1193131 bytes).
v2:
- Remove the engine iterator field added in drm_i915_private structure,
instead pass a local iterator variable to the for_each_engine**
macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
NULL pointer check on engine pointer. (Chris)
v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
engine specific init is done later in Driver load sequence.
v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().
v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
allocation of intel_engine_cs structure. (Chris)
v6:
- Rebase.
v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.
v8: Rebase.
v9: Rebase.
v10:
- For index calculation use engine ID instead of pointer based arithmetic in
intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
check for NULL engine pointer in cleanup() routines. (Joonas)
v11: Rebase.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
2016-10-13 17:14:48 +00:00
|
|
|
return 0;
|
2016-07-13 15:03:40 +00:00
|
|
|
}
|
|
|
|
|
2019-05-22 09:00:54 +00:00
|
|
|
static void __setup_engine_capabilities(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
struct drm_i915_private *i915 = engine->i915;
|
|
|
|
|
|
|
|
if (engine->class == VIDEO_DECODE_CLASS) {
|
|
|
|
/*
|
|
|
|
* HEVC support is present on first engine instance
|
|
|
|
* before Gen11 and on all instances afterwards.
|
|
|
|
*/
|
|
|
|
if (INTEL_GEN(i915) >= 11 ||
|
|
|
|
(INTEL_GEN(i915) >= 9 && engine->instance == 0))
|
|
|
|
engine->uabi_capabilities |=
|
|
|
|
I915_VIDEO_CLASS_CAPABILITY_HEVC;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* SFC block is present only on even logical engine
|
|
|
|
* instances.
|
|
|
|
*/
|
|
|
|
if ((INTEL_GEN(i915) >= 11 &&
|
|
|
|
RUNTIME_INFO(i915)->vdbox_sfc_access & engine->mask) ||
|
|
|
|
(INTEL_GEN(i915) >= 9 && engine->instance == 0))
|
|
|
|
engine->uabi_capabilities |=
|
|
|
|
I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC;
|
|
|
|
} else if (engine->class == VIDEO_ENHANCEMENT_CLASS) {
|
|
|
|
if (INTEL_GEN(i915) >= 9)
|
|
|
|
engine->uabi_capabilities |=
|
|
|
|
I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void intel_setup_engine_capabilities(struct drm_i915_private *i915)
|
|
|
|
{
|
|
|
|
struct intel_engine_cs *engine;
|
|
|
|
enum intel_engine_id id;
|
|
|
|
|
|
|
|
for_each_engine(engine, i915, id)
|
|
|
|
__setup_engine_capabilities(engine);
|
|
|
|
}
|
|
|
|
|
2019-05-01 10:32:04 +00:00
|
|
|
/**
|
|
|
|
* intel_engines_cleanup() - free the resources allocated for Command Streamers
|
|
|
|
* @i915: the i915 devic
|
|
|
|
*/
|
|
|
|
void intel_engines_cleanup(struct drm_i915_private *i915)
|
|
|
|
{
|
|
|
|
struct intel_engine_cs *engine;
|
|
|
|
enum intel_engine_id id;
|
|
|
|
|
|
|
|
for_each_engine(engine, i915, id) {
|
|
|
|
engine->destroy(engine);
|
|
|
|
i915->engine[id] = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-13 15:03:40 +00:00
|
|
|
/**
|
2017-04-28 07:53:36 +00:00
|
|
|
* intel_engines_init_mmio() - allocate and prepare the Engine Command Streamers
|
2019-05-01 10:32:04 +00:00
|
|
|
* @i915: the i915 device
|
2016-07-13 15:03:40 +00:00
|
|
|
*
|
|
|
|
* Return: non-zero if the initialization failed.
|
|
|
|
*/
|
2019-05-01 10:32:04 +00:00
|
|
|
int intel_engines_init_mmio(struct drm_i915_private *i915)
|
2016-07-13 15:03:40 +00:00
|
|
|
{
|
2019-05-01 10:32:04 +00:00
|
|
|
struct intel_device_info *device_info = mkwrite_device_info(i915);
|
|
|
|
const unsigned int engine_mask = INTEL_INFO(i915)->engine_mask;
|
2017-04-11 16:56:58 +00:00
|
|
|
unsigned int mask = 0;
|
2016-07-13 15:03:40 +00:00
|
|
|
unsigned int i;
|
2017-01-24 11:01:34 +00:00
|
|
|
int err;
|
2016-07-13 15:03:40 +00:00
|
|
|
|
2019-03-05 18:03:30 +00:00
|
|
|
WARN_ON(engine_mask == 0);
|
|
|
|
WARN_ON(engine_mask &
|
2018-09-26 10:47:07 +00:00
|
|
|
GENMASK(BITS_PER_TYPE(mask) - 1, I915_NUM_ENGINES));
|
2016-07-13 15:03:40 +00:00
|
|
|
|
2019-08-02 18:40:50 +00:00
|
|
|
if (i915_inject_probe_failure(i915))
|
2018-10-11 13:00:08 +00:00
|
|
|
return -ENODEV;
|
|
|
|
|
2016-07-13 15:03:40 +00:00
|
|
|
for (i = 0; i < ARRAY_SIZE(intel_engines); i++) {
|
2019-05-01 10:32:04 +00:00
|
|
|
if (!HAS_ENGINE(i915, i))
|
2016-07-13 15:03:40 +00:00
|
|
|
continue;
|
|
|
|
|
2019-08-06 12:43:00 +00:00
|
|
|
err = intel_engine_setup(&i915->gt, i);
|
2017-01-24 11:01:34 +00:00
|
|
|
if (err)
|
|
|
|
goto cleanup;
|
|
|
|
|
2019-03-05 18:03:30 +00:00
|
|
|
mask |= BIT(i);
|
2017-01-24 11:01:34 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Catch failures to update intel_engines table when the new engines
|
|
|
|
* are added to the driver by a warning and disabling the forgotten
|
|
|
|
* engines.
|
|
|
|
*/
|
2019-03-05 18:03:30 +00:00
|
|
|
if (WARN_ON(mask != engine_mask))
|
|
|
|
device_info->engine_mask = mask;
|
2017-01-24 11:01:34 +00:00
|
|
|
|
2019-05-01 10:32:04 +00:00
|
|
|
RUNTIME_INFO(i915)->num_engines = hweight32(mask);
|
2017-01-24 11:01:34 +00:00
|
|
|
|
2019-06-21 07:07:44 +00:00
|
|
|
intel_gt_check_and_clear_faults(&i915->gt);
|
2017-11-11 00:44:47 +00:00
|
|
|
|
2019-05-22 09:00:54 +00:00
|
|
|
intel_setup_engine_capabilities(i915);
|
|
|
|
|
2017-01-24 11:01:34 +00:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
cleanup:
|
2019-05-01 10:32:04 +00:00
|
|
|
intel_engines_cleanup(i915);
|
2017-01-24 11:01:34 +00:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2017-04-28 07:53:36 +00:00
|
|
|
* intel_engines_init() - init the Engine Command Streamers
|
2019-04-26 16:33:33 +00:00
|
|
|
* @i915: i915 device private
|
2017-01-24 11:01:34 +00:00
|
|
|
*
|
|
|
|
* Return: non-zero if the initialization failed.
|
|
|
|
*/
|
2019-04-26 16:33:33 +00:00
|
|
|
int intel_engines_init(struct drm_i915_private *i915)
|
2017-01-24 11:01:34 +00:00
|
|
|
{
|
2019-04-26 16:33:33 +00:00
|
|
|
int (*init)(struct intel_engine_cs *engine);
|
2017-01-24 11:01:34 +00:00
|
|
|
struct intel_engine_cs *engine;
|
2019-05-01 10:32:04 +00:00
|
|
|
enum intel_engine_id id;
|
2017-06-16 13:03:38 +00:00
|
|
|
int err;
|
2017-01-24 11:01:34 +00:00
|
|
|
|
2019-04-26 16:33:33 +00:00
|
|
|
if (HAS_EXECLISTS(i915))
|
|
|
|
init = intel_execlists_submission_init;
|
|
|
|
else
|
|
|
|
init = intel_ring_submission_init;
|
2017-06-16 13:03:38 +00:00
|
|
|
|
2019-04-26 16:33:33 +00:00
|
|
|
for_each_engine(engine, i915, id) {
|
2017-01-24 11:01:34 +00:00
|
|
|
err = init(engine);
|
2017-06-16 13:03:38 +00:00
|
|
|
if (err)
|
2016-07-13 15:03:40 +00:00
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
cleanup:
|
2019-05-01 10:32:04 +00:00
|
|
|
intel_engines_cleanup(i915);
|
2017-01-24 11:01:34 +00:00
|
|
|
return err;
|
2016-07-13 15:03:40 +00:00
|
|
|
}
|
|
|
|
|
drm/i915: Invert the GEM wakeref hierarchy
In the current scheme, on submitting a request we take a single global
GEM wakeref, which trickles down to wake up all GT power domains. This
is undesirable as we would like to be able to localise our power
management to the available power domains and to remove the global GEM
operations from the heart of the driver. (The intent there is to push
global GEM decisions to the boundary as used by the GEM user interface.)
Now during request construction, each request is responsible via its
logical context to acquire a wakeref on each power domain it intends to
utilize. Currently, each request takes a wakeref on the engine(s) and
the engines themselves take a chipset wakeref. This gives us a
transition on each engine which we can extend if we want to insert more
power management control (such as soft rc6). The global GEM operations
that currently require a struct_mutex are reduced to listening to pm
events from the chipset GT wakeref. As we reduce the struct_mutex
requirement, these listeners should evaporate.
Perhaps the biggest immediate change is that this removes the
struct_mutex requirement around GT power management, allowing us greater
flexibility in request construction. Another important knock-on effect,
is that by tracking engine usage, we can insert a switch back to the
kernel context on that engine immediately, avoiding any extra delay or
inserting global synchronisation barriers. This makes tracking when an
engine and its associated contexts are idle much easier -- important for
when we forgo our assumed execution ordering and need idle barriers to
unpin used contexts. In the process, it means we remove a large chunk of
code whose only purpose was to switch back to the kernel context.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Imre Deak <imre.deak@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190424200717.1686-5-chris@chris-wilson.co.uk
2019-04-24 20:07:17 +00:00
|
|
|
void intel_engine_init_execlists(struct intel_engine_cs *engine)
|
2017-09-22 12:43:04 +00:00
|
|
|
{
|
|
|
|
struct intel_engine_execlists * const execlists = &engine->execlists;
|
|
|
|
|
2017-09-22 12:43:07 +00:00
|
|
|
execlists->port_mask = 1;
|
2018-10-16 12:29:38 +00:00
|
|
|
GEM_BUG_ON(!is_power_of_2(execlists_num_ports(execlists)));
|
2017-09-22 12:43:07 +00:00
|
|
|
GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS);
|
|
|
|
|
drm/i915/execlists: Preempt-to-busy
When using a global seqno, we required a precise stop-the-workd event to
handle preemption and unwind the global seqno counter. To accomplish
this, we would preempt to a special out-of-band context and wait for the
machine to report that it was idle. Given an idle machine, we could very
precisely see which requests had completed and which we needed to feed
back into the run queue.
However, now that we have scrapped the global seqno, we no longer need
to precisely unwind the global counter and only track requests by their
per-context seqno. This allows us to loosely unwind inflight requests
while scheduling a preemption, with the enormous caveat that the
requests we put back on the run queue are still _inflight_ (until the
preemption request is complete). This makes request tracking much more
messy, as at any point then we can see a completed request that we
believe is not currently scheduled for execution. We also have to be
careful not to rewind RING_TAIL past RING_HEAD on preempting to the
running context, and for this we use a semaphore to prevent completion
of the request before continuing.
To accomplish this feat, we change how we track requests scheduled to
the HW. Instead of appending our requests onto a single list as we
submit, we track each submission to ELSP as its own block. Then upon
receiving the CS preemption event, we promote the pending block to the
inflight block (discarding what was previously being tracked). As normal
CS completion events arrive, we then remove stale entries from the
inflight tracker.
v2: Be a tinge paranoid and ensure we flush the write into the HWS page
for the GPU semaphore to pick in a timely fashion.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190620142052.19311-1-chris@chris-wilson.co.uk
2019-06-20 14:20:51 +00:00
|
|
|
memset(execlists->pending, 0, sizeof(execlists->pending));
|
|
|
|
execlists->active =
|
|
|
|
memset(execlists->inflight, 0, sizeof(execlists->inflight));
|
|
|
|
|
2019-01-29 18:54:51 +00:00
|
|
|
execlists->queue_priority_hint = INT_MIN;
|
2018-06-29 07:53:20 +00:00
|
|
|
execlists->queue = RB_ROOT_CACHED;
|
2017-09-22 12:43:04 +00:00
|
|
|
}
|
|
|
|
|
2018-09-03 15:23:03 +00:00
|
|
|
static void cleanup_status_page(struct intel_engine_cs *engine)
|
2017-09-13 08:56:02 +00:00
|
|
|
{
|
2019-01-28 10:23:55 +00:00
|
|
|
struct i915_vma *vma;
|
|
|
|
|
2018-12-18 10:27:12 +00:00
|
|
|
/* Prevent writes into HWSP after returning the page to the system */
|
|
|
|
intel_engine_set_hwsp_writemask(engine, ~0u);
|
|
|
|
|
2019-01-28 10:23:55 +00:00
|
|
|
vma = fetch_and_zero(&engine->status_page.vma);
|
|
|
|
if (!vma)
|
|
|
|
return;
|
2017-09-13 08:56:02 +00:00
|
|
|
|
2019-01-28 10:23:55 +00:00
|
|
|
if (!HWS_NEEDS_PHYSICAL(engine->i915))
|
|
|
|
i915_vma_unpin(vma);
|
|
|
|
|
|
|
|
i915_gem_object_unpin_map(vma->obj);
|
2019-05-28 09:29:56 +00:00
|
|
|
i915_gem_object_put(vma->obj);
|
2019-01-28 10:23:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int pin_ggtt_status_page(struct intel_engine_cs *engine,
|
|
|
|
struct i915_vma *vma)
|
|
|
|
{
|
|
|
|
unsigned int flags;
|
|
|
|
|
|
|
|
flags = PIN_GLOBAL;
|
|
|
|
if (!HAS_LLC(engine->i915))
|
|
|
|
/*
|
|
|
|
* On g33, we cannot place HWS above 256MiB, so
|
|
|
|
* restrict its pinning to the low mappable arena.
|
|
|
|
* Though this restriction is not documented for
|
|
|
|
* gen4, gen5, or byt, they also behave similarly
|
|
|
|
* and hang if the HWS is placed at the top of the
|
|
|
|
* GTT. To generalise, it appears that all !llc
|
|
|
|
* platforms have issues with us placing the HWS
|
|
|
|
* above the mappable region (even though we never
|
|
|
|
* actually map it).
|
|
|
|
*/
|
|
|
|
flags |= PIN_MAPPABLE;
|
|
|
|
else
|
|
|
|
flags |= PIN_HIGH;
|
2017-09-13 08:56:02 +00:00
|
|
|
|
2019-01-28 10:23:55 +00:00
|
|
|
return i915_vma_pin(vma, 0, 0, flags);
|
2017-09-13 08:56:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int init_status_page(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
struct drm_i915_gem_object *obj;
|
|
|
|
struct i915_vma *vma;
|
|
|
|
void *vaddr;
|
|
|
|
int ret;
|
|
|
|
|
2019-01-28 10:23:55 +00:00
|
|
|
/*
|
|
|
|
* Though the HWS register does support 36bit addresses, historically
|
|
|
|
* we have had hangs and corruption reported due to wild writes if
|
|
|
|
* the HWS is placed above 4G. We only allow objects to be allocated
|
|
|
|
* in GFP_DMA32 for i965, and no earlier physical address users had
|
|
|
|
* access to more than 4G.
|
|
|
|
*/
|
2017-09-13 08:56:02 +00:00
|
|
|
obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE);
|
|
|
|
if (IS_ERR(obj)) {
|
|
|
|
DRM_ERROR("Failed to allocate status page\n");
|
|
|
|
return PTR_ERR(obj);
|
|
|
|
}
|
|
|
|
|
drm/i915: Flush pages on acquisition
When we return pages to the system, we ensure that they are marked as
being in the CPU domain since any external access is uncontrolled and we
must assume the worst. This means that we need to always flush the pages
on acquisition if we need to use them on the GPU, and from the beginning
have used set-domain. Set-domain is overkill for the purpose as it is a
general synchronisation barrier, but our intent is to only flush the
pages being swapped in. If we move that flush into the pages acquisition
phase, we know then that when we have obj->mm.pages, they are coherent
with the GPU and need only maintain that status without resorting to
heavy handed use of set-domain.
The principle knock-on effect for userspace is through mmap-gtt
pagefaulting. Our uAPI has always implied that the GTT mmap was async
(especially as when any pagefault occurs is unpredicatable to userspace)
and so userspace had to apply explicit domain control itself
(set-domain). However, swapping is transparent to the kernel, and so on
first fault we need to acquire the pages and make them coherent for
access through the GTT. Our use of set-domain here leaks into the uABI
that the first pagefault was synchronous. This is unintentional and
baring a few igt should be unoticed, nevertheless we bump the uABI
version for mmap-gtt to reflect the change in behaviour.
Another implication of the change is that gem_create() is presumed to
create an object that is coherent with the CPU and is in the CPU write
domain, so a set-domain(CPU) following a gem_create() would be a minor
operation that merely checked whether we could allocate all pages for
the object. On applying this change, a set-domain(CPU) causes a clflush
as we acquire the pages. This will have a small impact on mesa as we move
the clflush here on !llc from execbuf time to create, but that should
have minimal performance impact as the same clflush exists but is now
done early and because of the clflush issue, userspace recycles bo and
so should resist allocating fresh objects.
Internally, the presumption that objects are created in the CPU
write-domain and remain so through writes to obj->mm.mapping is more
prevalent than I expected; but easy enough to catch and apply a manual
flush.
For the future, we should push the page flush from the central
set_pages() into the callers so that we can more finely control when it
is applied, but for now doing it one location is easier to validate, at
the cost of sometimes flushing when there is no need.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Matthew Auld <matthew.william.auld@gmail.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Antonio Argenziano <antonio.argenziano@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Reviewed-by: Matthew Auld <matthew.william.auld@gmail.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190321161908.8007-1-chris@chris-wilson.co.uk
2019-03-21 16:19:07 +00:00
|
|
|
i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC);
|
2017-09-13 08:56:02 +00:00
|
|
|
|
2019-06-21 07:08:08 +00:00
|
|
|
vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
|
2017-09-13 08:56:02 +00:00
|
|
|
if (IS_ERR(vma)) {
|
|
|
|
ret = PTR_ERR(vma);
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
|
|
|
|
vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
|
|
|
|
if (IS_ERR(vaddr)) {
|
|
|
|
ret = PTR_ERR(vaddr);
|
2019-01-28 10:23:55 +00:00
|
|
|
goto err;
|
2017-09-13 08:56:02 +00:00
|
|
|
}
|
|
|
|
|
2019-01-28 10:23:55 +00:00
|
|
|
engine->status_page.addr = memset(vaddr, 0, PAGE_SIZE);
|
2017-09-13 08:56:02 +00:00
|
|
|
engine->status_page.vma = vma;
|
2019-01-28 10:23:55 +00:00
|
|
|
|
|
|
|
if (!HWS_NEEDS_PHYSICAL(engine->i915)) {
|
|
|
|
ret = pin_ggtt_status_page(engine, vma);
|
|
|
|
if (ret)
|
|
|
|
goto err_unpin;
|
|
|
|
}
|
|
|
|
|
2017-09-13 08:56:02 +00:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
err_unpin:
|
2019-01-28 10:23:55 +00:00
|
|
|
i915_gem_object_unpin_map(obj);
|
2017-09-13 08:56:02 +00:00
|
|
|
err:
|
|
|
|
i915_gem_object_put(obj);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2019-04-26 16:33:33 +00:00
|
|
|
static int intel_engine_setup_common(struct intel_engine_cs *engine)
|
2019-01-28 18:18:09 +00:00
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
drm/i915: Keep contexts pinned until after the next kernel context switch
We need to keep the context image pinned in memory until after the GPU
has finished writing into it. Since it continues to write as we signal
the final breadcrumb, we need to keep it pinned until the request after
it is complete. Currently we know the order in which requests execute on
each engine, and so to remove that presumption we need to identify a
request/context-switch we know must occur after our completion. Any
request queued after the signal must imply a context switch, for
simplicity we use a fresh request from the kernel context.
The sequence of operations for keeping the context pinned until saved is:
- On context activation, we preallocate a node for each physical engine
the context may operate on. This is to avoid allocations during
unpinning, which may be from inside FS_RECLAIM context (aka the
shrinker)
- On context deactivation on retirement of the last active request (which
is before we know the context has been saved), we add the
preallocated node onto a barrier list on each engine
- On engine idling, we emit a switch to kernel context. When this
switch completes, we know that all previous contexts must have been
saved, and so on retiring this request we can finally unpin all the
contexts that were marked as deactivated prior to the switch.
We can enhance this in future by flushing all the idle contexts on a
regular heartbeat pulse of a switch to kernel context, which will also
be used to check for hung engines.
v2: intel_context_active_acquire/_release
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190614164606.15633-1-chris@chris-wilson.co.uk
2019-06-14 16:46:04 +00:00
|
|
|
init_llist_head(&engine->barrier_tasks);
|
|
|
|
|
2019-01-28 18:18:09 +00:00
|
|
|
err = init_status_page(engine);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2019-06-14 16:46:06 +00:00
|
|
|
intel_engine_init_active(engine, ENGINE_PHYSICAL);
|
drm/i915: Replace global breadcrumbs with per-context interrupt tracking
A few years ago, see commit 688e6c725816 ("drm/i915: Slaughter the
thundering i915_wait_request herd"), the issue of handling multiple
clients waiting in parallel was brought to our attention. The
requirement was that every client should be woken immediately upon its
request being signaled, without incurring any cpu overhead.
To handle certain fragility of our hw meant that we could not do a
simple check inside the irq handler (some generations required almost
unbounded delays before we could be sure of seqno coherency) and so
request completion checking required delegation.
Before commit 688e6c725816, the solution was simple. Every client
waiting on a request would be woken on every interrupt and each would do
a heavyweight check to see if their request was complete. Commit
688e6c725816 introduced an rbtree so that only the earliest waiter on
the global timeline would woken, and would wake the next and so on.
(Along with various complications to handle requests being reordered
along the global timeline, and also a requirement for kthread to provide
a delegate for fence signaling that had no process context.)
The global rbtree depends on knowing the execution timeline (and global
seqno). Without knowing that order, we must instead check all contexts
queued to the HW to see which may have advanced. We trim that list by
only checking queued contexts that are being waited on, but still we
keep a list of all active contexts and their active signalers that we
inspect from inside the irq handler. By moving the waiters onto the fence
signal list, we can combine the client wakeup with the dma_fence
signaling (a dramatic reduction in complexity, but does require the HW
being coherent, the seqno must be visible from the cpu before the
interrupt is raised - we keep a timer backup just in case).
Having previously fixed all the issues with irq-seqno serialisation (by
inserting delays onto the GPU after each request instead of random delays
on the CPU after each interrupt), we can rely on the seqno state to
perfom direct wakeups from the interrupt handler. This allows us to
preserve our single context switch behaviour of the current routine,
with the only downside that we lose the RT priority sorting of wakeups.
In general, direct wakeup latency of multiple clients is about the same
(about 10% better in most cases) with a reduction in total CPU time spent
in the waiter (about 20-50% depending on gen). Average herd behaviour is
improved, but at the cost of not delegating wakeups on task_prio.
v2: Capture fence signaling state for error state and add comments to
warm even the most cold of hearts.
v3: Check if the request is still active before busywaiting
v4: Reduce the amount of pointer misdirection with list_for_each_safe
and using a local i915_request variable inside the loops
v5: Add a missing pluralisation to a purely informative selftest message.
References: 688e6c725816 ("drm/i915: Slaughter the thundering i915_wait_request herd")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190129205230.19056-2-chris@chris-wilson.co.uk
2019-01-29 20:52:29 +00:00
|
|
|
intel_engine_init_breadcrumbs(engine);
|
drm/i915: Invert the GEM wakeref hierarchy
In the current scheme, on submitting a request we take a single global
GEM wakeref, which trickles down to wake up all GT power domains. This
is undesirable as we would like to be able to localise our power
management to the available power domains and to remove the global GEM
operations from the heart of the driver. (The intent there is to push
global GEM decisions to the boundary as used by the GEM user interface.)
Now during request construction, each request is responsible via its
logical context to acquire a wakeref on each power domain it intends to
utilize. Currently, each request takes a wakeref on the engine(s) and
the engines themselves take a chipset wakeref. This gives us a
transition on each engine which we can extend if we want to insert more
powermangement control (such as soft rc6). The global GEM operations
that currently require a struct_mutex are reduced to listening to pm
events from the chipset GT wakeref. As we reduce the struct_mutex
requirement, these listeners should evaporate.
Perhaps the biggest immediate change is that this removes the
struct_mutex requirement around GT power management, allowing us greater
flexibility in request construction. Another important knock-on effect,
is that by tracking engine usage, we can insert a switch back to the
kernel context on that engine immediately, avoiding any extra delay or
inserting global synchronisation barriers. This makes tracking when an
engine and its associated contexts are idle much easier -- important for
when we forgo our assumed execution ordering and need idle barriers to
unpin used contexts. In the process, it means we remove a large chunk of
code whose only purpose was to switch back to the kernel context.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Imre Deak <imre.deak@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190424200717.1686-5-chris@chris-wilson.co.uk
2019-04-24 20:07:17 +00:00
|
|
|
intel_engine_init_execlists(engine);
|
2019-01-28 18:18:09 +00:00
|
|
|
intel_engine_init_hangcheck(engine);
|
|
|
|
intel_engine_init_cmd_parser(engine);
|
drm/i915: Invert the GEM wakeref hierarchy
In the current scheme, on submitting a request we take a single global
GEM wakeref, which trickles down to wake up all GT power domains. This
is undesirable as we would like to be able to localise our power
management to the available power domains and to remove the global GEM
operations from the heart of the driver. (The intent there is to push
global GEM decisions to the boundary as used by the GEM user interface.)
Now during request construction, each request is responsible via its
logical context to acquire a wakeref on each power domain it intends to
utilize. Currently, each request takes a wakeref on the engine(s) and
the engines themselves take a chipset wakeref. This gives us a
transition on each engine which we can extend if we want to insert more
powermangement control (such as soft rc6). The global GEM operations
that currently require a struct_mutex are reduced to listening to pm
events from the chipset GT wakeref. As we reduce the struct_mutex
requirement, these listeners should evaporate.
Perhaps the biggest immediate change is that this removes the
struct_mutex requirement around GT power management, allowing us greater
flexibility in request construction. Another important knock-on effect,
is that by tracking engine usage, we can insert a switch back to the
kernel context on that engine immediately, avoiding any extra delay or
inserting global synchronisation barriers. This makes tracking when an
engine and its associated contexts are idle much easier -- important for
when we forgo our assumed execution ordering and need idle barriers to
unpin used contexts. In the process, it means we remove a large chunk of
code whose only purpose was to switch back to the kernel context.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Imre Deak <imre.deak@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190424200717.1686-5-chris@chris-wilson.co.uk
2019-04-24 20:07:17 +00:00
|
|
|
intel_engine_init__pm(engine);
|
2019-01-28 18:18:09 +00:00
|
|
|
|
2019-08-04 12:48:26 +00:00
|
|
|
intel_engine_pool_init(&engine->pool);
|
|
|
|
|
2019-04-24 09:51:34 +00:00
|
|
|
/* Use the whole device by default */
|
|
|
|
engine->sseu =
|
|
|
|
intel_sseu_from_device_info(&RUNTIME_INFO(engine->i915)->sseu);
|
|
|
|
|
2019-07-03 13:58:05 +00:00
|
|
|
intel_engine_init_workarounds(engine);
|
|
|
|
intel_engine_init_whitelist(engine);
|
|
|
|
intel_engine_init_ctx_wa(engine);
|
|
|
|
|
2019-01-28 18:18:09 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-04-26 16:33:33 +00:00
|
|
|
/**
|
|
|
|
* intel_engines_setup- setup engine state not requiring hw access
|
|
|
|
* @i915: Device to setup.
|
|
|
|
*
|
|
|
|
* Initializes engine structure members shared between legacy and execlists
|
|
|
|
* submission modes which do not require hardware access.
|
|
|
|
*
|
|
|
|
* Typically done early in the submission mode specific engine setup stage.
|
|
|
|
*/
|
|
|
|
int intel_engines_setup(struct drm_i915_private *i915)
|
|
|
|
{
|
|
|
|
int (*setup)(struct intel_engine_cs *engine);
|
|
|
|
struct intel_engine_cs *engine;
|
|
|
|
enum intel_engine_id id;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
if (HAS_EXECLISTS(i915))
|
|
|
|
setup = intel_execlists_submission_setup;
|
|
|
|
else
|
|
|
|
setup = intel_ring_submission_setup;
|
|
|
|
|
|
|
|
for_each_engine(engine, i915, id) {
|
|
|
|
err = intel_engine_setup_common(engine);
|
|
|
|
if (err)
|
|
|
|
goto cleanup;
|
|
|
|
|
|
|
|
err = setup(engine);
|
|
|
|
if (err)
|
|
|
|
goto cleanup;
|
|
|
|
|
2019-05-01 10:32:04 +00:00
|
|
|
/* We expect the backend to take control over its state */
|
|
|
|
GEM_BUG_ON(engine->destroy == (typeof(engine->destroy))kfree);
|
|
|
|
|
2019-04-26 16:33:33 +00:00
|
|
|
GEM_BUG_ON(!engine->cops);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
cleanup:
|
2019-05-01 10:32:04 +00:00
|
|
|
intel_engines_cleanup(i915);
|
2019-04-26 16:33:33 +00:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2019-01-25 10:05:20 +00:00
|
|
|
struct measure_breadcrumb {
|
|
|
|
struct i915_request rq;
|
2019-06-21 07:08:10 +00:00
|
|
|
struct intel_timeline timeline;
|
2019-01-25 10:05:20 +00:00
|
|
|
struct intel_ring ring;
|
|
|
|
u32 cs[1024];
|
|
|
|
};
|
|
|
|
|
2019-01-25 12:00:04 +00:00
|
|
|
static int measure_breadcrumb_dw(struct intel_engine_cs *engine)
|
2019-01-25 10:05:20 +00:00
|
|
|
{
|
|
|
|
struct measure_breadcrumb *frame;
|
2019-01-28 18:18:09 +00:00
|
|
|
int dw = -ENOMEM;
|
2019-01-25 10:05:20 +00:00
|
|
|
|
2019-06-21 07:08:11 +00:00
|
|
|
GEM_BUG_ON(!engine->gt->scratch);
|
2019-01-25 10:05:20 +00:00
|
|
|
|
|
|
|
frame = kzalloc(sizeof(*frame), GFP_KERNEL);
|
|
|
|
if (!frame)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2019-06-21 07:08:10 +00:00
|
|
|
if (intel_timeline_init(&frame->timeline,
|
|
|
|
engine->gt,
|
|
|
|
engine->status_page.vma))
|
2019-01-28 18:18:09 +00:00
|
|
|
goto out_frame;
|
2019-01-25 10:05:20 +00:00
|
|
|
|
|
|
|
frame->ring.vaddr = frame->cs;
|
|
|
|
frame->ring.size = sizeof(frame->cs);
|
|
|
|
frame->ring.effective_size = frame->ring.size;
|
|
|
|
intel_ring_update_space(&frame->ring);
|
|
|
|
|
|
|
|
frame->rq.i915 = engine->i915;
|
|
|
|
frame->rq.engine = engine;
|
|
|
|
frame->rq.ring = &frame->ring;
|
|
|
|
frame->rq.timeline = &frame->timeline;
|
|
|
|
|
2019-06-21 07:08:10 +00:00
|
|
|
dw = intel_timeline_pin(&frame->timeline);
|
2019-01-28 18:18:11 +00:00
|
|
|
if (dw < 0)
|
|
|
|
goto out_timeline;
|
|
|
|
|
2019-01-29 18:54:50 +00:00
|
|
|
dw = engine->emit_fini_breadcrumb(&frame->rq, frame->cs) - frame->cs;
|
2019-05-08 08:06:25 +00:00
|
|
|
GEM_BUG_ON(dw & 1); /* RING_TAIL must be qword aligned */
|
2019-01-25 10:05:20 +00:00
|
|
|
|
2019-06-21 07:08:10 +00:00
|
|
|
intel_timeline_unpin(&frame->timeline);
|
2019-01-25 10:05:20 +00:00
|
|
|
|
2019-01-28 18:18:11 +00:00
|
|
|
out_timeline:
|
2019-06-21 07:08:10 +00:00
|
|
|
intel_timeline_fini(&frame->timeline);
|
2019-01-28 18:18:09 +00:00
|
|
|
out_frame:
|
|
|
|
kfree(frame);
|
2019-01-25 10:05:20 +00:00
|
|
|
return dw;
|
|
|
|
}
|
|
|
|
|
2019-06-14 16:46:06 +00:00
|
|
|
void
|
|
|
|
intel_engine_init_active(struct intel_engine_cs *engine, unsigned int subclass)
|
|
|
|
{
|
|
|
|
INIT_LIST_HEAD(&engine->active.requests);
|
|
|
|
|
|
|
|
spin_lock_init(&engine->active.lock);
|
|
|
|
lockdep_set_subclass(&engine->active.lock, subclass);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Due to an interesting quirk in lockdep's internal debug tracking,
|
|
|
|
* after setting a subclass we must ensure the lock is used. Otherwise,
|
|
|
|
* nr_unused_locks is incremented once too often.
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
|
|
|
local_irq_disable();
|
|
|
|
lock_map_acquire(&engine->active.lock.dep_map);
|
|
|
|
lock_map_release(&engine->active.lock.dep_map);
|
|
|
|
local_irq_enable();
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2019-08-08 11:06:11 +00:00
|
|
|
static struct intel_context *
|
|
|
|
create_kernel_context(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
struct intel_context *ce;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
ce = intel_context_create(engine->i915->kernel_context, engine);
|
|
|
|
if (IS_ERR(ce))
|
|
|
|
return ce;
|
|
|
|
|
2019-08-09 18:25:17 +00:00
|
|
|
ce->ring = __intel_context_ring_size(SZ_4K);
|
|
|
|
|
2019-08-08 11:06:11 +00:00
|
|
|
err = intel_context_pin(ce);
|
|
|
|
if (err) {
|
|
|
|
intel_context_put(ce);
|
|
|
|
return ERR_PTR(err);
|
|
|
|
}
|
|
|
|
|
|
|
|
return ce;
|
|
|
|
}
|
|
|
|
|
2016-07-13 15:03:41 +00:00
|
|
|
/**
|
|
|
|
* intel_engines_init_common - initialize cengine state which might require hw access
|
|
|
|
* @engine: Engine to initialize.
|
|
|
|
*
|
|
|
|
* Initializes @engine@ structure members shared between legacy and execlists
|
|
|
|
* submission modes which do require hardware access.
|
|
|
|
*
|
|
|
|
* Typcally done at later stages of submission mode specific engine setup.
|
|
|
|
*
|
|
|
|
* Returns zero on success or an error code on failure.
|
|
|
|
*/
|
|
|
|
int intel_engine_init_common(struct intel_engine_cs *engine)
|
|
|
|
{
|
2019-08-08 11:06:11 +00:00
|
|
|
struct intel_context *ce;
|
2016-07-13 15:03:41 +00:00
|
|
|
int ret;
|
|
|
|
|
2019-07-09 09:12:33 +00:00
|
|
|
engine->set_default_submission(engine);
|
|
|
|
|
2019-08-08 11:06:11 +00:00
|
|
|
/*
|
|
|
|
* We may need to do things with the shrinker which
|
drm/i915: Unify active context tracking between legacy/execlists/guc
The requests conversion introduced a nasty bug where we could generate a
new request in the middle of constructing a request if we needed to idle
the system in order to evict space for a context. The request to idle
would be executed (and waited upon) before the current one, creating a
minor havoc in the seqno accounting, as we will consider the current
request to already be completed (prior to deferred seqno assignment) but
ring->last_retired_head would have been updated and still could allow
us to overwrite the current request before execution.
We also employed two different mechanisms to track the active context
until it was switched out. The legacy method allowed for waiting upon an
active context (it could forcibly evict any vma, including context's),
but the execlists method took a step backwards by pinning the vma for
the entire active lifespan of the context (the only way to evict was to
idle the entire GPU, not individual contexts). However, to circumvent
the tricky issue of locking (i.e. we cannot take struct_mutex at the
time of i915_gem_request_submit(), where we would want to move the
previous context onto the active tracker and unpin it), we take the
execlists approach and keep the contexts pinned until retirement.
The benefit of the execlists approach, more important for execlists than
legacy, was the reduction in work in pinning the context for each
request - as the context was kept pinned until idle, it could short
circuit the pinning for all active contexts.
We introduce new engine vfuncs to pin and unpin the context
respectively. The context is pinned at the start of the request, and
only unpinned when the following request is retired (this ensures that
the context is idle and coherent in main memory before we unpin it). We
move the engine->last_context tracking into the retirement itself
(rather than during request submission) in order to allow the submission
to be reordered or unwound without undue difficultly.
And finally an ulterior motive for unifying context handling was to
prepare for mock requests.
v2: Rename to last_retired_context, split out legacy_context tracking
for MI_SET_CONTEXT.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161218153724.8439-3-chris@chris-wilson.co.uk
2016-12-18 15:37:20 +00:00
|
|
|
* require us to immediately switch back to the default
|
|
|
|
* context. This can cause a problem as pinning the
|
|
|
|
* default context also requires GTT space which may not
|
|
|
|
* be available. To avoid this we always pin the default
|
|
|
|
* context.
|
|
|
|
*/
|
2019-08-08 11:06:11 +00:00
|
|
|
ce = create_kernel_context(engine);
|
|
|
|
if (IS_ERR(ce))
|
|
|
|
return PTR_ERR(ce);
|
|
|
|
|
|
|
|
engine->kernel_context = ce;
|
2016-07-13 15:03:41 +00:00
|
|
|
|
2019-01-25 12:00:04 +00:00
|
|
|
ret = measure_breadcrumb_dw(engine);
|
2019-01-25 10:05:20 +00:00
|
|
|
if (ret < 0)
|
2019-03-08 13:25:21 +00:00
|
|
|
goto err_unpin;
|
2019-01-25 10:05:20 +00:00
|
|
|
|
2019-01-29 18:54:50 +00:00
|
|
|
engine->emit_fini_breadcrumb_dw = ret;
|
2019-01-25 10:05:20 +00:00
|
|
|
|
2019-03-08 13:25:21 +00:00
|
|
|
return 0;
|
2018-05-17 21:26:32 +00:00
|
|
|
|
2019-03-08 13:25:21 +00:00
|
|
|
err_unpin:
|
2019-08-08 11:06:11 +00:00
|
|
|
intel_context_unpin(ce);
|
|
|
|
intel_context_put(ce);
|
drm/i915: Unify active context tracking between legacy/execlists/guc
The requests conversion introduced a nasty bug where we could generate a
new request in the middle of constructing a request if we needed to idle
the system in order to evict space for a context. The request to idle
would be executed (and waited upon) before the current one, creating a
minor havoc in the seqno accounting, as we will consider the current
request to already be completed (prior to deferred seqno assignment) but
ring->last_retired_head would have been updated and still could allow
us to overwrite the current request before execution.
We also employed two different mechanisms to track the active context
until it was switched out. The legacy method allowed for waiting upon an
active context (it could forcibly evict any vma, including context's),
but the execlists method took a step backwards by pinning the vma for
the entire active lifespan of the context (the only way to evict was to
idle the entire GPU, not individual contexts). However, to circumvent
the tricky issue of locking (i.e. we cannot take struct_mutex at the
time of i915_gem_request_submit(), where we would want to move the
previous context onto the active tracker and unpin it), we take the
execlists approach and keep the contexts pinned until retirement.
The benefit of the execlists approach, more important for execlists than
legacy, was the reduction in work in pinning the context for each
request - as the context was kept pinned until idle, it could short
circuit the pinning for all active contexts.
We introduce new engine vfuncs to pin and unpin the context
respectively. The context is pinned at the start of the request, and
only unpinned when the following request is retired (this ensures that
the context is idle and coherent in main memory before we unpin it). We
move the engine->last_context tracking into the retirement itself
(rather than during request submission) in order to allow the submission
to be reordered or unwound without undue difficultly.
And finally an ulterior motive for unifying context handling was to
prepare for mock requests.
v2: Rename to last_retired_context, split out legacy_context tracking
for MI_SET_CONTEXT.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161218153724.8439-3-chris@chris-wilson.co.uk
2016-12-18 15:37:20 +00:00
|
|
|
return ret;
|
2016-07-13 15:03:41 +00:00
|
|
|
}
|
2016-08-03 12:19:16 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* intel_engines_cleanup_common - cleans up the engine state created by
|
|
|
|
* the common initiailizers.
|
|
|
|
* @engine: Engine to cleanup.
|
|
|
|
*
|
|
|
|
* This cleans up everything created by the common helpers.
|
|
|
|
*/
|
|
|
|
void intel_engine_cleanup_common(struct intel_engine_cs *engine)
|
|
|
|
{
|
2019-06-14 16:46:06 +00:00
|
|
|
GEM_BUG_ON(!list_empty(&engine->active.requests));
|
|
|
|
|
2018-09-03 15:23:03 +00:00
|
|
|
cleanup_status_page(engine);
|
2017-09-13 08:56:02 +00:00
|
|
|
|
2019-08-04 12:48:26 +00:00
|
|
|
intel_engine_pool_fini(&engine->pool);
|
2016-08-03 12:19:16 +00:00
|
|
|
intel_engine_fini_breadcrumbs(engine);
|
2016-08-18 16:17:10 +00:00
|
|
|
intel_engine_cleanup_cmd_parser(engine);
|
drm/i915: Unify active context tracking between legacy/execlists/guc
The requests conversion introduced a nasty bug where we could generate a
new request in the middle of constructing a request if we needed to idle
the system in order to evict space for a context. The request to idle
would be executed (and waited upon) before the current one, creating a
minor havoc in the seqno accounting, as we will consider the current
request to already be completed (prior to deferred seqno assignment) but
ring->last_retired_head would have been updated and still could allow
us to overwrite the current request before execution.
We also employed two different mechanisms to track the active context
until it was switched out. The legacy method allowed for waiting upon an
active context (it could forcibly evict any vma, including context's),
but the execlists method took a step backwards by pinning the vma for
the entire active lifespan of the context (the only way to evict was to
idle the entire GPU, not individual contexts). However, to circumvent
the tricky issue of locking (i.e. we cannot take struct_mutex at the
time of i915_gem_request_submit(), where we would want to move the
previous context onto the active tracker and unpin it), we take the
execlists approach and keep the contexts pinned until retirement.
The benefit of the execlists approach, more important for execlists than
legacy, was the reduction in work in pinning the context for each
request - as the context was kept pinned until idle, it could short
circuit the pinning for all active contexts.
We introduce new engine vfuncs to pin and unpin the context
respectively. The context is pinned at the start of the request, and
only unpinned when the following request is retired (this ensures that
the context is idle and coherent in main memory before we unpin it). We
move the engine->last_context tracking into the retirement itself
(rather than during request submission) in order to allow the submission
to be reordered or unwound without undue difficultly.
And finally an ulterior motive for unifying context handling was to
prepare for mock requests.
v2: Rename to last_retired_context, split out legacy_context tracking
for MI_SET_CONTEXT.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161218153724.8439-3-chris@chris-wilson.co.uk
2016-12-18 15:37:20 +00:00
|
|
|
|
2017-11-10 14:26:33 +00:00
|
|
|
if (engine->default_state)
|
|
|
|
i915_gem_object_put(engine->default_state);
|
|
|
|
|
2019-03-08 13:25:21 +00:00
|
|
|
intel_context_unpin(engine->kernel_context);
|
2019-08-08 11:06:11 +00:00
|
|
|
intel_context_put(engine->kernel_context);
|
drm/i915: Keep contexts pinned until after the next kernel context switch
We need to keep the context image pinned in memory until after the GPU
has finished writing into it. Since it continues to write as we signal
the final breadcrumb, we need to keep it pinned until the request after
it is complete. Currently we know the order in which requests execute on
each engine, and so to remove that presumption we need to identify a
request/context-switch we know must occur after our completion. Any
request queued after the signal must imply a context switch, for
simplicity we use a fresh request from the kernel context.
The sequence of operations for keeping the context pinned until saved is:
- On context activation, we preallocate a node for each physical engine
the context may operate on. This is to avoid allocations during
unpinning, which may be from inside FS_RECLAIM context (aka the
shrinker)
- On context deactivation on retirement of the last active request (which
is before we know the context has been saved), we add the
preallocated node onto a barrier list on each engine
- On engine idling, we emit a switch to kernel context. When this
switch completes, we know that all previous contexts must have been
saved, and so on retiring this request we can finally unpin all the
contexts that were marked as deactivated prior to the switch.
We can enhance this in future by flushing all the idle contexts on a
regular heartbeat pulse of a switch to kernel context, which will also
be used to check for hung engines.
v2: intel_context_active_acquire/_release
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190614164606.15633-1-chris@chris-wilson.co.uk
2019-06-14 16:46:04 +00:00
|
|
|
GEM_BUG_ON(!llist_empty(&engine->barrier_tasks));
|
2018-05-02 16:38:39 +00:00
|
|
|
|
2018-12-03 13:33:57 +00:00
|
|
|
intel_wa_list_free(&engine->ctx_wa_list);
|
2018-12-03 13:33:41 +00:00
|
|
|
intel_wa_list_free(&engine->wa_list);
|
2018-12-03 12:50:12 +00:00
|
|
|
intel_wa_list_free(&engine->whitelist);
|
2016-08-03 12:19:16 +00:00
|
|
|
}
|
2016-10-04 20:11:31 +00:00
|
|
|
|
2018-02-12 10:24:15 +00:00
|
|
|
u64 intel_engine_get_active_head(const struct intel_engine_cs *engine)
|
2016-10-04 20:11:31 +00:00
|
|
|
{
|
2019-03-25 21:49:40 +00:00
|
|
|
struct drm_i915_private *i915 = engine->i915;
|
|
|
|
|
2016-10-04 20:11:31 +00:00
|
|
|
u64 acthd;
|
|
|
|
|
2019-03-25 21:49:40 +00:00
|
|
|
if (INTEL_GEN(i915) >= 8)
|
|
|
|
acthd = ENGINE_READ64(engine, RING_ACTHD, RING_ACTHD_UDW);
|
|
|
|
else if (INTEL_GEN(i915) >= 4)
|
|
|
|
acthd = ENGINE_READ(engine, RING_ACTHD);
|
2016-10-04 20:11:31 +00:00
|
|
|
else
|
2019-03-25 21:49:40 +00:00
|
|
|
acthd = ENGINE_READ(engine, ACTHD);
|
2016-10-04 20:11:31 +00:00
|
|
|
|
|
|
|
return acthd;
|
|
|
|
}
|
|
|
|
|
2018-02-12 10:24:15 +00:00
|
|
|
u64 intel_engine_get_last_batch_head(const struct intel_engine_cs *engine)
|
2016-10-04 20:11:31 +00:00
|
|
|
{
|
|
|
|
u64 bbaddr;
|
|
|
|
|
2019-03-25 21:49:40 +00:00
|
|
|
if (INTEL_GEN(engine->i915) >= 8)
|
|
|
|
bbaddr = ENGINE_READ64(engine, RING_BBADDR, RING_BBADDR_UDW);
|
2016-10-04 20:11:31 +00:00
|
|
|
else
|
2019-03-25 21:49:40 +00:00
|
|
|
bbaddr = ENGINE_READ(engine, RING_BBADDR);
|
2016-10-04 20:11:31 +00:00
|
|
|
|
|
|
|
return bbaddr;
|
|
|
|
}
|
2016-10-12 09:05:17 +00:00
|
|
|
|
2018-05-16 18:33:55 +00:00
|
|
|
int intel_engine_stop_cs(struct intel_engine_cs *engine)
|
|
|
|
{
|
2019-03-25 21:49:40 +00:00
|
|
|
struct intel_uncore *uncore = engine->uncore;
|
2018-05-16 18:33:55 +00:00
|
|
|
const u32 base = engine->mmio_base;
|
|
|
|
const i915_reg_t mode = RING_MI_MODE(base);
|
|
|
|
int err;
|
|
|
|
|
2019-03-25 21:49:38 +00:00
|
|
|
if (INTEL_GEN(engine->i915) < 3)
|
2018-05-16 18:33:55 +00:00
|
|
|
return -ENODEV;
|
|
|
|
|
|
|
|
GEM_TRACE("%s\n", engine->name);
|
|
|
|
|
2019-03-25 21:49:38 +00:00
|
|
|
intel_uncore_write_fw(uncore, mode, _MASKED_BIT_ENABLE(STOP_RING));
|
2018-05-16 18:33:55 +00:00
|
|
|
|
|
|
|
err = 0;
|
2019-03-25 21:49:38 +00:00
|
|
|
if (__intel_wait_for_register_fw(uncore,
|
2018-05-16 18:33:55 +00:00
|
|
|
mode, MODE_IDLE, MODE_IDLE,
|
|
|
|
1000, 0,
|
|
|
|
NULL)) {
|
|
|
|
GEM_TRACE("%s: timed out on STOP_RING -> IDLE\n", engine->name);
|
|
|
|
err = -ETIMEDOUT;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* A final mmio read to let GPU writes be hopefully flushed to memory */
|
2019-03-25 21:49:38 +00:00
|
|
|
intel_uncore_posting_read_fw(uncore, mode);
|
2018-05-16 18:33:55 +00:00
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2018-08-14 17:18:57 +00:00
|
|
|
void intel_engine_cancel_stop_cs(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
GEM_TRACE("%s\n", engine->name);
|
|
|
|
|
2019-03-25 21:49:40 +00:00
|
|
|
ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
|
2018-08-14 17:18:57 +00:00
|
|
|
}
|
|
|
|
|
2016-10-12 09:05:17 +00:00
|
|
|
const char *i915_cache_level_str(struct drm_i915_private *i915, int type)
|
|
|
|
{
|
|
|
|
switch (type) {
|
|
|
|
case I915_CACHE_NONE: return " uncached";
|
|
|
|
case I915_CACHE_LLC: return HAS_LLC(i915) ? " LLC" : " snooped";
|
|
|
|
case I915_CACHE_L3_LLC: return " L3+LLC";
|
|
|
|
case I915_CACHE_WT: return " WT";
|
|
|
|
default: return "";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-06-10 12:57:06 +00:00
|
|
|
static u32
|
|
|
|
read_subslice_reg(struct intel_engine_cs *engine, int slice, int subslice,
|
|
|
|
i915_reg_t reg)
|
2016-10-12 09:05:17 +00:00
|
|
|
{
|
2019-06-10 12:57:06 +00:00
|
|
|
struct drm_i915_private *i915 = engine->i915;
|
|
|
|
struct intel_uncore *uncore = engine->uncore;
|
2019-07-17 18:06:20 +00:00
|
|
|
u32 mcr_mask, mcr_ss, mcr, old_mcr, val;
|
2016-10-12 09:05:17 +00:00
|
|
|
enum forcewake_domains fw_domains;
|
|
|
|
|
2019-06-10 12:57:06 +00:00
|
|
|
if (INTEL_GEN(i915) >= 11) {
|
2019-07-17 18:06:20 +00:00
|
|
|
mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;
|
|
|
|
mcr_ss = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice);
|
2018-03-16 12:14:51 +00:00
|
|
|
} else {
|
2019-07-17 18:06:20 +00:00
|
|
|
mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;
|
|
|
|
mcr_ss = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice);
|
2018-03-16 12:14:51 +00:00
|
|
|
}
|
|
|
|
|
2019-03-25 21:49:37 +00:00
|
|
|
fw_domains = intel_uncore_forcewake_for_reg(uncore, reg,
|
2016-10-12 09:05:17 +00:00
|
|
|
FW_REG_READ);
|
2019-03-25 21:49:37 +00:00
|
|
|
fw_domains |= intel_uncore_forcewake_for_reg(uncore,
|
2016-10-12 09:05:17 +00:00
|
|
|
GEN8_MCR_SELECTOR,
|
|
|
|
FW_REG_READ | FW_REG_WRITE);
|
|
|
|
|
2019-03-25 21:49:37 +00:00
|
|
|
spin_lock_irq(&uncore->lock);
|
|
|
|
intel_uncore_forcewake_get__locked(uncore, fw_domains);
|
2016-10-12 09:05:17 +00:00
|
|
|
|
2019-07-17 18:06:20 +00:00
|
|
|
old_mcr = mcr = intel_uncore_read_fw(uncore, GEN8_MCR_SELECTOR);
|
2018-05-18 22:39:57 +00:00
|
|
|
|
2019-07-17 18:06:20 +00:00
|
|
|
mcr &= ~mcr_mask;
|
|
|
|
mcr |= mcr_ss;
|
2019-03-25 21:49:37 +00:00
|
|
|
intel_uncore_write_fw(uncore, GEN8_MCR_SELECTOR, mcr);
|
2016-10-12 09:05:17 +00:00
|
|
|
|
2019-07-17 18:06:20 +00:00
|
|
|
val = intel_uncore_read_fw(uncore, reg);
|
2016-10-12 09:05:17 +00:00
|
|
|
|
2019-07-17 18:06:20 +00:00
|
|
|
mcr &= ~mcr_mask;
|
|
|
|
mcr |= old_mcr & mcr_mask;
|
2018-05-18 22:39:57 +00:00
|
|
|
|
2019-03-25 21:49:37 +00:00
|
|
|
intel_uncore_write_fw(uncore, GEN8_MCR_SELECTOR, mcr);
|
2016-10-12 09:05:17 +00:00
|
|
|
|
2019-03-25 21:49:37 +00:00
|
|
|
intel_uncore_forcewake_put__locked(uncore, fw_domains);
|
|
|
|
spin_unlock_irq(&uncore->lock);
|
2016-10-12 09:05:17 +00:00
|
|
|
|
2019-07-17 18:06:20 +00:00
|
|
|
return val;
|
2016-10-12 09:05:17 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* NB: please notice the memset */
|
|
|
|
void intel_engine_get_instdone(struct intel_engine_cs *engine,
|
|
|
|
struct intel_instdone *instdone)
|
|
|
|
{
|
2019-06-10 12:57:06 +00:00
|
|
|
struct drm_i915_private *i915 = engine->i915;
|
2019-08-23 16:03:05 +00:00
|
|
|
const struct sseu_dev_info *sseu = &RUNTIME_INFO(i915)->sseu;
|
2019-03-25 21:49:40 +00:00
|
|
|
struct intel_uncore *uncore = engine->uncore;
|
2016-10-12 09:05:17 +00:00
|
|
|
u32 mmio_base = engine->mmio_base;
|
|
|
|
int slice;
|
|
|
|
int subslice;
|
|
|
|
|
|
|
|
memset(instdone, 0, sizeof(*instdone));
|
|
|
|
|
2019-06-10 12:57:06 +00:00
|
|
|
switch (INTEL_GEN(i915)) {
|
2016-10-12 09:05:17 +00:00
|
|
|
default:
|
2019-03-25 21:49:40 +00:00
|
|
|
instdone->instdone =
|
|
|
|
intel_uncore_read(uncore, RING_INSTDONE(mmio_base));
|
2016-10-12 09:05:17 +00:00
|
|
|
|
2019-03-05 18:03:30 +00:00
|
|
|
if (engine->id != RCS0)
|
2016-10-12 09:05:17 +00:00
|
|
|
break;
|
|
|
|
|
2019-03-25 21:49:40 +00:00
|
|
|
instdone->slice_common =
|
|
|
|
intel_uncore_read(uncore, GEN7_SC_INSTDONE);
|
2019-08-23 16:03:05 +00:00
|
|
|
for_each_instdone_slice_subslice(i915, sseu, slice, subslice) {
|
2016-10-12 09:05:17 +00:00
|
|
|
instdone->sampler[slice][subslice] =
|
2019-06-10 12:57:06 +00:00
|
|
|
read_subslice_reg(engine, slice, subslice,
|
2016-10-12 09:05:17 +00:00
|
|
|
GEN7_SAMPLER_INSTDONE);
|
|
|
|
instdone->row[slice][subslice] =
|
2019-06-10 12:57:06 +00:00
|
|
|
read_subslice_reg(engine, slice, subslice,
|
2016-10-12 09:05:17 +00:00
|
|
|
GEN7_ROW_INSTDONE);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case 7:
|
2019-03-25 21:49:40 +00:00
|
|
|
instdone->instdone =
|
|
|
|
intel_uncore_read(uncore, RING_INSTDONE(mmio_base));
|
2016-10-12 09:05:17 +00:00
|
|
|
|
2019-03-05 18:03:30 +00:00
|
|
|
if (engine->id != RCS0)
|
2016-10-12 09:05:17 +00:00
|
|
|
break;
|
|
|
|
|
2019-03-25 21:49:40 +00:00
|
|
|
instdone->slice_common =
|
|
|
|
intel_uncore_read(uncore, GEN7_SC_INSTDONE);
|
|
|
|
instdone->sampler[0][0] =
|
|
|
|
intel_uncore_read(uncore, GEN7_SAMPLER_INSTDONE);
|
|
|
|
instdone->row[0][0] =
|
|
|
|
intel_uncore_read(uncore, GEN7_ROW_INSTDONE);
|
2016-10-12 09:05:17 +00:00
|
|
|
|
|
|
|
break;
|
|
|
|
case 6:
|
|
|
|
case 5:
|
|
|
|
case 4:
|
2019-03-25 21:49:40 +00:00
|
|
|
instdone->instdone =
|
|
|
|
intel_uncore_read(uncore, RING_INSTDONE(mmio_base));
|
2019-03-05 18:03:30 +00:00
|
|
|
if (engine->id == RCS0)
|
2016-10-12 09:05:17 +00:00
|
|
|
/* HACK: Using the wrong struct member */
|
2019-03-25 21:49:40 +00:00
|
|
|
instdone->slice_common =
|
|
|
|
intel_uncore_read(uncore, GEN4_INSTDONE1);
|
2016-10-12 09:05:17 +00:00
|
|
|
break;
|
|
|
|
case 3:
|
|
|
|
case 2:
|
2019-03-25 21:49:40 +00:00
|
|
|
instdone->instdone = intel_uncore_read(uncore, GEN2_INSTDONE);
|
2016-10-12 09:05:17 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2017-02-13 17:15:14 +00:00
|
|
|
|
2017-05-30 12:13:33 +00:00
|
|
|
static bool ring_is_idle(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
bool idle = true;
|
|
|
|
|
2019-01-18 11:22:25 +00:00
|
|
|
if (I915_SELFTEST_ONLY(!engine->mmio_base))
|
|
|
|
return true;
|
|
|
|
|
2019-08-12 09:10:44 +00:00
|
|
|
if (!intel_engine_pm_get_if_awake(engine))
|
2018-02-12 09:39:28 +00:00
|
|
|
return true;
|
2017-05-30 12:13:33 +00:00
|
|
|
|
2019-02-27 20:46:53 +00:00
|
|
|
/* First check that no commands are left in the ring */
|
2019-03-25 21:49:40 +00:00
|
|
|
if ((ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR) !=
|
|
|
|
(ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR))
|
2019-02-27 20:46:53 +00:00
|
|
|
idle = false;
|
2017-05-30 12:13:34 +00:00
|
|
|
|
2019-02-27 20:46:53 +00:00
|
|
|
/* No bit for gen2, so assume the CS parser is idle */
|
2019-08-12 09:10:44 +00:00
|
|
|
if (INTEL_GEN(engine->i915) > 2 &&
|
2019-03-25 21:49:40 +00:00
|
|
|
!(ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE))
|
2017-05-30 12:13:33 +00:00
|
|
|
idle = false;
|
|
|
|
|
2019-08-12 09:10:44 +00:00
|
|
|
intel_engine_pm_put(engine);
|
2017-05-30 12:13:33 +00:00
|
|
|
|
|
|
|
return idle;
|
|
|
|
}
|
|
|
|
|
2017-03-03 12:19:46 +00:00
|
|
|
/**
|
|
|
|
* intel_engine_is_idle() - Report if the engine has finished process all work
|
|
|
|
* @engine: the intel_engine_cs
|
|
|
|
*
|
|
|
|
* Return true if there are no requests pending, nothing left to be submitted
|
|
|
|
* to hardware, and that the engine is idle.
|
|
|
|
*/
|
|
|
|
bool intel_engine_is_idle(struct intel_engine_cs *engine)
|
|
|
|
{
|
2017-04-11 19:00:42 +00:00
|
|
|
/* More white lies, if wedged, hw state is inconsistent */
|
2019-07-12 19:29:53 +00:00
|
|
|
if (intel_gt_is_wedged(engine->gt))
|
2017-04-11 19:00:42 +00:00
|
|
|
return true;
|
|
|
|
|
2019-06-25 13:01:14 +00:00
|
|
|
if (!intel_engine_pm_is_awake(engine))
|
2019-05-03 11:52:15 +00:00
|
|
|
return true;
|
|
|
|
|
2017-10-23 21:32:36 +00:00
|
|
|
/* Waiting to drain ELSP? */
|
drm/i915/execlists: Preempt-to-busy
When using a global seqno, we required a precise stop-the-workd event to
handle preemption and unwind the global seqno counter. To accomplish
this, we would preempt to a special out-of-band context and wait for the
machine to report that it was idle. Given an idle machine, we could very
precisely see which requests had completed and which we needed to feed
back into the run queue.
However, now that we have scrapped the global seqno, we no longer need
to precisely unwind the global counter and only track requests by their
per-context seqno. This allows us to loosely unwind inflight requests
while scheduling a preemption, with the enormous caveat that the
requests we put back on the run queue are still _inflight_ (until the
preemption request is complete). This makes request tracking much more
messy, as at any point then we can see a completed request that we
believe is not currently scheduled for execution. We also have to be
careful not to rewind RING_TAIL past RING_HEAD on preempting to the
running context, and for this we use a semaphore to prevent completion
of the request before continuing.
To accomplish this feat, we change how we track requests scheduled to
the HW. Instead of appending our requests onto a single list as we
submit, we track each submission to ELSP as its own block. Then upon
receiving the CS preemption event, we promote the pending block to the
inflight block (discarding what was previously being tracked). As normal
CS completion events arrive, we then remove stale entries from the
inflight tracker.
v2: Be a tinge paranoid and ensure we flush the write into the HWS page
for the GPU semaphore to pick in a timely fashion.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190620142052.19311-1-chris@chris-wilson.co.uk
2019-06-20 14:20:51 +00:00
|
|
|
if (execlists_active(&engine->execlists)) {
|
2018-07-13 20:35:28 +00:00
|
|
|
struct tasklet_struct *t = &engine->execlists.tasklet;
|
2018-05-06 17:13:28 +00:00
|
|
|
|
2019-07-02 15:17:23 +00:00
|
|
|
synchronize_hardirq(engine->i915->drm.pdev->irq);
|
2019-05-03 08:09:42 +00:00
|
|
|
|
2018-06-20 13:59:29 +00:00
|
|
|
local_bh_disable();
|
2018-07-13 20:35:28 +00:00
|
|
|
if (tasklet_trylock(t)) {
|
|
|
|
/* Must wait for any GPU reset in progress. */
|
|
|
|
if (__tasklet_is_enabled(t))
|
|
|
|
t->func(t->data);
|
|
|
|
tasklet_unlock(t);
|
2018-05-06 17:13:28 +00:00
|
|
|
}
|
2018-06-20 13:59:29 +00:00
|
|
|
local_bh_enable();
|
2018-05-06 17:13:28 +00:00
|
|
|
|
2018-09-14 08:00:16 +00:00
|
|
|
/* Otherwise flush the tasklet if it was on another cpu */
|
|
|
|
tasklet_unlock_wait(t);
|
|
|
|
|
drm/i915/execlists: Preempt-to-busy
When using a global seqno, we required a precise stop-the-workd event to
handle preemption and unwind the global seqno counter. To accomplish
this, we would preempt to a special out-of-band context and wait for the
machine to report that it was idle. Given an idle machine, we could very
precisely see which requests had completed and which we needed to feed
back into the run queue.
However, now that we have scrapped the global seqno, we no longer need
to precisely unwind the global counter and only track requests by their
per-context seqno. This allows us to loosely unwind inflight requests
while scheduling a preemption, with the enormous caveat that the
requests we put back on the run queue are still _inflight_ (until the
preemption request is complete). This makes request tracking much more
messy, as at any point then we can see a completed request that we
believe is not currently scheduled for execution. We also have to be
careful not to rewind RING_TAIL past RING_HEAD on preempting to the
running context, and for this we use a semaphore to prevent completion
of the request before continuing.
To accomplish this feat, we change how we track requests scheduled to
the HW. Instead of appending our requests onto a single list as we
submit, we track each submission to ELSP as its own block. Then upon
receiving the CS preemption event, we promote the pending block to the
inflight block (discarding what was previously being tracked). As normal
CS completion events arrive, we then remove stale entries from the
inflight tracker.
v2: Be a tinge paranoid and ensure we flush the write into the HWS page
for the GPU semaphore to pick in a timely fashion.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190620142052.19311-1-chris@chris-wilson.co.uk
2019-06-20 14:20:51 +00:00
|
|
|
if (execlists_active(&engine->execlists))
|
2018-05-06 17:13:28 +00:00
|
|
|
return false;
|
|
|
|
}
|
2017-03-03 12:19:46 +00:00
|
|
|
|
2018-05-06 17:13:28 +00:00
|
|
|
/* ELSP is empty, but there are ready requests? E.g. after reset */
|
2018-06-29 07:53:20 +00:00
|
|
|
if (!RB_EMPTY_ROOT(&engine->execlists.queue.rb_root))
|
2017-07-21 12:32:24 +00:00
|
|
|
return false;
|
|
|
|
|
2017-03-03 12:19:46 +00:00
|
|
|
/* Ring stopped? */
|
2019-01-18 11:22:25 +00:00
|
|
|
return ring_is_idle(engine);
|
2017-03-03 12:19:46 +00:00
|
|
|
}
|
|
|
|
|
2019-07-12 19:29:53 +00:00
|
|
|
bool intel_engines_are_idle(struct intel_gt *gt)
|
2017-03-03 12:19:47 +00:00
|
|
|
{
|
|
|
|
struct intel_engine_cs *engine;
|
|
|
|
enum intel_engine_id id;
|
|
|
|
|
2017-12-12 13:21:48 +00:00
|
|
|
/*
|
|
|
|
* If the driver is wedged, HW state may be very inconsistent and
|
2017-03-30 14:50:37 +00:00
|
|
|
* report that it is still busy, even though we have stopped using it.
|
|
|
|
*/
|
2019-07-12 19:29:53 +00:00
|
|
|
if (intel_gt_is_wedged(gt))
|
2017-03-30 14:50:37 +00:00
|
|
|
return true;
|
|
|
|
|
2019-02-27 21:41:59 +00:00
|
|
|
/* Already parked (and passed an idleness test); must still be idle */
|
2019-07-12 19:29:53 +00:00
|
|
|
if (!READ_ONCE(gt->awake))
|
2019-02-27 21:41:59 +00:00
|
|
|
return true;
|
|
|
|
|
2019-07-12 19:29:53 +00:00
|
|
|
for_each_engine(engine, gt->i915, id) {
|
2017-03-03 12:19:47 +00:00
|
|
|
if (!intel_engine_is_idle(engine))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-07-12 19:29:53 +00:00
|
|
|
void intel_engines_reset_default_submission(struct intel_gt *gt)
|
2017-03-16 17:13:03 +00:00
|
|
|
{
|
|
|
|
struct intel_engine_cs *engine;
|
|
|
|
enum intel_engine_id id;
|
|
|
|
|
2019-07-12 19:29:53 +00:00
|
|
|
for_each_engine(engine, gt->i915, id)
|
2017-03-16 17:13:03 +00:00
|
|
|
engine->set_default_submission(engine);
|
|
|
|
}
|
|
|
|
|
2017-09-06 15:28:59 +00:00
|
|
|
bool intel_engine_can_store_dword(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
switch (INTEL_GEN(engine->i915)) {
|
|
|
|
case 2:
|
|
|
|
return false; /* uses physical not virtual addresses */
|
|
|
|
case 3:
|
|
|
|
/* maybe only uses physical not virtual addresses */
|
|
|
|
return !(IS_I915G(engine->i915) || IS_I915GM(engine->i915));
|
2019-08-26 13:38:37 +00:00
|
|
|
case 4:
|
|
|
|
return !IS_I965G(engine->i915); /* who knows! */
|
2017-09-06 15:28:59 +00:00
|
|
|
case 6:
|
|
|
|
return engine->class != VIDEO_DECODE_CLASS; /* b0rked */
|
|
|
|
default:
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-04-24 01:08:39 +00:00
|
|
|
static int print_sched_attr(struct drm_i915_private *i915,
|
|
|
|
const struct i915_sched_attr *attr,
|
|
|
|
char *buf, int x, int len)
|
2018-04-18 18:40:52 +00:00
|
|
|
{
|
|
|
|
if (attr->priority == I915_PRIORITY_INVALID)
|
2018-04-24 01:08:39 +00:00
|
|
|
return x;
|
|
|
|
|
|
|
|
x += snprintf(buf + x, len - x,
|
|
|
|
" prio=%d", attr->priority);
|
2018-04-18 18:40:52 +00:00
|
|
|
|
2018-04-24 01:08:39 +00:00
|
|
|
return x;
|
2018-04-18 18:40:52 +00:00
|
|
|
}
|
|
|
|
|
2017-10-09 11:02:57 +00:00
|
|
|
static void print_request(struct drm_printer *m,
|
2018-02-21 09:56:36 +00:00
|
|
|
struct i915_request *rq,
|
2017-10-09 11:02:57 +00:00
|
|
|
const char *prefix)
|
|
|
|
{
|
2018-03-14 10:16:30 +00:00
|
|
|
const char *name = rq->fence.ops->get_timeline_name(&rq->fence);
|
2018-05-17 15:28:24 +00:00
|
|
|
char buf[80] = "";
|
2018-04-24 01:08:39 +00:00
|
|
|
int x = 0;
|
|
|
|
|
|
|
|
x = print_sched_attr(rq->i915, &rq->sched.attr, buf, x, sizeof(buf));
|
2018-03-14 10:16:30 +00:00
|
|
|
|
2019-02-26 09:49:21 +00:00
|
|
|
drm_printf(m, "%s %llx:%llx%s%s %s @ %dms: %s\n",
|
2018-04-18 18:40:52 +00:00
|
|
|
prefix,
|
2019-02-26 09:49:21 +00:00
|
|
|
rq->fence.context, rq->fence.seqno,
|
2019-01-29 18:54:50 +00:00
|
|
|
i915_request_completed(rq) ? "!" :
|
|
|
|
i915_request_started(rq) ? "*" :
|
|
|
|
"",
|
2019-05-01 11:45:29 +00:00
|
|
|
test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
|
|
|
|
&rq->fence.flags) ? "+" :
|
drm/i915: Replace global breadcrumbs with per-context interrupt tracking
A few years ago, see commit 688e6c725816 ("drm/i915: Slaughter the
thundering i915_wait_request herd"), the issue of handling multiple
clients waiting in parallel was brought to our attention. The
requirement was that every client should be woken immediately upon its
request being signaled, without incurring any cpu overhead.
To handle certain fragility of our hw meant that we could not do a
simple check inside the irq handler (some generations required almost
unbounded delays before we could be sure of seqno coherency) and so
request completion checking required delegation.
Before commit 688e6c725816, the solution was simple. Every client
waiting on a request would be woken on every interrupt and each would do
a heavyweight check to see if their request was complete. Commit
688e6c725816 introduced an rbtree so that only the earliest waiter on
the global timeline would woken, and would wake the next and so on.
(Along with various complications to handle requests being reordered
along the global timeline, and also a requirement for kthread to provide
a delegate for fence signaling that had no process context.)
The global rbtree depends on knowing the execution timeline (and global
seqno). Without knowing that order, we must instead check all contexts
queued to the HW to see which may have advanced. We trim that list by
only checking queued contexts that are being waited on, but still we
keep a list of all active contexts and their active signalers that we
inspect from inside the irq handler. By moving the waiters onto the fence
signal list, we can combine the client wakeup with the dma_fence
signaling (a dramatic reduction in complexity, but does require the HW
being coherent, the seqno must be visible from the cpu before the
interrupt is raised - we keep a timer backup just in case).
Having previously fixed all the issues with irq-seqno serialisation (by
inserting delays onto the GPU after each request instead of random delays
on the CPU after each interrupt), we can rely on the seqno state to
perfom direct wakeups from the interrupt handler. This allows us to
preserve our single context switch behaviour of the current routine,
with the only downside that we lose the RT priority sorting of wakeups.
In general, direct wakeup latency of multiple clients is about the same
(about 10% better in most cases) with a reduction in total CPU time spent
in the waiter (about 20-50% depending on gen). Average herd behaviour is
improved, but at the cost of not delegating wakeups on task_prio.
v2: Capture fence signaling state for error state and add comments to
warm even the most cold of hearts.
v3: Check if the request is still active before busywaiting
v4: Reduce the amount of pointer misdirection with list_for_each_safe
and using a local i915_request variable inside the loops
v5: Add a missing pluralisation to a purely informative selftest message.
References: 688e6c725816 ("drm/i915: Slaughter the thundering i915_wait_request herd")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190129205230.19056-2-chris@chris-wilson.co.uk
2019-01-29 20:52:29 +00:00
|
|
|
test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
|
2019-05-01 11:45:29 +00:00
|
|
|
&rq->fence.flags) ? "-" :
|
|
|
|
"",
|
2018-04-24 01:08:39 +00:00
|
|
|
buf,
|
2017-10-09 11:02:57 +00:00
|
|
|
jiffies_to_msecs(jiffies - rq->emitted_jiffies),
|
2018-03-14 10:16:30 +00:00
|
|
|
name);
|
2017-10-09 11:02:57 +00:00
|
|
|
}
|
|
|
|
|
2017-12-22 18:25:21 +00:00
|
|
|
static void hexdump(struct drm_printer *m, const void *buf, size_t len)
|
|
|
|
{
|
|
|
|
const size_t rowsize = 8 * sizeof(u32);
|
|
|
|
const void *prev = NULL;
|
|
|
|
bool skip = false;
|
|
|
|
size_t pos;
|
|
|
|
|
|
|
|
for (pos = 0; pos < len; pos += rowsize) {
|
|
|
|
char line[128];
|
|
|
|
|
|
|
|
if (prev && !memcmp(prev, buf + pos, rowsize)) {
|
|
|
|
if (!skip) {
|
|
|
|
drm_printf(m, "*\n");
|
|
|
|
skip = true;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
WARN_ON_ONCE(hex_dump_to_buffer(buf + pos, len - pos,
|
|
|
|
rowsize, sizeof(u32),
|
|
|
|
line, sizeof(line),
|
|
|
|
false) >= sizeof(line));
|
2018-06-14 09:41:01 +00:00
|
|
|
drm_printf(m, "[%04zx] %s\n", pos, line);
|
2017-12-22 18:25:21 +00:00
|
|
|
|
|
|
|
prev = buf + pos;
|
|
|
|
skip = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-06-18 16:19:51 +00:00
|
|
|
static void intel_engine_print_registers(struct intel_engine_cs *engine,
|
2018-02-12 10:24:15 +00:00
|
|
|
struct drm_printer *m)
|
2017-10-09 11:02:57 +00:00
|
|
|
{
|
|
|
|
struct drm_i915_private *dev_priv = engine->i915;
|
2018-02-12 10:24:15 +00:00
|
|
|
const struct intel_engine_execlists * const execlists =
|
|
|
|
&engine->execlists;
|
2019-06-18 16:19:51 +00:00
|
|
|
unsigned long flags;
|
2017-10-09 11:02:57 +00:00
|
|
|
u64 addr;
|
|
|
|
|
2019-08-13 17:41:20 +00:00
|
|
|
if (engine->id == RENDER_CLASS && IS_GEN_RANGE(dev_priv, 4, 7))
|
2019-03-25 21:49:40 +00:00
|
|
|
drm_printf(m, "\tCCID: 0x%08x\n", ENGINE_READ(engine, CCID));
|
2018-02-12 10:24:15 +00:00
|
|
|
drm_printf(m, "\tRING_START: 0x%08x\n",
|
2019-03-25 21:49:40 +00:00
|
|
|
ENGINE_READ(engine, RING_START));
|
2018-02-12 10:24:15 +00:00
|
|
|
drm_printf(m, "\tRING_HEAD: 0x%08x\n",
|
2019-03-25 21:49:40 +00:00
|
|
|
ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR);
|
2018-02-12 10:24:15 +00:00
|
|
|
drm_printf(m, "\tRING_TAIL: 0x%08x\n",
|
2019-03-25 21:49:40 +00:00
|
|
|
ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR);
|
2017-10-26 11:50:48 +00:00
|
|
|
drm_printf(m, "\tRING_CTL: 0x%08x%s\n",
|
2019-03-25 21:49:40 +00:00
|
|
|
ENGINE_READ(engine, RING_CTL),
|
|
|
|
ENGINE_READ(engine, RING_CTL) & (RING_WAIT | RING_WAIT_SEMAPHORE) ? " [waiting]" : "");
|
2017-10-26 11:50:48 +00:00
|
|
|
if (INTEL_GEN(engine->i915) > 2) {
|
|
|
|
drm_printf(m, "\tRING_MODE: 0x%08x%s\n",
|
2019-03-25 21:49:40 +00:00
|
|
|
ENGINE_READ(engine, RING_MI_MODE),
|
|
|
|
ENGINE_READ(engine, RING_MI_MODE) & (MODE_IDLE) ? " [idle]" : "");
|
2017-10-26 11:50:48 +00:00
|
|
|
}
|
2018-02-12 10:24:15 +00:00
|
|
|
|
|
|
|
if (INTEL_GEN(dev_priv) >= 6) {
|
2019-03-25 21:49:40 +00:00
|
|
|
drm_printf(m, "\tRING_IMR: %08x\n",
|
|
|
|
ENGINE_READ(engine, RING_IMR));
|
2018-02-12 10:24:15 +00:00
|
|
|
}
|
|
|
|
|
2017-10-09 11:02:57 +00:00
|
|
|
addr = intel_engine_get_active_head(engine);
|
|
|
|
drm_printf(m, "\tACTHD: 0x%08x_%08x\n",
|
|
|
|
upper_32_bits(addr), lower_32_bits(addr));
|
|
|
|
addr = intel_engine_get_last_batch_head(engine);
|
|
|
|
drm_printf(m, "\tBBADDR: 0x%08x_%08x\n",
|
|
|
|
upper_32_bits(addr), lower_32_bits(addr));
|
2017-12-18 12:39:14 +00:00
|
|
|
if (INTEL_GEN(dev_priv) >= 8)
|
2019-03-25 21:49:40 +00:00
|
|
|
addr = ENGINE_READ64(engine, RING_DMA_FADD, RING_DMA_FADD_UDW);
|
2017-12-18 12:39:14 +00:00
|
|
|
else if (INTEL_GEN(dev_priv) >= 4)
|
2019-03-25 21:49:40 +00:00
|
|
|
addr = ENGINE_READ(engine, RING_DMA_FADD);
|
2017-12-18 12:39:14 +00:00
|
|
|
else
|
2019-03-25 21:49:40 +00:00
|
|
|
addr = ENGINE_READ(engine, DMA_FADD_I8XX);
|
2017-12-18 12:39:14 +00:00
|
|
|
drm_printf(m, "\tDMA_FADDR: 0x%08x_%08x\n",
|
|
|
|
upper_32_bits(addr), lower_32_bits(addr));
|
|
|
|
if (INTEL_GEN(dev_priv) >= 4) {
|
|
|
|
drm_printf(m, "\tIPEIR: 0x%08x\n",
|
2019-03-25 21:49:40 +00:00
|
|
|
ENGINE_READ(engine, RING_IPEIR));
|
2017-12-18 12:39:14 +00:00
|
|
|
drm_printf(m, "\tIPEHR: 0x%08x\n",
|
2019-03-25 21:49:40 +00:00
|
|
|
ENGINE_READ(engine, RING_IPEHR));
|
2017-12-18 12:39:14 +00:00
|
|
|
} else {
|
2019-03-25 21:49:40 +00:00
|
|
|
drm_printf(m, "\tIPEIR: 0x%08x\n", ENGINE_READ(engine, IPEIR));
|
|
|
|
drm_printf(m, "\tIPEHR: 0x%08x\n", ENGINE_READ(engine, IPEHR));
|
2017-12-18 12:39:14 +00:00
|
|
|
}
|
2017-10-09 11:02:57 +00:00
|
|
|
|
2017-11-20 20:55:00 +00:00
|
|
|
if (HAS_EXECLISTS(dev_priv)) {
|
drm/i915/execlists: Preempt-to-busy
When using a global seqno, we required a precise stop-the-world event to
handle preemption and unwind the global seqno counter. To accomplish
this, we would preempt to a special out-of-band context and wait for the
machine to report that it was idle. Given an idle machine, we could very
precisely see which requests had completed and which we needed to feed
back into the run queue.
However, now that we have scrapped the global seqno, we no longer need
to precisely unwind the global counter and only track requests by their
per-context seqno. This allows us to loosely unwind inflight requests
while scheduling a preemption, with the enormous caveat that the
requests we put back on the run queue are still _inflight_ (until the
preemption request is complete). This makes request tracking much more
messy, as at any point then we can see a completed request that we
believe is not currently scheduled for execution. We also have to be
careful not to rewind RING_TAIL past RING_HEAD on preempting to the
running context, and for this we use a semaphore to prevent completion
of the request before continuing.
To accomplish this feat, we change how we track requests scheduled to
the HW. Instead of appending our requests onto a single list as we
submit, we track each submission to ELSP as its own block. Then upon
receiving the CS preemption event, we promote the pending block to the
inflight block (discarding what was previously being tracked). As normal
CS completion events arrive, we then remove stale entries from the
inflight tracker.
v2: Be a tinge paranoid and ensure we flush the write into the HWS page
for the GPU semaphore to pick in a timely fashion.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190620142052.19311-1-chris@chris-wilson.co.uk
2019-06-20 14:20:51 +00:00
|
|
|
struct i915_request * const *port, *rq;
|
2019-01-28 10:23:55 +00:00
|
|
|
const u32 *hws =
|
|
|
|
&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
|
2019-04-05 20:46:56 +00:00
|
|
|
const u8 num_entries = execlists->csb_size;
|
2017-10-09 11:02:57 +00:00
|
|
|
unsigned int idx;
|
2018-08-21 10:11:38 +00:00
|
|
|
u8 read, write;
|
2017-10-09 11:02:57 +00:00
|
|
|
|
2019-04-05 20:46:56 +00:00
|
|
|
drm_printf(m, "\tExeclist status: 0x%08x %08x, entries %u\n",
|
2019-03-25 21:49:40 +00:00
|
|
|
ENGINE_READ(engine, RING_EXECLIST_STATUS_LO),
|
2019-04-05 20:46:56 +00:00
|
|
|
ENGINE_READ(engine, RING_EXECLIST_STATUS_HI),
|
|
|
|
num_entries);
|
2017-10-09 11:02:57 +00:00
|
|
|
|
2018-08-21 10:11:38 +00:00
|
|
|
read = execlists->csb_head;
|
|
|
|
write = READ_ONCE(*execlists->csb_write);
|
|
|
|
|
2019-04-05 20:46:56 +00:00
|
|
|
drm_printf(m, "\tExeclist CSB read %d, write %d, tasklet queued? %s (%s)\n",
|
2018-08-21 10:11:38 +00:00
|
|
|
read, write,
|
2018-03-26 11:50:36 +00:00
|
|
|
yesno(test_bit(TASKLET_STATE_SCHED,
|
|
|
|
&engine->execlists.tasklet.state)),
|
|
|
|
enableddisabled(!atomic_read(&engine->execlists.tasklet.count)));
|
2019-04-05 20:46:56 +00:00
|
|
|
if (read >= num_entries)
|
2017-10-09 11:02:57 +00:00
|
|
|
read = 0;
|
2019-04-05 20:46:56 +00:00
|
|
|
if (write >= num_entries)
|
2017-10-09 11:02:57 +00:00
|
|
|
write = 0;
|
|
|
|
if (read > write)
|
2019-04-05 20:46:56 +00:00
|
|
|
write += num_entries;
|
2017-10-09 11:02:57 +00:00
|
|
|
while (read < write) {
|
2019-04-05 20:46:56 +00:00
|
|
|
idx = ++read % num_entries;
|
|
|
|
drm_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: %d\n",
|
|
|
|
idx, hws[idx * 2], hws[idx * 2 + 1]);
|
2017-10-09 11:02:57 +00:00
|
|
|
}
|
|
|
|
|
2019-06-18 16:19:51 +00:00
|
|
|
spin_lock_irqsave(&engine->active.lock, flags);
|
drm/i915/execlists: Preempt-to-busy
When using a global seqno, we required a precise stop-the-world event to
handle preemption and unwind the global seqno counter. To accomplish
this, we would preempt to a special out-of-band context and wait for the
machine to report that it was idle. Given an idle machine, we could very
precisely see which requests had completed and which we needed to feed
back into the run queue.
However, now that we have scrapped the global seqno, we no longer need
to precisely unwind the global counter and only track requests by their
per-context seqno. This allows us to loosely unwind inflight requests
while scheduling a preemption, with the enormous caveat that the
requests we put back on the run queue are still _inflight_ (until the
preemption request is complete). This makes request tracking much more
messy, as at any point then we can see a completed request that we
believe is not currently scheduled for execution. We also have to be
careful not to rewind RING_TAIL past RING_HEAD on preempting to the
running context, and for this we use a semaphore to prevent completion
of the request before continuing.
To accomplish this feat, we change how we track requests scheduled to
the HW. Instead of appending our requests onto a single list as we
submit, we track each submission to ELSP as its own block. Then upon
receiving the CS preemption event, we promote the pending block to the
inflight block (discarding what was previously being tracked). As normal
CS completion events arrive, we then remove stale entries from the
inflight tracker.
v2: Be a tinge paranoid and ensure we flush the write into the HWS page
for the GPU semaphore to pick in a timely fashion.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190620142052.19311-1-chris@chris-wilson.co.uk
2019-06-20 14:20:51 +00:00
|
|
|
for (port = execlists->active; (rq = *port); port++) {
|
|
|
|
char hdr[80];
|
|
|
|
int len;
|
|
|
|
|
|
|
|
len = snprintf(hdr, sizeof(hdr),
|
|
|
|
"\t\tActive[%d: ",
|
|
|
|
(int)(port - execlists->active));
|
|
|
|
if (!i915_request_signaled(rq))
|
|
|
|
len += snprintf(hdr + len, sizeof(hdr) - len,
|
|
|
|
"ring:{start:%08x, hwsp:%08x, seqno:%08x}, ",
|
|
|
|
i915_ggtt_offset(rq->ring->vma),
|
|
|
|
rq->timeline->hwsp_offset,
|
|
|
|
hwsp_seqno(rq));
|
|
|
|
snprintf(hdr + len, sizeof(hdr) - len, "rq: ");
|
|
|
|
print_request(m, rq, hdr);
|
|
|
|
}
|
|
|
|
for (port = execlists->pending; (rq = *port); port++) {
|
2019-06-18 16:19:51 +00:00
|
|
|
char hdr[80];
|
2017-10-09 11:02:57 +00:00
|
|
|
|
drm/i915/execlists: Preempt-to-busy
When using a global seqno, we required a precise stop-the-world event to
handle preemption and unwind the global seqno counter. To accomplish
this, we would preempt to a special out-of-band context and wait for the
machine to report that it was idle. Given an idle machine, we could very
precisely see which requests had completed and which we needed to feed
back into the run queue.
However, now that we have scrapped the global seqno, we no longer need
to precisely unwind the global counter and only track requests by their
per-context seqno. This allows us to loosely unwind inflight requests
while scheduling a preemption, with the enormous caveat that the
requests we put back on the run queue are still _inflight_ (until the
preemption request is complete). This makes request tracking much more
messy, as at any point then we can see a completed request that we
believe is not currently scheduled for execution. We also have to be
careful not to rewind RING_TAIL past RING_HEAD on preempting to the
running context, and for this we use a semaphore to prevent completion
of the request before continuing.
To accomplish this feat, we change how we track requests scheduled to
the HW. Instead of appending our requests onto a single list as we
submit, we track each submission to ELSP as its own block. Then upon
receiving the CS preemption event, we promote the pending block to the
inflight block (discarding what was previously being tracked). As normal
CS completion events arrive, we then remove stale entries from the
inflight tracker.
v2: Be a tinge paranoid and ensure we flush the write into the HWS page
for the GPU semaphore to pick in a timely fashion.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190620142052.19311-1-chris@chris-wilson.co.uk
2019-06-20 14:20:51 +00:00
|
|
|
snprintf(hdr, sizeof(hdr),
|
|
|
|
"\t\tPending[%d] ring:{start:%08x, hwsp:%08x, seqno:%08x}, rq: ",
|
|
|
|
(int)(port - execlists->pending),
|
|
|
|
i915_ggtt_offset(rq->ring->vma),
|
|
|
|
rq->timeline->hwsp_offset,
|
|
|
|
hwsp_seqno(rq));
|
|
|
|
print_request(m, rq, hdr);
|
2017-10-09 11:02:57 +00:00
|
|
|
}
|
2019-06-18 16:19:51 +00:00
|
|
|
spin_unlock_irqrestore(&engine->active.lock, flags);
|
2017-10-09 11:02:57 +00:00
|
|
|
} else if (INTEL_GEN(dev_priv) > 6) {
|
|
|
|
drm_printf(m, "\tPP_DIR_BASE: 0x%08x\n",
|
2019-03-25 21:49:40 +00:00
|
|
|
ENGINE_READ(engine, RING_PP_DIR_BASE));
|
2017-10-09 11:02:57 +00:00
|
|
|
drm_printf(m, "\tPP_DIR_BASE_READ: 0x%08x\n",
|
2019-03-25 21:49:40 +00:00
|
|
|
ENGINE_READ(engine, RING_PP_DIR_BASE_READ));
|
2017-10-09 11:02:57 +00:00
|
|
|
drm_printf(m, "\tPP_DIR_DCLV: 0x%08x\n",
|
2019-03-25 21:49:40 +00:00
|
|
|
ENGINE_READ(engine, RING_PP_DIR_DCLV));
|
2017-10-09 11:02:57 +00:00
|
|
|
}
|
2018-02-12 10:24:15 +00:00
|
|
|
}
|
|
|
|
|
2018-06-14 12:21:50 +00:00
|
|
|
static void print_request_ring(struct drm_printer *m, struct i915_request *rq)
|
|
|
|
{
|
|
|
|
void *ring;
|
|
|
|
int size;
|
|
|
|
|
|
|
|
drm_printf(m,
|
|
|
|
"[head %04x, postfix %04x, tail %04x, batch 0x%08x_%08x]:\n",
|
|
|
|
rq->head, rq->postfix, rq->tail,
|
|
|
|
rq->batch ? upper_32_bits(rq->batch->node.start) : ~0u,
|
|
|
|
rq->batch ? lower_32_bits(rq->batch->node.start) : ~0u);
|
|
|
|
|
|
|
|
size = rq->tail - rq->head;
|
|
|
|
if (rq->tail < rq->head)
|
|
|
|
size += rq->ring->size;
|
|
|
|
|
|
|
|
ring = kmalloc(size, GFP_ATOMIC);
|
|
|
|
if (ring) {
|
|
|
|
const void *vaddr = rq->ring->vaddr;
|
|
|
|
unsigned int head = rq->head;
|
|
|
|
unsigned int len = 0;
|
|
|
|
|
|
|
|
if (rq->tail < head) {
|
|
|
|
len = rq->ring->size - head;
|
|
|
|
memcpy(ring, vaddr + head, len);
|
|
|
|
head = 0;
|
|
|
|
}
|
|
|
|
memcpy(ring + len, vaddr + head, size - len);
|
|
|
|
|
|
|
|
hexdump(m, ring, size);
|
|
|
|
kfree(ring);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-02-12 10:24:15 +00:00
|
|
|
void intel_engine_dump(struct intel_engine_cs *engine,
|
|
|
|
struct drm_printer *m,
|
|
|
|
const char *header, ...)
|
|
|
|
{
|
|
|
|
struct i915_gpu_error * const error = &engine->i915->gpu_error;
|
2019-01-15 21:29:48 +00:00
|
|
|
struct i915_request *rq;
|
2019-01-14 14:21:18 +00:00
|
|
|
intel_wakeref_t wakeref;
|
2019-07-15 08:09:28 +00:00
|
|
|
unsigned long flags;
|
2018-02-12 10:24:15 +00:00
|
|
|
|
|
|
|
if (header) {
|
|
|
|
va_list ap;
|
|
|
|
|
|
|
|
va_start(ap, header);
|
|
|
|
drm_vprintf(m, header, &ap);
|
|
|
|
va_end(ap);
|
|
|
|
}
|
|
|
|
|
2019-07-12 19:29:53 +00:00
|
|
|
if (intel_gt_is_wedged(engine->gt))
|
2018-02-12 10:24:15 +00:00
|
|
|
drm_printf(m, "*** WEDGED ***\n");
|
|
|
|
|
drm/i915: Invert the GEM wakeref hierarchy
In the current scheme, on submitting a request we take a single global
GEM wakeref, which trickles down to wake up all GT power domains. This
is undesirable as we would like to be able to localise our power
management to the available power domains and to remove the global GEM
operations from the heart of the driver. (The intent there is to push
global GEM decisions to the boundary as used by the GEM user interface.)
Now during request construction, each request is responsible via its
logical context to acquire a wakeref on each power domain it intends to
utilize. Currently, each request takes a wakeref on the engine(s) and
the engines themselves take a chipset wakeref. This gives us a
transition on each engine which we can extend if we want to insert more
powermangement control (such as soft rc6). The global GEM operations
that currently require a struct_mutex are reduced to listening to pm
events from the chipset GT wakeref. As we reduce the struct_mutex
requirement, these listeners should evaporate.
Perhaps the biggest immediate change is that this removes the
struct_mutex requirement around GT power management, allowing us greater
flexibility in request construction. Another important knock-on effect,
is that by tracking engine usage, we can insert a switch back to the
kernel context on that engine immediately, avoiding any extra delay or
inserting global synchronisation barriers. This makes tracking when an
engine and its associated contexts are idle much easier -- important for
when we forgo our assumed execution ordering and need idle barriers to
unpin used contexts. In the process, it means we remove a large chunk of
code whose only purpose was to switch back to the kernel context.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Imre Deak <imre.deak@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190424200717.1686-5-chris@chris-wilson.co.uk
2019-04-24 20:07:17 +00:00
|
|
|
drm_printf(m, "\tAwake? %d\n", atomic_read(&engine->wakeref.count));
|
2019-05-08 08:06:25 +00:00
|
|
|
drm_printf(m, "\tHangcheck: %d ms ago\n",
|
2018-04-30 13:15:00 +00:00
|
|
|
jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
|
2018-02-12 10:24:15 +00:00
|
|
|
drm_printf(m, "\tReset count: %d (global %d)\n",
|
|
|
|
i915_reset_engine_count(error, engine),
|
|
|
|
i915_reset_count(error));
|
|
|
|
|
|
|
|
drm_printf(m, "\tRequests:\n");
|
|
|
|
|
2019-07-15 08:09:28 +00:00
|
|
|
spin_lock_irqsave(&engine->active.lock, flags);
|
2019-03-05 18:03:32 +00:00
|
|
|
rq = intel_engine_find_active_request(engine);
|
2018-02-12 10:24:15 +00:00
|
|
|
if (rq) {
|
|
|
|
print_request(m, rq, "\t\tactive ");
|
2018-06-14 12:21:50 +00:00
|
|
|
|
2018-03-07 13:42:24 +00:00
|
|
|
drm_printf(m, "\t\tring->start: 0x%08x\n",
|
2018-02-12 10:24:15 +00:00
|
|
|
i915_ggtt_offset(rq->ring->vma));
|
2018-03-07 13:42:24 +00:00
|
|
|
drm_printf(m, "\t\tring->head: 0x%08x\n",
|
2018-02-12 10:24:15 +00:00
|
|
|
rq->ring->head);
|
2018-03-07 13:42:24 +00:00
|
|
|
drm_printf(m, "\t\tring->tail: 0x%08x\n",
|
2018-02-12 10:24:15 +00:00
|
|
|
rq->ring->tail);
|
2018-03-07 13:42:24 +00:00
|
|
|
drm_printf(m, "\t\tring->emit: 0x%08x\n",
|
|
|
|
rq->ring->emit);
|
|
|
|
drm_printf(m, "\t\tring->space: 0x%08x\n",
|
|
|
|
rq->ring->space);
|
2019-01-28 18:18:11 +00:00
|
|
|
drm_printf(m, "\t\tring->hwsp: 0x%08x\n",
|
|
|
|
rq->timeline->hwsp_offset);
|
2018-06-14 12:21:50 +00:00
|
|
|
|
|
|
|
print_request_ring(m, rq);
|
2019-09-15 20:37:00 +00:00
|
|
|
|
|
|
|
if (rq->hw_context->lrc_reg_state) {
|
|
|
|
drm_printf(m, "Logical Ring Context:\n");
|
|
|
|
hexdump(m, rq->hw_context->lrc_reg_state, PAGE_SIZE);
|
|
|
|
}
|
2018-02-12 10:24:15 +00:00
|
|
|
}
|
2019-07-15 08:09:28 +00:00
|
|
|
spin_unlock_irqrestore(&engine->active.lock, flags);
|
2018-02-12 10:24:15 +00:00
|
|
|
|
2019-08-13 21:57:07 +00:00
|
|
|
drm_printf(m, "\tMMIO base: 0x%08x\n", engine->mmio_base);
|
2019-06-13 23:21:54 +00:00
|
|
|
wakeref = intel_runtime_pm_get_if_in_use(&engine->i915->runtime_pm);
|
2019-01-14 14:21:18 +00:00
|
|
|
if (wakeref) {
|
2018-02-12 10:24:15 +00:00
|
|
|
intel_engine_print_registers(engine, m);
|
2019-06-13 23:21:54 +00:00
|
|
|
intel_runtime_pm_put(&engine->i915->runtime_pm, wakeref);
|
2018-02-12 10:24:15 +00:00
|
|
|
} else {
|
|
|
|
drm_printf(m, "\tDevice is asleep; skipping register dump\n");
|
|
|
|
}
|
2017-10-09 11:02:57 +00:00
|
|
|
|
2019-01-15 21:29:48 +00:00
|
|
|
intel_execlists_show_requests(engine, m, print_request, 8);
|
2017-10-15 20:43:10 +00:00
|
|
|
|
2017-12-22 18:25:21 +00:00
|
|
|
drm_printf(m, "HWSP:\n");
|
2019-01-28 10:23:55 +00:00
|
|
|
hexdump(m, engine->status_page.addr, PAGE_SIZE);
|
2017-12-22 18:25:21 +00:00
|
|
|
|
2017-11-07 15:22:11 +00:00
|
|
|
drm_printf(m, "Idle? %s\n", yesno(intel_engine_is_idle(engine)));
|
drm/i915: Replace global breadcrumbs with per-context interrupt tracking
A few years ago, see commit 688e6c725816 ("drm/i915: Slaughter the
thundering i915_wait_request herd"), the issue of handling multiple
clients waiting in parallel was brought to our attention. The
requirement was that every client should be woken immediately upon its
request being signaled, without incurring any cpu overhead.
To handle certain fragility of our hw meant that we could not do a
simple check inside the irq handler (some generations required almost
unbounded delays before we could be sure of seqno coherency) and so
request completion checking required delegation.
Before commit 688e6c725816, the solution was simple. Every client
waiting on a request would be woken on every interrupt and each would do
a heavyweight check to see if their request was complete. Commit
688e6c725816 introduced an rbtree so that only the earliest waiter on
the global timeline would woken, and would wake the next and so on.
(Along with various complications to handle requests being reordered
along the global timeline, and also a requirement for kthread to provide
a delegate for fence signaling that had no process context.)
The global rbtree depends on knowing the execution timeline (and global
seqno). Without knowing that order, we must instead check all contexts
queued to the HW to see which may have advanced. We trim that list by
only checking queued contexts that are being waited on, but still we
keep a list of all active contexts and their active signalers that we
inspect from inside the irq handler. By moving the waiters onto the fence
signal list, we can combine the client wakeup with the dma_fence
signaling (a dramatic reduction in complexity, but does require the HW
being coherent, the seqno must be visible from the cpu before the
interrupt is raised - we keep a timer backup just in case).
Having previously fixed all the issues with irq-seqno serialisation (by
inserting delays onto the GPU after each request instead of random delays
on the CPU after each interrupt), we can rely on the seqno state to
perfom direct wakeups from the interrupt handler. This allows us to
preserve our single context switch behaviour of the current routine,
with the only downside that we lose the RT priority sorting of wakeups.
In general, direct wakeup latency of multiple clients is about the same
(about 10% better in most cases) with a reduction in total CPU time spent
in the waiter (about 20-50% depending on gen). Average herd behaviour is
improved, but at the cost of not delegating wakeups on task_prio.
v2: Capture fence signaling state for error state and add comments to
warm even the most cold of hearts.
v3: Check if the request is still active before busywaiting
v4: Reduce the amount of pointer misdirection with list_for_each_safe
and using a local i915_request variable inside the loops
v5: Add a missing pluralisation to a purely informative selftest message.
References: 688e6c725816 ("drm/i915: Slaughter the thundering i915_wait_request herd")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190129205230.19056-2-chris@chris-wilson.co.uk
2019-01-29 20:52:29 +00:00
|
|
|
|
|
|
|
intel_engine_print_breadcrumbs(engine, m);
|
2017-10-09 11:02:57 +00:00
|
|
|
}
|
|
|
|
|
2017-11-21 18:18:48 +00:00
|
|
|
/**
|
|
|
|
* intel_enable_engine_stats() - Enable engine busy tracking on engine
|
|
|
|
* @engine: engine to enable stats collection
|
|
|
|
*
|
|
|
|
* Start collecting the engine busyness data for @engine.
|
|
|
|
*
|
|
|
|
* Returns 0 on success or a negative error code.
|
|
|
|
*/
|
|
|
|
int intel_enable_engine_stats(struct intel_engine_cs *engine)
|
|
|
|
{
|
2018-01-15 09:20:41 +00:00
|
|
|
struct intel_engine_execlists *execlists = &engine->execlists;
|
2017-11-21 18:18:48 +00:00
|
|
|
unsigned long flags;
|
2018-01-15 09:20:41 +00:00
|
|
|
int err = 0;
|
2017-11-21 18:18:48 +00:00
|
|
|
|
2017-11-29 10:28:05 +00:00
|
|
|
if (!intel_engine_supports_stats(engine))
|
2017-11-21 18:18:48 +00:00
|
|
|
return -ENODEV;
|
|
|
|
|
2019-06-14 16:46:06 +00:00
|
|
|
spin_lock_irqsave(&engine->active.lock, flags);
|
drm/i915/execlists: Direct submission of new requests (avoid tasklet/ksoftirqd)
Back in commit 27af5eea54d1 ("drm/i915: Move execlists irq handler to a
bottom half"), we came to the conclusion that running our CSB processing
and ELSP submission from inside the irq handler was a bad idea. A really
bad idea as we could impose nearly 1s latency on other users of the
system, on average! Deferring our work to a tasklet allowed us to do the
processing with irqs enabled, reducing the impact to an average of about
50us.
We have since eradicated the use of forcewaked mmio from inside the CSB
processing and ELSP submission, bringing the impact down to around 5us
(on Kabylake); an order of magnitude better than our measurements 2
years ago on Broadwell and only about 2x worse on average than the
gem_syslatency on an unladen system.
In this iteration of the tasklet-vs-direct submission debate, we seek a
compromise where by we submit new requests immediately to the HW but
defer processing the CS interrupt onto a tasklet. We gain the advantage
of low-latency and ksoftirqd avoidance when waking up the HW, while
avoiding the system-wide starvation of our CS irq-storms.
Comparing the impact on the maximum latency observed (that is the time
stolen from an RT process) over a 120s interval, repeated several times
(using gem_syslatency, similar to RT's cyclictest) while the system is
fully laden with i915 nops, we see that direct submission an actually
improve the worse case.
Maximum latency in microseconds of a third party RT thread
(gem_syslatency -t 120 -f 2)
x Always using tasklets (a couple of >1000us outliers removed)
+ Only using tasklets from CS irq, direct submission of requests
+------------------------------------------------------------------------+
| + |
| + |
| + |
| + + |
| + + + |
| + + + + x x x |
| +++ + + + x x x x x x |
| +++ + ++ + + *x x x x x x |
| +++ + ++ + * *x x * x x x |
| + +++ + ++ * * +*xxx * x x xx |
| * +++ + ++++* *x+**xx+ * x x xxxx x |
| **x++++*++**+*x*x****x+ * +x xx xxxx x x |
|x* ******+***************++*+***xxxxxx* xx*x xxx + x+|
| |__________MA___________| |
| |______M__A________| |
+------------------------------------------------------------------------+
N Min Max Median Avg Stddev
x 118 91 186 124 125.28814 16.279137
+ 120 92 187 109 112.00833 13.458617
Difference at 95.0% confidence
-13.2798 +/- 3.79219
-10.5994% +/- 3.02677%
(Student's t, pooled s = 14.9237)
However the mean latency is adversely affected:
Mean latency in microseconds of a third party RT thread
(gem_syslatency -t 120 -f 1)
x Always using tasklets
+ Only using tasklets from CS irq, direct submission of requests
+------------------------------------------------------------------------+
| xxxxxx + ++ |
| xxxxxx + ++ |
| xxxxxx + +++ ++ |
| xxxxxxx +++++ ++ |
| xxxxxxx +++++ ++ |
| xxxxxxx +++++ +++ |
| xxxxxxx + ++++++++++ |
| xxxxxxxx ++ ++++++++++ |
| xxxxxxxx ++ ++++++++++ |
| xxxxxxxxxx +++++++++++++++ |
| xxxxxxxxxxx x +++++++++++++++ |
|x xxxxxxxxxxxxx x + + ++++++++++++++++++ +|
| |__A__| |
| |____A___| |
+------------------------------------------------------------------------+
N Min Max Median Avg Stddev
x 120 3.506 3.727 3.631 3.6321417 0.02773109
+ 120 3.834 4.149 4.039 4.0375167 0.041221676
Difference at 95.0% confidence
0.405375 +/- 0.00888913
11.1608% +/- 0.244735%
(Student's t, pooled s = 0.03513)
However, since the mean latency corresponds to the amount of irqsoff
processing we have to do for a CS interrupt, we only need to speed that
up to benefit not just system latency but our own throughput.
v2: Remember to defer submissions when under reset.
v4: Only use direct submission for new requests
v5: Be aware that with mixing direct tasklet evaluation and deferred
tasklets, we may end up idling before running the deferred tasklet.
v6: Remove the redudant likely() from tasklet_is_enabled(), restrict the
annotation to reset_in_progress().
v7: Take the full timeline.lock when enabling perf_pmu stats as the
tasklet is no longer a valid guard. A consequence is that the stats are
now only valid for engines also using the timeline.lock to process
state.
Testcase: igt/gem_exec_latency/*rthog*
References: 27af5eea54d1 ("drm/i915: Move execlists irq handler to a bottom half")
Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180628201211.13837-9-chris@chris-wilson.co.uk
2018-06-28 20:12:11 +00:00
|
|
|
write_seqlock(&engine->stats.lock);
|
2018-01-15 09:20:41 +00:00
|
|
|
|
|
|
|
if (unlikely(engine->stats.enabled == ~0)) {
|
|
|
|
err = -EBUSY;
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
|
2018-01-11 07:30:31 +00:00
|
|
|
if (engine->stats.enabled++ == 0) {
|
drm/i915/execlists: Preempt-to-busy
When using a global seqno, we required a precise stop-the-workd event to
handle preemption and unwind the global seqno counter. To accomplish
this, we would preempt to a special out-of-band context and wait for the
machine to report that it was idle. Given an idle machine, we could very
precisely see which requests had completed and which we needed to feed
back into the run queue.
However, now that we have scrapped the global seqno, we no longer need
to precisely unwind the global counter and only track requests by their
per-context seqno. This allows us to loosely unwind inflight requests
while scheduling a preemption, with the enormous caveat that the
requests we put back on the run queue are still _inflight_ (until the
preemption request is complete). This makes request tracking much more
messy, as at any point then we can see a completed request that we
believe is not currently scheduled for execution. We also have to be
careful not to rewind RING_TAIL past RING_HEAD on preempting to the
running context, and for this we use a semaphore to prevent completion
of the request before continuing.
To accomplish this feat, we change how we track requests scheduled to
the HW. Instead of appending our requests onto a single list as we
submit, we track each submission to ELSP as its own block. Then upon
receiving the CS preemption event, we promote the pending block to the
inflight block (discarding what was previously being tracked). As normal
CS completion events arrive, we then remove stale entries from the
inflight tracker.
v2: Be a tinge paranoid and ensure we flush the write into the HWS page
for the GPU semaphore to pick in a timely fashion.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190620142052.19311-1-chris@chris-wilson.co.uk
2019-06-20 14:20:51 +00:00
|
|
|
struct i915_request * const *port;
|
|
|
|
struct i915_request *rq;
|
2018-01-11 07:30:31 +00:00
|
|
|
|
2017-11-21 18:18:48 +00:00
|
|
|
engine->stats.enabled_at = ktime_get();
|
2018-01-11 07:30:31 +00:00
|
|
|
|
|
|
|
/* XXX submission method oblivious? */
|
drm/i915/execlists: Preempt-to-busy
When using a global seqno, we required a precise stop-the-workd event to
handle preemption and unwind the global seqno counter. To accomplish
this, we would preempt to a special out-of-band context and wait for the
machine to report that it was idle. Given an idle machine, we could very
precisely see which requests had completed and which we needed to feed
back into the run queue.
However, now that we have scrapped the global seqno, we no longer need
to precisely unwind the global counter and only track requests by their
per-context seqno. This allows us to loosely unwind inflight requests
while scheduling a preemption, with the enormous caveat that the
requests we put back on the run queue are still _inflight_ (until the
preemption request is complete). This makes request tracking much more
messy, as at any point then we can see a completed request that we
believe is not currently scheduled for execution. We also have to be
careful not to rewind RING_TAIL past RING_HEAD on preempting to the
running context, and for this we use a semaphore to prevent completion
of the request before continuing.
To accomplish this feat, we change how we track requests scheduled to
the HW. Instead of appending our requests onto a single list as we
submit, we track each submission to ELSP as its own block. Then upon
receiving the CS preemption event, we promote the pending block to the
inflight block (discarding what was previously being tracked). As normal
CS completion events arrive, we then remove stale entries from the
inflight tracker.
v2: Be a tinge paranoid and ensure we flush the write into the HWS page
for the GPU semaphore to pick in a timely fashion.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190620142052.19311-1-chris@chris-wilson.co.uk
2019-06-20 14:20:51 +00:00
|
|
|
for (port = execlists->active; (rq = *port); port++)
|
2018-01-11 07:30:31 +00:00
|
|
|
engine->stats.active++;
|
drm/i915/execlists: Preempt-to-busy
When using a global seqno, we required a precise stop-the-workd event to
handle preemption and unwind the global seqno counter. To accomplish
this, we would preempt to a special out-of-band context and wait for the
machine to report that it was idle. Given an idle machine, we could very
precisely see which requests had completed and which we needed to feed
back into the run queue.
However, now that we have scrapped the global seqno, we no longer need
to precisely unwind the global counter and only track requests by their
per-context seqno. This allows us to loosely unwind inflight requests
while scheduling a preemption, with the enormous caveat that the
requests we put back on the run queue are still _inflight_ (until the
preemption request is complete). This makes request tracking much more
messy, as at any point then we can see a completed request that we
believe is not currently scheduled for execution. We also have to be
careful not to rewind RING_TAIL past RING_HEAD on preempting to the
running context, and for this we use a semaphore to prevent completion
of the request before continuing.
To accomplish this feat, we change how we track requests scheduled to
the HW. Instead of appending our requests onto a single list as we
submit, we track each submission to ELSP as its own block. Then upon
receiving the CS preemption event, we promote the pending block to the
inflight block (discarding what was previously being tracked). As normal
CS completion events arrive, we then remove stale entries from the
inflight tracker.
v2: Be a tinge paranoid and ensure we flush the write into the HWS page
for the GPU semaphore to pick in a timely fashion.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190620142052.19311-1-chris@chris-wilson.co.uk
2019-06-20 14:20:51 +00:00
|
|
|
|
|
|
|
for (port = execlists->pending; (rq = *port); port++) {
|
|
|
|
/* Exclude any contexts already counted in active */
|
2019-08-16 17:16:08 +00:00
|
|
|
if (!intel_context_inflight_count(rq->hw_context))
|
drm/i915/execlists: Preempt-to-busy
When using a global seqno, we required a precise stop-the-workd event to
handle preemption and unwind the global seqno counter. To accomplish
this, we would preempt to a special out-of-band context and wait for the
machine to report that it was idle. Given an idle machine, we could very
precisely see which requests had completed and which we needed to feed
back into the run queue.
However, now that we have scrapped the global seqno, we no longer need
to precisely unwind the global counter and only track requests by their
per-context seqno. This allows us to loosely unwind inflight requests
while scheduling a preemption, with the enormous caveat that the
requests we put back on the run queue are still _inflight_ (until the
preemption request is complete). This makes request tracking much more
messy, as at any point then we can see a completed request that we
believe is not currently scheduled for execution. We also have to be
careful not to rewind RING_TAIL past RING_HEAD on preempting to the
running context, and for this we use a semaphore to prevent completion
of the request before continuing.
To accomplish this feat, we change how we track requests scheduled to
the HW. Instead of appending our requests onto a single list as we
submit, we track each submission to ELSP as its own block. Then upon
receiving the CS preemption event, we promote the pending block to the
inflight block (discarding what was previously being tracked). As normal
CS completion events arrive, we then remove stale entries from the
inflight tracker.
v2: Be a tinge paranoid and ensure we flush the write into the HWS page
for the GPU semaphore to pick in a timely fashion.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190620142052.19311-1-chris@chris-wilson.co.uk
2019-06-20 14:20:51 +00:00
|
|
|
engine->stats.active++;
|
2018-01-11 07:30:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (engine->stats.active)
|
|
|
|
engine->stats.start = engine->stats.enabled_at;
|
|
|
|
}
|
2017-11-21 18:18:48 +00:00
|
|
|
|
2018-01-15 09:20:41 +00:00
|
|
|
unlock:
|
drm/i915/execlists: Direct submission of new requests (avoid tasklet/ksoftirqd)
Back in commit 27af5eea54d1 ("drm/i915: Move execlists irq handler to a
bottom half"), we came to the conclusion that running our CSB processing
and ELSP submission from inside the irq handler was a bad idea. A really
bad idea as we could impose nearly 1s latency on other users of the
system, on average! Deferring our work to a tasklet allowed us to do the
processing with irqs enabled, reducing the impact to an average of about
50us.
We have since eradicated the use of forcewaked mmio from inside the CSB
processing and ELSP submission, bringing the impact down to around 5us
(on Kabylake); an order of magnitude better than our measurements 2
years ago on Broadwell and only about 2x worse on average than the
gem_syslatency on an unladen system.
In this iteration of the tasklet-vs-direct submission debate, we seek a
compromise where by we submit new requests immediately to the HW but
defer processing the CS interrupt onto a tasklet. We gain the advantage
of low-latency and ksoftirqd avoidance when waking up the HW, while
avoiding the system-wide starvation of our CS irq-storms.
Comparing the impact on the maximum latency observed (that is the time
stolen from an RT process) over a 120s interval, repeated several times
(using gem_syslatency, similar to RT's cyclictest) while the system is
fully laden with i915 nops, we see that direct submission an actually
improve the worse case.
Maximum latency in microseconds of a third party RT thread
(gem_syslatency -t 120 -f 2)
x Always using tasklets (a couple of >1000us outliers removed)
+ Only using tasklets from CS irq, direct submission of requests
+------------------------------------------------------------------------+
| + |
| + |
| + |
| + + |
| + + + |
| + + + + x x x |
| +++ + + + x x x x x x |
| +++ + ++ + + *x x x x x x |
| +++ + ++ + * *x x * x x x |
| + +++ + ++ * * +*xxx * x x xx |
| * +++ + ++++* *x+**xx+ * x x xxxx x |
| **x++++*++**+*x*x****x+ * +x xx xxxx x x |
|x* ******+***************++*+***xxxxxx* xx*x xxx + x+|
| |__________MA___________| |
| |______M__A________| |
+------------------------------------------------------------------------+
N Min Max Median Avg Stddev
x 118 91 186 124 125.28814 16.279137
+ 120 92 187 109 112.00833 13.458617
Difference at 95.0% confidence
-13.2798 +/- 3.79219
-10.5994% +/- 3.02677%
(Student's t, pooled s = 14.9237)
However the mean latency is adversely affected:
Mean latency in microseconds of a third party RT thread
(gem_syslatency -t 120 -f 1)
x Always using tasklets
+ Only using tasklets from CS irq, direct submission of requests
+------------------------------------------------------------------------+
| xxxxxx + ++ |
| xxxxxx + ++ |
| xxxxxx + +++ ++ |
| xxxxxxx +++++ ++ |
| xxxxxxx +++++ ++ |
| xxxxxxx +++++ +++ |
| xxxxxxx + ++++++++++ |
| xxxxxxxx ++ ++++++++++ |
| xxxxxxxx ++ ++++++++++ |
| xxxxxxxxxx +++++++++++++++ |
| xxxxxxxxxxx x +++++++++++++++ |
|x xxxxxxxxxxxxx x + + ++++++++++++++++++ +|
| |__A__| |
| |____A___| |
+------------------------------------------------------------------------+
N Min Max Median Avg Stddev
x 120 3.506 3.727 3.631 3.6321417 0.02773109
+ 120 3.834 4.149 4.039 4.0375167 0.041221676
Difference at 95.0% confidence
0.405375 +/- 0.00888913
11.1608% +/- 0.244735%
(Student's t, pooled s = 0.03513)
However, since the mean latency corresponds to the amount of irqsoff
processing we have to do for a CS interrupt, we only need to speed that
up to benefit not just system latency but our own throughput.
v2: Remember to defer submissions when under reset.
v4: Only use direct submission for new requests
v5: Be aware that with mixing direct tasklet evaluation and deferred
tasklets, we may end up idling before running the deferred tasklet.
v6: Remove the redudant likely() from tasklet_is_enabled(), restrict the
annotation to reset_in_progress().
v7: Take the full timeline.lock when enabling perf_pmu stats as the
tasklet is no longer a valid guard. A consequence is that the stats are
now only valid for engines also using the timeline.lock to process
state.
Testcase: igt/gem_exec_latency/*rthog*
References: 27af5eea54d1 ("drm/i915: Move execlists irq handler to a bottom half")
Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180628201211.13837-9-chris@chris-wilson.co.uk
2018-06-28 20:12:11 +00:00
|
|
|
write_sequnlock(&engine->stats.lock);
|
2019-06-14 16:46:06 +00:00
|
|
|
spin_unlock_irqrestore(&engine->active.lock, flags);
|
2017-11-21 18:18:48 +00:00
|
|
|
|
2018-01-15 09:20:41 +00:00
|
|
|
return err;
|
2017-11-21 18:18:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
ktime_t total = engine->stats.total;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the engine is executing something at the moment
|
|
|
|
* add it to the total.
|
|
|
|
*/
|
|
|
|
if (engine->stats.active)
|
|
|
|
total = ktime_add(total,
|
|
|
|
ktime_sub(ktime_get(), engine->stats.start));
|
|
|
|
|
|
|
|
return total;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* intel_engine_get_busy_time() - Return current accumulated engine busyness
|
|
|
|
* @engine: engine to report on
|
|
|
|
*
|
|
|
|
* Returns accumulated time @engine was busy since engine stats were enabled.
|
|
|
|
*/
|
|
|
|
ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine)
|
|
|
|
{
|
2018-04-26 07:47:16 +00:00
|
|
|
unsigned int seq;
|
2017-11-21 18:18:48 +00:00
|
|
|
ktime_t total;
|
|
|
|
|
2018-04-26 07:47:16 +00:00
|
|
|
do {
|
|
|
|
seq = read_seqbegin(&engine->stats.lock);
|
|
|
|
total = __intel_engine_get_busy_time(engine);
|
|
|
|
} while (read_seqretry(&engine->stats.lock, seq));
|
2017-11-21 18:18:48 +00:00
|
|
|
|
|
|
|
return total;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* intel_disable_engine_stats() - Disable engine busy tracking on engine
|
|
|
|
* @engine: engine to disable stats collection
|
|
|
|
*
|
|
|
|
* Stops collecting the engine busyness data for @engine.
|
|
|
|
*/
|
|
|
|
void intel_disable_engine_stats(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
2017-11-29 10:28:05 +00:00
|
|
|
if (!intel_engine_supports_stats(engine))
|
2017-11-21 18:18:48 +00:00
|
|
|
return;
|
|
|
|
|
2018-04-26 07:47:16 +00:00
|
|
|
write_seqlock_irqsave(&engine->stats.lock, flags);
|
2017-11-21 18:18:48 +00:00
|
|
|
WARN_ON_ONCE(engine->stats.enabled == 0);
|
|
|
|
if (--engine->stats.enabled == 0) {
|
|
|
|
engine->stats.total = __intel_engine_get_busy_time(engine);
|
|
|
|
engine->stats.active = 0;
|
|
|
|
}
|
2018-04-26 07:47:16 +00:00
|
|
|
write_sequnlock_irqrestore(&engine->stats.lock, flags);
|
2017-11-21 18:18:48 +00:00
|
|
|
}
|
|
|
|
|
2019-03-05 18:03:32 +00:00
|
|
|
static bool match_ring(struct i915_request *rq)
|
|
|
|
{
|
2019-03-25 21:49:40 +00:00
|
|
|
u32 ring = ENGINE_READ(rq->engine, RING_START);
|
2019-03-05 18:03:32 +00:00
|
|
|
|
|
|
|
return ring == i915_ggtt_offset(rq->ring->vma);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct i915_request *
|
|
|
|
intel_engine_find_active_request(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
struct i915_request *request, *active = NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We are called by the error capture, reset and to dump engine
|
|
|
|
* state at random points in time. In particular, note that neither is
|
|
|
|
* crucially ordered with an interrupt. After a hang, the GPU is dead
|
|
|
|
* and we assume that no more writes can happen (we waited long enough
|
|
|
|
* for all writes that were in transaction to be flushed) - adding an
|
|
|
|
* extra delay for a recent interrupt is pointless. Hence, we do
|
|
|
|
* not need an engine->irq_seqno_barrier() before the seqno reads.
|
|
|
|
* At all other times, we must assume the GPU is still running, but
|
|
|
|
* we only care about the snapshot of this moment.
|
|
|
|
*/
|
2019-07-15 08:09:28 +00:00
|
|
|
lockdep_assert_held(&engine->active.lock);
|
2019-06-14 16:46:06 +00:00
|
|
|
list_for_each_entry(request, &engine->active.requests, sched.link) {
|
2019-03-05 18:03:32 +00:00
|
|
|
if (i915_request_completed(request))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (!i915_request_started(request))
|
2019-06-14 16:46:06 +00:00
|
|
|
continue;
|
2019-03-05 18:03:32 +00:00
|
|
|
|
|
|
|
/* More than one preemptible request may match! */
|
|
|
|
if (!match_ring(request))
|
2019-06-14 16:46:06 +00:00
|
|
|
continue;
|
2019-03-05 18:03:32 +00:00
|
|
|
|
|
|
|
active = request;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return active;
|
|
|
|
}
|
|
|
|
|
2017-02-13 17:15:14 +00:00
|
|
|
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
|
2019-08-08 11:06:11 +00:00
|
|
|
#include "mock_engine.c"
|
2019-08-08 20:27:58 +00:00
|
|
|
#include "selftest_engine.c"
|
2019-04-24 17:48:39 +00:00
|
|
|
#include "selftest_engine_cs.c"
|
2017-02-13 17:15:14 +00:00
|
|
|
#endif
|