drm/i915/bdw: Apply workarounds in render ring init function

For BDW workarounds are currently initialized in init_clock_gating() but
they are lost during reset, suspend/resume etc; this patch moves the WAs
that are part of register state context to render ring init fn otherwise
default context ends up with incorrect values as they don't get initialized
until init_clock_gating fn.

v2: Add workarounds to golden render state
This method has its own issues, first of all this is different for
each gen and it is generated using a tool so adding new workaround
and mainitaining them across gens is not a straightforward process.

v3: Use LRIs to emit these workarounds (Ville)
Instead of modifying the golden render state the same LRIs are
emitted from within the driver.

v4: Use abstract name when exporting gen specific routines (Chris)

For: VIZ-4092
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
This commit is contained in:
Arun Siluvery 2014-08-26 14:44:50 +01:00 committed by Daniel Vetter
parent c5ad011d7d
commit 86d7f23842
4 changed files with 87 additions and 48 deletions

View File

@ -628,6 +628,12 @@ done:
ring->last_context = to;
if (uninitialized) {
if (ring->init_context) {
ret = ring->init_context(ring);
if (ret)
DRM_ERROR("ring init context: %d\n", ret);
}
ret = i915_gem_render_state_init(ring);
if (ret)
DRM_ERROR("init render state: %d\n", ret);

View File

@ -5536,37 +5536,12 @@ static void broadwell_init_clock_gating(struct drm_device *dev)
/* FIXME(BDW): Check all the w/a, some might only apply to
* pre-production hw. */
/* WaDisablePartialInstShootdown:bdw */
I915_WRITE(GEN8_ROW_CHICKEN,
_MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE));
/* WaDisableThreadStallDopClockGating:bdw */
/* FIXME: Unclear whether we really need this on production bdw. */
I915_WRITE(GEN8_ROW_CHICKEN,
_MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE));
/*
* This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
* pre-production hardware
*/
I915_WRITE(HALF_SLICE_CHICKEN3,
_MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS));
I915_WRITE(HALF_SLICE_CHICKEN3,
_MASKED_BIT_ENABLE(GEN8_SAMPLER_POWER_BYPASS_DIS));
I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_BWGTLB_DISABLE));
I915_WRITE(_3D_CHICKEN3,
_MASKED_BIT_ENABLE(_3D_CHICKEN_SDE_LIMIT_FIFO_POLY_DEPTH(2)));
I915_WRITE(COMMON_SLICE_CHICKEN2,
_MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
I915_WRITE(GEN7_HALF_SLICE_CHICKEN1,
_MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
/* WaDisableDopClockGating:bdw May not be needed for production */
I915_WRITE(GEN7_ROW_CHICKEN2,
_MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
/* WaSwitchSolVfFArbitrationPriority:bdw */
I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL);
@ -5582,31 +5557,12 @@ static void broadwell_init_clock_gating(struct drm_device *dev)
BDW_DPRS_MASK_VBLANK_SRD);
}
/* Use Force Non-Coherent whenever executing a 3D context. This is a
* workaround for for a possible hang in the unlikely event a TLB
* invalidation occurs during a PSD flush.
*/
I915_WRITE(HDC_CHICKEN0,
I915_READ(HDC_CHICKEN0) |
_MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
/* WaVSRefCountFullforceMissDisable:bdw */
/* WaDSRefCountFullforceMissDisable:bdw */
I915_WRITE(GEN7_FF_THREAD_MODE,
I915_READ(GEN7_FF_THREAD_MODE) &
~(GEN8_FF_DS_REF_CNT_FFME | GEN7_FF_VS_REF_CNT_FFME));
/*
* BSpec recommends 8x4 when MSAA is used,
* however in practice 16x4 seems fastest.
*
* Note that PS/WM thread counts depend on the WIZ hashing
* disable bit, which we don't touch here, but it's good
* to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
*/
I915_WRITE(GEN7_GT_MODE,
GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
_MASKED_BIT_ENABLE(GEN8_RC_SEMA_IDLE_MSG_DISABLE));
@ -5614,10 +5570,6 @@ static void broadwell_init_clock_gating(struct drm_device *dev)
I915_WRITE(GEN8_UCGCTL6, I915_READ(GEN8_UCGCTL6) |
GEN8_SDEUNIT_CLOCK_GATE_DISABLE);
/* Wa4x4STCOptimizationDisable:bdw */
I915_WRITE(CACHE_MODE_1,
_MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
lpt_init_clock_gating(dev);
}

View File

@ -657,6 +657,84 @@ err:
return ret;
}
static inline void intel_ring_emit_wa(struct intel_engine_cs *ring,
u32 addr, u32 value)
{
intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
intel_ring_emit(ring, addr);
intel_ring_emit(ring, value);
}
static int gen8_init_workarounds(struct intel_engine_cs *ring)
{
int ret;
/*
* workarounds applied in this fn are part of register state context,
* they need to be re-initialized followed by gpu reset, suspend/resume,
* module reload.
*/
/*
* update the number of dwords required based on the
* actual number of workarounds applied
*/
ret = intel_ring_begin(ring, 24);
if (ret)
return ret;
/* WaDisablePartialInstShootdown:bdw */
/* WaDisableThreadStallDopClockGating:bdw */
/* FIXME: Unclear whether we really need this on production bdw. */
intel_ring_emit_wa(ring, GEN8_ROW_CHICKEN,
_MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE
| STALL_DOP_GATING_DISABLE));
/* WaDisableDopClockGating:bdw May not be needed for production */
intel_ring_emit_wa(ring, GEN7_ROW_CHICKEN2,
_MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
/*
* This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
* pre-production hardware
*/
intel_ring_emit_wa(ring, HALF_SLICE_CHICKEN3,
_MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS
| GEN8_SAMPLER_POWER_BYPASS_DIS));
intel_ring_emit_wa(ring, GEN7_HALF_SLICE_CHICKEN1,
_MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
intel_ring_emit_wa(ring, COMMON_SLICE_CHICKEN2,
_MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
/* Use Force Non-Coherent whenever executing a 3D context. This is a
* workaround for for a possible hang in the unlikely event a TLB
* invalidation occurs during a PSD flush.
*/
intel_ring_emit_wa(ring, HDC_CHICKEN0,
_MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
/* Wa4x4STCOptimizationDisable:bdw */
intel_ring_emit_wa(ring, CACHE_MODE_1,
_MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
/*
* BSpec recommends 8x4 when MSAA is used,
* however in practice 16x4 seems fastest.
*
* Note that PS/WM thread counts depend on the WIZ hashing
* disable bit, which we don't touch here, but it's good
* to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
*/
intel_ring_emit_wa(ring, GEN7_GT_MODE,
GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
intel_ring_advance(ring);
return 0;
}
static int init_render_ring(struct intel_engine_cs *ring)
{
struct drm_device *dev = ring->dev;
@ -2143,6 +2221,7 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
dev_priv->semaphore_obj = obj;
}
}
ring->init_context = gen8_init_workarounds;
ring->add_request = gen6_add_request;
ring->flush = gen8_render_ring_flush;
ring->irq_get = gen8_ring_get_irq;

View File

@ -148,6 +148,8 @@ struct intel_engine_cs {
int (*init)(struct intel_engine_cs *ring);
int (*init_context)(struct intel_engine_cs *ring);
void (*write_tail)(struct intel_engine_cs *ring,
u32 value);
int __must_check (*flush)(struct intel_engine_cs *ring,