linux/drivers/gpu/drm/i915/i915_gpu_error.h

/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright <EFBFBD> 2008-2018 Intel Corporation
 */

#ifndef _I915_GPU_ERROR_H_
#define _I915_GPU_ERROR_H_

#include <linux/atomic.h>
#include <linux/kref.h>
#include <linux/ktime.h>
#include <linux/sched.h>

#include <drm/drm_mm.h>

#include "gt/intel_engine.h"
#include "gt/uc/intel_uc_fw.h"

#include "intel_device_info.h"

#include "i915_gem.h"
#include "i915_gem_gtt.h"
#include "i915_params.h"
#include "i915_scheduler.h"

struct drm_i915_private;
struct i915_vma_compress;
struct intel_engine_capture_vma;
struct intel_overlay_error_state;
struct intel_display_error_state;

struct i915_vma_coredump {
	struct i915_vma_coredump *next;

	char name[20];

	u64 gtt_offset;
	u64 gtt_size;
	u32 gtt_page_sizes;

	int num_pages;
	int page_count;
	int unused;
	u32 *pages[];
};

struct i915_request_coredump {
	unsigned long flags;
	pid_t pid;
	u32 context;
	u32 seqno;
	u32 head;
	u32 tail;
	struct i915_sched_attr sched_attr;
};

struct intel_engine_coredump {
	const struct intel_engine_cs *engine;

	bool simulated;
	u32 reset_count;

	/* position of active request inside the ring */
	u32 rq_head, rq_post, rq_tail;

	/* Register state */
	u32 ccid;
	u32 start;
	u32 tail;
	u32 head;
	u32 ctl;
	u32 mode;
	u32 hws;
	u32 ipeir;
	u32 ipehr;
	u32 esr;
	u32 bbstate;
	u32 instpm;
	u32 instps;
	u64 bbaddr;
	u64 acthd;
	u32 fault_reg;
	u64 faddr;
	u32 rc_psmi; /* sleep state */
	struct intel_instdone instdone;

	struct i915_gem_context_coredump {
		char comm[TASK_COMM_LEN];

		u64 total_runtime;
		u32 avg_runtime;

		pid_t pid;
		int active;
		int guilty;
		struct i915_sched_attr sched_attr;
	} context;

	struct i915_vma_coredump *vma;

	struct i915_request_coredump execlist[EXECLIST_MAX_PORTS];
	unsigned int num_ports;

	struct {
		u32 gfx_mode;
		union {
			u64 pdp[4];
			u32 pp_dir_base;
		};
	} vm_info;

	struct intel_engine_coredump *next;
};

struct intel_gt_coredump {
	const struct intel_gt *_gt;
	bool awake;
	bool simulated;

	/* Generic register state */
	u32 eir;
	u32 pgtbl_er;
	u32 ier;
	u32 gtier[6], ngtier;
	u32 derrmr;
	u32 forcewake;
	u32 error; /* gen6+ */
	u32 err_int; /* gen7 */
	u32 fault_data0; /* gen8, gen9 */
	u32 fault_data1; /* gen8, gen9 */
	u32 done_reg;
	u32 gac_eco;
	u32 gam_ecochk;
	u32 gab_ctl;
	u32 gfx_mode;
	u32 gtt_cache;
	u32 aux_err; /* gen12 */
	u32 sfc_done[GEN12_SFC_DONE_MAX]; /* gen12 */
	u32 gam_done; /* gen12 */

	u32 nfence;
	u64 fence[I915_MAX_NUM_FENCES];

	struct intel_engine_coredump *engine;

	struct intel_uc_coredump {
		struct intel_uc_fw guc_fw;
		struct intel_uc_fw huc_fw;
		struct i915_vma_coredump *guc_log;
	} *uc;

	struct intel_gt_coredump *next;
};

struct i915_gpu_coredump {
	struct kref ref;
	ktime_t time;
	ktime_t boottime;
	ktime_t uptime;
	unsigned long capture;

	struct drm_i915_private *i915;

	struct intel_gt_coredump *gt;

	char error_msg[128];
	bool simulated;
	bool wakelock;
	bool suspended;
	int iommu;
	u32 reset_count;
	u32 suspend_count;

	struct intel_device_info device_info;
	struct intel_runtime_info runtime_info;
	struct intel_driver_caps driver_caps;
	struct i915_params params;

	struct intel_overlay_error_state *overlay;
	struct intel_display_error_state *display;

	struct scatterlist *sgl, *fit;
};

struct i915_gpu_error {
	/* For reset and error_state handling. */
	spinlock_t lock;
	/* Protected by the above dev->gpu_error.lock. */
	struct i915_gpu_coredump *first_error;

	atomic_t pending_fb_pin;

	/** Number of times the device has been reset (global) */
	atomic_t reset_count;

	/** Number of times an engine has been reset */
	atomic_t reset_engine_count[I915_NUM_ENGINES];
};

struct drm_i915_error_state_buf {
	struct drm_i915_private *i915;
	struct scatterlist *sgl, *cur, *end;

	char *buf;
	size_t bytes;
	size_t size;
	loff_t iter;

	int err;
};

#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)

__printf(2, 3)
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);

struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915);
void i915_capture_error_state(struct drm_i915_private *i915);

struct i915_gpu_coredump *
i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp);

struct intel_gt_coredump *
intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp);

struct intel_engine_coredump *
intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp);

struct intel_engine_capture_vma *
intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
				  struct i915_request *rq,
				  gfp_t gfp);

void intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
				   struct intel_engine_capture_vma *capture,
				   struct i915_vma_compress *compress);

struct i915_vma_compress *
i915_vma_capture_prepare(struct intel_gt_coredump *gt);

void i915_vma_capture_finish(struct intel_gt_coredump *gt,
			     struct i915_vma_compress *compress);

void i915_error_state_store(struct i915_gpu_coredump *error);

static inline struct i915_gpu_coredump *
i915_gpu_coredump_get(struct i915_gpu_coredump *gpu)
{
	kref_get(&gpu->ref);
	return gpu;
}

ssize_t
i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error,
				 char *buf, loff_t offset, size_t count);

void __i915_gpu_coredump_free(struct kref *kref);
static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
{
	if (gpu)
		kref_put(&gpu->ref, __i915_gpu_coredump_free);
}

struct i915_gpu_coredump *i915_first_error_state(struct drm_i915_private *i915);
void i915_reset_error_state(struct drm_i915_private *i915);
void i915_disable_error_state(struct drm_i915_private *i915, int err);

#else

static inline void i915_capture_error_state(struct drm_i915_private *i915)
{
}

static inline struct i915_gpu_coredump *
i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp)
{
	return NULL;
}

static inline struct intel_gt_coredump *
intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp)
{
	return NULL;
}

static inline struct intel_engine_coredump *
intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp)
{
	return NULL;
}

static inline struct intel_engine_capture_vma *
intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
				  struct i915_request *rq,
				  gfp_t gfp)
{
	return NULL;
}

static inline void
intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
			      struct intel_engine_capture_vma *capture,
			      struct i915_vma_compress *compress)
{
}

static inline struct i915_vma_compress *
i915_vma_capture_prepare(struct intel_gt_coredump *gt)
{
	return NULL;
}

static inline void
i915_vma_capture_finish(struct intel_gt_coredump *gt,
			struct i915_vma_compress *compress)
{
}

static inline void
i915_error_state_store(struct i915_gpu_coredump *error)
{
}

static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
{
}

static inline struct i915_gpu_coredump *
i915_first_error_state(struct drm_i915_private *i915)
{
	return ERR_PTR(-ENODEV);
}

static inline void i915_reset_error_state(struct drm_i915_private *i915)
{
}

static inline void i915_disable_error_state(struct drm_i915_private *i915,
					    int err)
{
}

#endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */

#endif /* _I915_GPU_ERROR_H_ */
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								/*
 								 * SPDX-License-Identifier: MIT
 								 *
 								 * Copyright <EFBFBD> 2008-2018 Intel Corporation
 								 */
 								#ifndef _I915_GPU_ERROR_H_
 								#define _I915_GPU_ERROR_H_
-												drm/i915/gt: Use intel_gt as the primary object for handling resets

Having taken the first step in encapsulating the functionality by moving
the related files under gt/, the next step is to start encapsulating by
passing around the relevant structs rather than the global
drm_i915_private. In this step, we pass intel_gt to intel_reset.c

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190712192953.9187-1-chris@chris-wilson.co.uk

											
										
										
											2019-07-12 19:29:53 +00:00
+								#include <linux/atomic.h>
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								#include <linux/kref.h>
 								#include <linux/ktime.h>
 								#include <linux/sched.h>
 								#include <drm/drm_mm.h>
-												drm/i915: Move GraphicsTechnology files under gt/

Start partitioning off the code that talks to the hardware (GT) from the
uapi layers and move the device facing code under gt/

One casualty is s/intel_ringbuffer.h/intel_engine.h/ with the plan to
subdivide that header and body further (and split out the submission
code from the ringbuffer and logical context handling). This patch aims
to be simple motion so git can fixup inflight patches with little mess.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190424174839.7141-1-chris@chris-wilson.co.uk

											
										
										
											2019-04-24 17:48:39 +00:00
+								#include "gt/intel_engine.h"
-												drm/i915/uc: move GuC and HuC files under gt/uc/

Both microcontrollers are part of the GT HW and are closely related to
GT operations. To keep all the files cleanly together, they've been
placed in their own subdir inside the gt/ folder

Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Acked-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20190713100016.8026-6-chris@chris-wilson.co.uk
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

											
										
										
											2019-07-13 10:00:11 +00:00
+								#include "gt/uc/intel_uc_fw.h"
-												drm/i915: Move GraphicsTechnology files under gt/

Start partitioning off the code that talks to the hardware (GT) from the
uapi layers and move the device facing code under gt/

One casualty is s/intel_ringbuffer.h/intel_engine.h/ with the plan to
subdivide that header and body further (and split out the submission
code from the ringbuffer and logical context handling). This patch aims
to be simple motion so git can fixup inflight patches with little mess.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190424174839.7141-1-chris@chris-wilson.co.uk

											
										
										
											2019-04-24 17:48:39 +00:00
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								#include "intel_device_info.h"
 								#include "i915_gem.h"
 								#include "i915_gem_gtt.h"
 								#include "i915_params.h"
-												drm/i915: Pack params to engine->schedule() into a struct

Today we only want to pass along the priority to engine->schedule(), but
in the future we want to have much more control over the various aspects
of the GPU during a context's execution, for example controlling the
frequency allowed. As we need an ever growing number of parameters for
scheduling, move those into a struct for convenience.

v2: Move the anonymous struct into its own function for legibility and
ye olde gcc.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180418184052.7129-3-chris@chris-wilson.co.uk

											
										
										
											2018-04-18 18:40:52 +00:00
+								#include "i915_scheduler.h"
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
 								struct drm_i915_private;
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+								struct i915_vma_compress;
 								struct intel_engine_capture_vma;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								struct intel_overlay_error_state;
 								struct intel_display_error_state;
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+								struct i915_vma_coredump {
 									struct i915_vma_coredump *next;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+									char name[20];
 									u64 gtt_offset;
 									u64 gtt_size;
 									u32 gtt_page_sizes;
 									int num_pages;
 									int page_count;
 									int unused;
-												drm/i915: Replace zero-length array with flexible-array

The current codebase makes use of the zero-length array language
extension to the C90 standard, but the preferred mechanism to declare
variable-length types such as these ones is a flexible array member[1][2],
introduced in C99:

struct foo {
        int stuff;
        struct boo array[];
};

By making use of the mechanism above, we will get a compiler warning
in case the flexible array does not occur last in the structure, which
will help us prevent some kind of undefined behavior bugs from being
inadvertently introduced[3] to the codebase from now on.

Also, notice that, dynamic memory allocations won't be affected by
this change:

"Flexible array members have incomplete type, and so the sizeof operator
may not be applied. As a quirk of the original implementation of
zero-length arrays, sizeof evaluates to zero."[1]

sizeof(flexible-array-member) triggers a warning because flexible array
members have incomplete type[1]. There are some instances of code in
which the sizeof operator is being incorrectly/erroneously applied to
zero-length arrays and the result is zero. Such instances may be hiding
some bugs. So, this work (flexible-array member conversions) will also
help to get completely rid of those sorts of issues.

This issue was found with the help of Coccinelle.

[1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html
[2] https://github.com/KSPP/linux/issues/21
[3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour")

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20200507185408.GA14561@embeddedor

											
										
										
											2020-05-07 18:54:08 +00:00
+									u32 *pages[];
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+								};
 								struct i915_request_coredump {
 									unsigned long flags;
 									pid_t pid;
 									u32 context;
 									u32 seqno;
 									u32 head;
 									u32 tail;
 									struct i915_sched_attr sched_attr;
 								};
 								struct intel_engine_coredump {
 									const struct intel_engine_cs *engine;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
 									bool simulated;
 									u32 reset_count;
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+									/* position of active request inside the ring */
 									u32 rq_head, rq_post, rq_tail;
 									/* Register state */
 									u32 ccid;
 									u32 start;
 									u32 tail;
 									u32 head;
 									u32 ctl;
 									u32 mode;
 									u32 hws;
 									u32 ipeir;
 									u32 ipehr;
-												drm/i915/gt: Hook up CS_MASTER_ERROR_INTERRUPT

Now that we have offline error capture and can reset an engine from
inside an atomic context while also preserving the GPU state for
post-mortem analysis, it is time to handle error interrupts thrown by
the command parser.

This provides a much, much faster mechanism for us to detect known
problems than using heartbeats/hangchecks, and also provides a mechanism
for when those are disabled. However, it is limited to problems the HW
can detect in the CS and so not a complete solution for detecting lockups.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200128204318.4182039-2-chris@chris-wilson.co.uk

											
										
										
											2020-01-28 20:43:15 +00:00
+									u32 esr;
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+									u32 bbstate;
 									u32 instpm;
 									u32 instps;
 									u64 bbaddr;
 									u64 acthd;
 									u32 fault_reg;
 									u64 faddr;
 									u32 rc_psmi; /* sleep state */
 									struct intel_instdone instdone;
 									struct i915_gem_context_coredump {
 										char comm[TASK_COMM_LEN];
-												drm/i915: Track hw reported context runtime

GPU saves accumulated context runtime (in CS timestamp units) in PPHWSP
which will be useful for us in cases when we are not able to track context
busyness ourselves (like with GuC). Keep a copy of this in struct
intel_context from where it can be easily read even if the context is not
pinned.

v2:
 (Chris)
 * Do not store pphwsp address in intel_context.
 * Log CS wrap-around.
 * Simplify calculation by relying on integer wraparound.
v3:
 * Include total/avg in traces and error state for debugging

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20200216133620.394962-1-chris@chris-wilson.co.uk

											
										
										
											2020-02-16 13:36:20 +00:00
 										u64 total_runtime;
 										u32 avg_runtime;
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+										pid_t pid;
 										int active;
 										int guilty;
 										struct i915_sched_attr sched_attr;
 									} context;
 									struct i915_vma_coredump *vma;
-												drm/i915: Drop request list from error state

The list of requests from after the hang tells little about the hang
itself, only how busy userspace was after the fact. As it pertains
nothing to the HW state, drop it from the error state.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-4-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:59 +00:00
+									struct i915_request_coredump execlist[EXECLIST_MAX_PORTS];
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+									unsigned int num_ports;
 									struct {
 										u32 gfx_mode;
 										union {
 											u64 pdp[4];
 											u32 pp_dir_base;
 										};
 									} vm_info;
 									struct intel_engine_coredump *next;
 								};
 								struct intel_gt_coredump {
 									const struct intel_gt *_gt;
 									bool awake;
 									bool simulated;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
 									/* Generic register state */
 									u32 eir;
 									u32 pgtbl_er;
 									u32 ier;
-												drm/i915/icl: Read the correct Gen11 interrupt registers

Stop reading some now deprecated interrupt registers in both
debugfs and error state. Instead, read the new equivalents in the
Gen11 interrupt repartitioning scheme.

Note that the equivalent to the PM ISR & IIR cannot be read without
affecting the current state of the system, so I've opted for leaving
them out. See gen11_reset_one_iir() for more info.

v2: else if !!! (Paulo)
v3: another else if (Vinay)
v4:
  - Rebased
  - Renamed patch
  - Improved the ordering of GENs
  - Improved the printing of per-GEN info
v5: Avoid maybe-unitialized & add comment explaining the lack
    of PM ISR & IIR

Suggested-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
Signed-off-by: Oscar Mateo <oscar.mateo@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Sagar Arun Kamble <sagar.a.kamble@intel.com>
Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
Reviewed-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
[Paulo: fix commit message and coding style.]
Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/1525989595-18220-1-git-send-email-oscar.mateo@intel.com

											
										
										
											2018-05-10 21:59:55 +00:00
+									u32 gtier[6], ngtier;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+									u32 derrmr;
 									u32 forcewake;
 									u32 error; /* gen6+ */
 									u32 err_int; /* gen7 */
 									u32 fault_data0; /* gen8, gen9 */
 									u32 fault_data1; /* gen8, gen9 */
 									u32 done_reg;
 									u32 gac_eco;
 									u32 gam_ecochk;
 									u32 gab_ctl;
 									u32 gfx_mode;
-												drm/i915: include GTT page-size info in error state

It might prove useful in the future to know if the vma is utilising
huge-GTT-pages. Related to this is the GTT cache, where there is some HW
"quirkiness" where it must be disabled if using 2M pages, so include
that for good measure.

Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Matthew Auld <matthew.auld@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20190909171646.22090-1-matthew.auld@intel.com

											
										
										
											2019-09-09 17:16:46 +00:00
+									u32 gtt_cache;
-												drm/i915: capture aux page table error register

TGL introduced a feature in which we map the main surface to the
auxiliary surface. If we screw up the page tables, the HW has a
register to tell us which engine encounters a fault in the page table
walk.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Acked-by: Chris Wilson <chris@chris-wilson.co.uk>
[ickle: Be brave and apply to gen12]
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20191025121718.18806-1-lionel.g.landwerlin@intel.com

											
										
										
											2019-10-25 12:17:18 +00:00
+									u32 aux_err; /* gen12 */
-												drm/i915/tgl: Add SFC instdone to error state

On debugging media workload hangs, sfc instdone
might prove useful in future. Be prepared.

Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20191029163841.5224-1-mika.kuoppala@linux.intel.com

											
										
										
											2019-10-29 16:38:40 +00:00
+									u32 sfc_done[GEN12_SFC_DONE_MAX]; /* gen12 */
-												drm/i915/tgl: Add gam instdone

This has been asked from us already. Prepare for the next
time.

Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20191029163841.5224-2-mika.kuoppala@linux.intel.com

											
										
										
											2019-10-29 16:38:41 +00:00
+									u32 gam_done; /* gen12 */
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
 									u32 nfence;
 									u64 fence[I915_MAX_NUM_FENCES];
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
 									struct intel_engine_coredump *engine;
 									struct intel_uc_coredump {
 										struct intel_uc_fw guc_fw;
 										struct intel_uc_fw huc_fw;
 										struct i915_vma_coredump *guc_log;
 									} *uc;
 									struct intel_gt_coredump *next;
 								};
 								struct i915_gpu_coredump {
 									struct kref ref;
 									ktime_t time;
 									ktime_t boottime;
 									ktime_t uptime;
 									unsigned long capture;
 									struct drm_i915_private *i915;
 									struct intel_gt_coredump *gt;
 									char error_msg[128];
 									bool simulated;
 									bool wakelock;
 									bool suspended;
 									int iommu;
 									u32 reset_count;
 									u32 suspend_count;
 									struct intel_device_info device_info;
 									struct intel_runtime_info runtime_info;
 									struct intel_driver_caps driver_caps;
 									struct i915_params params;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+									struct intel_overlay_error_state *overlay;
 									struct intel_display_error_state *display;
-												drm/i915: Cache the error string

Currently, we convert the error state into a string every time we read
from sysfs (and sysfs reads in page size (4KiB) chunks). We do try to
window the string and only capture the portion that is being read, but
that means that we must always convert up to the window to find the
start. For a very large error state bordering on EXEC_OBJECT_CAPTURE
abuse, this is noticeable as it degrades to O(N^2)!

As we do not have a convenient hook for sysfs open(), and we would like
to keep the lazy conversion into a string, do the conversion of the
whole string on the first read and keep the string until the error state
is freed.

v2: Don't double advance simple_read_from_buffer
v3: Due to extreme pain of lack of vrealloc, use a scatterlist
v4: Keep the forward iterator loosely cached
v5: Stylistic improvements to reduce patch size

Reported-by: Jason Ekstrand <jason@jlekstrand.net>
Testcase: igt/gem_exec_capture/many*
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181123132325.26541-1-chris@chris-wilson.co.uk

											
										
										
											2018-11-23 13:23:25 +00:00
+									struct scatterlist *sgl, *fit;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								};
 								struct i915_gpu_error {
 									/* For reset and error_state handling. */
 									spinlock_t lock;
 									/* Protected by the above dev->gpu_error.lock. */
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+									struct i915_gpu_coredump *first_error;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
 									atomic_t pending_fb_pin;
-												drm/i915: Revoke mmaps and prevent access to fence registers across reset

Previously, we were able to rely on the recursive properties of
struct_mutex to allow us to serialise revoking mmaps and reacquiring the
FENCE registers with them being clobbered over a global device reset.
I then proceeded to throw out the baby with the bath water in order to
pursue a struct_mutex-less reset.

Perusing LWN for alternative strategies, the dilemma on how to serialise
access to a global resource on one side was answered by
https://lwn.net/Articles/202847/ -- Sleepable RCU:

    1  int readside(void) {
    2      int idx;
    3      rcu_read_lock();
    4	   if (nomoresrcu) {
    5          rcu_read_unlock();
    6	       return -EINVAL;
    7      }
    8	   idx = srcu_read_lock(&ss);
    9	   rcu_read_unlock();
    10	   /* SRCU read-side critical section. */
    11	   srcu_read_unlock(&ss, idx);
    12	   return 0;
    13 }
    14
    15 void cleanup(void)
    16 {
    17     nomoresrcu = 1;
    18     synchronize_rcu();
    19     synchronize_srcu(&ss);
    20     cleanup_srcu_struct(&ss);
    21 }

No more worrying about stop_machine, just an uber-complex mutex,
optimised for reads, with the overhead pushed to the rare reset path.

However, we do run the risk of a deadlock as we allocate underneath the
SRCU read lock, and the allocation may require a GPU reset, causing a
dependency cycle via the in-flight requests. We resolve that by declaring
the driver wedged and cancelling all in-flight rendering.

v2: Use expedited rcu barriers to match our earlier timing
characteristics.
v3: Try to annotate locking contexts for sparse
v4: Reduce selftest lock duration to avoid a reset deadlock with fences
v5: s/srcu/reset_backoff_srcu/
v6: Remove more stale comments

Testcase: igt/gem_mmap_gtt/hang
Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190208153708.20023-2-chris@chris-wilson.co.uk

											
										
										
											2019-02-08 15:37:03 +00:00
+									/** Number of times the device has been reset (global) */
-												drm/i915/gt: Use intel_gt as the primary object for handling resets

Having taken the first step in encapsulating the functionality by moving
the related files under gt/, the next step is to start encapsulating by
passing around the relevant structs rather than the global
drm_i915_private. In this step, we pass intel_gt to intel_reset.c

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190712192953.9187-1-chris@chris-wilson.co.uk

											
										
										
											2019-07-12 19:29:53 +00:00
+									atomic_t reset_count;
-												drm/i915: Revoke mmaps and prevent access to fence registers across reset

Previously, we were able to rely on the recursive properties of
struct_mutex to allow us to serialise revoking mmaps and reacquiring the
FENCE registers with them being clobbered over a global device reset.
I then proceeded to throw out the baby with the bath water in order to
pursue a struct_mutex-less reset.

Perusing LWN for alternative strategies, the dilemma on how to serialise
access to a global resource on one side was answered by
https://lwn.net/Articles/202847/ -- Sleepable RCU:

    1  int readside(void) {
    2      int idx;
    3      rcu_read_lock();
    4	   if (nomoresrcu) {
    5          rcu_read_unlock();
    6	       return -EINVAL;
    7      }
    8	   idx = srcu_read_lock(&ss);
    9	   rcu_read_unlock();
    10	   /* SRCU read-side critical section. */
    11	   srcu_read_unlock(&ss, idx);
    12	   return 0;
    13 }
    14
    15 void cleanup(void)
    16 {
    17     nomoresrcu = 1;
    18     synchronize_rcu();
    19     synchronize_srcu(&ss);
    20     cleanup_srcu_struct(&ss);
    21 }

No more worrying about stop_machine, just an uber-complex mutex,
optimised for reads, with the overhead pushed to the rare reset path.

However, we do run the risk of a deadlock as we allocate underneath the
SRCU read lock, and the allocation may require a GPU reset, causing a
dependency cycle via the in-flight requests. We resolve that by declaring
the driver wedged and cancelling all in-flight rendering.

v2: Use expedited rcu barriers to match our earlier timing
characteristics.
v3: Try to annotate locking contexts for sparse
v4: Reduce selftest lock duration to avoid a reset deadlock with fences
v5: s/srcu/reset_backoff_srcu/
v6: Remove more stale comments

Testcase: igt/gem_mmap_gtt/hang
Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190208153708.20023-2-chris@chris-wilson.co.uk

											
										
										
											2019-02-08 15:37:03 +00:00
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+									/** Number of times an engine has been reset */
-												drm/i915/gt: Use intel_gt as the primary object for handling resets

Having taken the first step in encapsulating the functionality by moving
the related files under gt/, the next step is to start encapsulating by
passing around the relevant structs rather than the global
drm_i915_private. In this step, we pass intel_gt to intel_reset.c

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190712192953.9187-1-chris@chris-wilson.co.uk

											
										
										
											2019-07-12 19:29:53 +00:00
+									atomic_t reset_engine_count[I915_NUM_ENGINES];
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								};
 								struct drm_i915_error_state_buf {
 									struct drm_i915_private *i915;
-												drm/i915: Cache the error string

Currently, we convert the error state into a string every time we read
from sysfs (and sysfs reads in page size (4KiB) chunks). We do try to
window the string and only capture the portion that is being read, but
that means that we must always convert up to the window to find the
start. For a very large error state bordering on EXEC_OBJECT_CAPTURE
abuse, this is noticeable as it degrades to O(N^2)!

As we do not have a convenient hook for sysfs open(), and we would like
to keep the lazy conversion into a string, do the conversion of the
whole string on the first read and keep the string until the error state
is freed.

v2: Don't double advance simple_read_from_buffer
v3: Due to extreme pain of lack of vrealloc, use a scatterlist
v4: Keep the forward iterator loosely cached
v5: Stylistic improvements to reduce patch size

Reported-by: Jason Ekstrand <jason@jlekstrand.net>
Testcase: igt/gem_exec_capture/many*
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181123132325.26541-1-chris@chris-wilson.co.uk

											
										
										
											2018-11-23 13:23:25 +00:00
+									struct scatterlist *sgl, *cur, *end;
 									char *buf;
 									size_t bytes;
 									size_t size;
 									loff_t iter;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+									int err;
 								};
 								#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
 								__printf(2, 3)
 								void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+								struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915);
-												drm/i915: Fix too few arguments to function i915_capture_error_state

If 'CONFIG_DRM_I915_CAPTURE_ERROR' not configured, there is an error
when compile the kernel:

drivers/gpu/drm/i915/gt/intel_reset.c:
	In function intel_gt_handle_error:
drivers/gpu/drm/i915/gt/intel_reset.c:1233:3:
	error: too few arguments to function i915_capture_error_state
   i915_capture_error_state(gt->i915);
   ^~~~~~~~~~~~~~~~~~~~~~~~
In file included
         from ./drivers/gpu/drm/i915/i915_drv.h:97:0,
         from ./drivers/gpu/drm/i915/display/intel_display_types.h:46,
         from drivers/gpu/drm/i915/gt/intel_reset.c:10:
./drivers/gpu/drm/i915/i915_gpu_error.h:267:20: note: declared here
 static inline void i915_capture_error_state(struct drm_i915_private *dev_priv,

Fixes: 742379c0c400 ("drm/i915: Start chopping up the GPU error capture")
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Zhang Xiaoxu <zhangxiaoxu5@huawei.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20200113081942.15982-1-zhangxiaoxu5@huawei.com

											
										
										
											2020-01-13 08:19:42 +00:00
+								void i915_capture_error_state(struct drm_i915_private *i915);
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
 								struct i915_gpu_coredump *
 								i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp);
 								struct intel_gt_coredump *
 								intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp);
 								struct intel_engine_coredump *
 								intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp);
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+								struct intel_engine_capture_vma *
 								intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
 												  struct i915_request *rq,
 												  gfp_t gfp);
 								void intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
 												   struct intel_engine_capture_vma *capture,
 												   struct i915_vma_compress *compress);
 								struct i915_vma_compress *
 								i915_vma_capture_prepare(struct intel_gt_coredump *gt);
 								void i915_vma_capture_finish(struct intel_gt_coredump *gt,
 											     struct i915_vma_compress *compress);
 								void i915_error_state_store(struct i915_gpu_coredump *error);
 								static inline struct i915_gpu_coredump *
 								i915_gpu_coredump_get(struct i915_gpu_coredump *gpu)
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								{
 									kref_get(&gpu->ref);
 									return gpu;
 								}
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+								ssize_t
 								i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error,
 												 char *buf, loff_t offset, size_t count);
-												drm/i915: Cache the error string

Currently, we convert the error state into a string every time we read
from sysfs (and sysfs reads in page size (4KiB) chunks). We do try to
window the string and only capture the portion that is being read, but
that means that we must always convert up to the window to find the
start. For a very large error state bordering on EXEC_OBJECT_CAPTURE
abuse, this is noticeable as it degrades to O(N^2)!

As we do not have a convenient hook for sysfs open(), and we would like
to keep the lazy conversion into a string, do the conversion of the
whole string on the first read and keep the string until the error state
is freed.

v2: Don't double advance simple_read_from_buffer
v3: Due to extreme pain of lack of vrealloc, use a scatterlist
v4: Keep the forward iterator loosely cached
v5: Stylistic improvements to reduce patch size

Reported-by: Jason Ekstrand <jason@jlekstrand.net>
Testcase: igt/gem_exec_capture/many*
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181123132325.26541-1-chris@chris-wilson.co.uk

											
										
										
											2018-11-23 13:23:25 +00:00
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+								void __i915_gpu_coredump_free(struct kref *kref);
 								static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								{
 									if (gpu)
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+										kref_put(&gpu->ref, __i915_gpu_coredump_free);
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								}
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+								struct i915_gpu_coredump *i915_first_error_state(struct drm_i915_private *i915);
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								void i915_reset_error_state(struct drm_i915_private *i915);
-												drm/i915: Prevent machine hang from Broxton's vtd w/a and error capture

Since capturing the error state requires fiddling around with the GGTT
to read arbitrary buffers and is itself run under stop_machine(), it
deadlocks the machine (effectively a hard hang) when run in conjunction
with Broxton's VTd workaround to serialize GGTT access.

v2: Store the ERR_PTR in first_error so that the error can be reported
to the user via sysfs.
v3: Mention the quirk in dmesg (using info as per usual)

Fixes: 0ef34ad6222a ("drm/i915: Serialize GTT/Aperture accesses on BXT")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: John Harrison <john.C.Harrison@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181102161232.17742-5-chris@chris-wilson.co.uk

											
										
										
											2018-11-02 16:12:12 +00:00
+								void i915_disable_error_state(struct drm_i915_private *i915, int err);
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
 								#else
-												drm/i915: Fix too few arguments to function i915_capture_error_state

If 'CONFIG_DRM_I915_CAPTURE_ERROR' not configured, there is an error
when compile the kernel:

drivers/gpu/drm/i915/gt/intel_reset.c:
	In function intel_gt_handle_error:
drivers/gpu/drm/i915/gt/intel_reset.c:1233:3:
	error: too few arguments to function i915_capture_error_state
   i915_capture_error_state(gt->i915);
   ^~~~~~~~~~~~~~~~~~~~~~~~
In file included
         from ./drivers/gpu/drm/i915/i915_drv.h:97:0,
         from ./drivers/gpu/drm/i915/display/intel_display_types.h:46,
         from drivers/gpu/drm/i915/gt/intel_reset.c:10:
./drivers/gpu/drm/i915/i915_gpu_error.h:267:20: note: declared here
 static inline void i915_capture_error_state(struct drm_i915_private *dev_priv,

Fixes: 742379c0c400 ("drm/i915: Start chopping up the GPU error capture")
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Zhang Xiaoxu <zhangxiaoxu5@huawei.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20200113081942.15982-1-zhangxiaoxu5@huawei.com

											
										
										
											2020-01-13 08:19:42 +00:00
+								static inline void i915_capture_error_state(struct drm_i915_private *i915)
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								{
 								}
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+								static inline struct i915_gpu_coredump *
 								i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp)
 								{
 									return NULL;
 								}
 								static inline struct intel_gt_coredump *
 								intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp)
 								{
 									return NULL;
 								}
 								static inline struct intel_engine_coredump *
 								intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp)
 								{
 									return NULL;
 								}
 								static inline struct intel_engine_capture_vma *
 								intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
 												  struct i915_request *rq,
 												  gfp_t gfp)
 								{
 									return NULL;
 								}
 								static inline void
 								intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
 											      struct intel_engine_capture_vma *capture,
 											      struct i915_vma_compress *compress)
 								{
 								}
 								static inline struct i915_vma_compress *
-												drm/i915: Correct typo in i915_vma_compress_finish stub

A copy and paste error in setting up the !CONFIG_DRM_I915_CAPTURE_ERROR
stubs left a conflicting duplicate declaration.

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200111083007.1619228-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-11 08:30:07 +00:00
+								i915_vma_capture_prepare(struct intel_gt_coredump *gt)
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+								{
 									return NULL;
 								}
-												drm/i915: Fix multiple definition of 'i915_vma_capture_finish'

If 'CONFIG_DRM_I915_CAPTURE_ERROR' not configured, there are some
errors like:

drivers/gpu/drm/i915/i915_irq.o:
	In function `i915_vma_capture_finish':
./drivers/gpu/drm/i915/i915_gpu_error.h:312:
	multiple definition of `i915_vma_capture_finish'
drivers/gpu/drm/i915/i915_drv.o:
	./drivers/gpu/drm/i915/i915_gpu_error.h:312: first defined here

So, add 'static inline' on the defineation of the 'i915_vma_capture_finish'

Fixes: d713e3ab93fdc("drm/i915: Correct typo in i915_vma_compress_finish stub")
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Zhang Xiaoxu <zhangxiaoxu5@huawei.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20200113104009.13274-1-zhangxiaoxu5@huawei.com

											
										
										
											2020-01-13 10:40:09 +00:00
+								static inline void
 								i915_vma_capture_finish(struct intel_gt_coredump *gt,
 											struct i915_vma_compress *compress)
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+								{
 								}
 								static inline void
-												drm/i915: Fix i915_error_state_store error defination

Since commit 742379c0c4001 ("drm/i915: Start chopping up the GPU error
capture"), function 'i915_error_state_store' was defined and used with
only one parameter.

But if no 'CONFIG_DRM_I915_CAPTURE_ERROR', this function was defined
with two parameter.

This may lead compile error. This patch fix it.

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Zhang Xiaoxu <zhangxiaoxu5@huawei.com>
Reviewed-by: Andi Shyti <andi.shyti@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20200117073436.6507-1-zhangxiaoxu5@huawei.com

											
										
										
											2020-01-17 07:34:36 +00:00
+								i915_error_state_store(struct i915_gpu_coredump *error)
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+								{
 								}
-												drm/i915: Stub out i915_gpu_coredump_put

i915_gpu_coreddump_put is currently only defined if
CONFIG_DRM_I915_CAPTURE_ERROR is enabled, provide a stub otherwise.

Reported-by: Mike Lothian <mike@fireburn.co.uk>
Fixes: 742379c0c400 ("drm/i915: Start chopping up the GPU error capture")
Fixes: 748317386afb ("drm/i915/execlists: Offline error capture")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mike Lothian <mike@fireburn.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Reviewed-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200124192255.541355-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-24 19:22:55 +00:00
+								static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
 								{
 								}
-												drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new
context, from inside the softirq region of a forced preemption. To do
so requires us to break up the monolithic error capture to provide new
entry points with finer control; in particular focusing on one
engine/gt, and being able to compose an error state from little pieces
of HW capture.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Andi Shyti <andi.shyti@intel.com>
Acked-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk

											
										
										
											2020-01-10 12:30:56 +00:00
+								static inline struct i915_gpu_coredump *
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								i915_first_error_state(struct drm_i915_private *i915)
 								{
-												drm/i915: Prevent machine hang from Broxton's vtd w/a and error capture

Since capturing the error state requires fiddling around with the GGTT
to read arbitrary buffers and is itself run under stop_machine(), it
deadlocks the machine (effectively a hard hang) when run in conjunction
with Broxton's VTd workaround to serialize GGTT access.

v2: Store the ERR_PTR in first_error so that the error can be reported
to the user via sysfs.
v3: Mention the quirk in dmesg (using info as per usual)

Fixes: 0ef34ad6222a ("drm/i915: Serialize GTT/Aperture accesses on BXT")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: John Harrison <john.C.Harrison@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181102161232.17742-5-chris@chris-wilson.co.uk

											
										
										
											2018-11-02 16:12:12 +00:00
+									return ERR_PTR(-ENODEV);
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								}
 								static inline void i915_reset_error_state(struct drm_i915_private *i915)
 								{
 								}
-												drm/i915: Prevent machine hang from Broxton's vtd w/a and error capture

Since capturing the error state requires fiddling around with the GGTT
to read arbitrary buffers and is itself run under stop_machine(), it
deadlocks the machine (effectively a hard hang) when run in conjunction
with Broxton's VTd workaround to serialize GGTT access.

v2: Store the ERR_PTR in first_error so that the error can be reported
to the user via sysfs.
v3: Mention the quirk in dmesg (using info as per usual)

Fixes: 0ef34ad6222a ("drm/i915: Serialize GTT/Aperture accesses on BXT")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: John Harrison <john.C.Harrison@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181102161232.17742-5-chris@chris-wilson.co.uk

											
										
										
											2018-11-02 16:12:12 +00:00
+								static inline void i915_disable_error_state(struct drm_i915_private *i915,
 													    int err)
 								{
 								}
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								#endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */
 								#endif /* _I915_GPU_ERROR_H_ */