Merge tag 'drm-xe-fixes-2024-11-08' of https://gitlab.freedesktop.org/drm/xe/kernel into drm-fixes

Driver Changes:
- Fix ccs_mode setting for Xe2 and later (Balasubramani)
- Synchronize ccs_mode setting with client creation (Balasubramani)
- Apply scheduling WA for LNL in additional places as needed
  (Nirmoy)
- Fix leak and lock handling in error paths of xe_exec ioctl
  (Matthew Brost)
- Fix GGTT allocation leak leading to eventual crash in SR-IOV
  (Michal Wajdeczko)
- Move run_ticks update out of job handling to avoid synchronization
  with reader (Lucas)

Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Lucas De Marchi <lucas.demarchi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/4ffcebtluaaaohquxfyf5babpihmtscxwad3jjmt5nggwh2xpm@ztw67ucywttg
commit 1a6bbc4d9e (Dave Airlie, 2024-11-09 05:14:28 +10:00)
12 changed files with 54 additions and 41 deletions

drivers/gpu/drm/xe/regs/xe_gt_regs.h

@@ -517,7 +517,7 @@
  * [4-6] RSVD
  * [7] Disabled
  */
-#define CCS_MODE			XE_REG(0x14804)
+#define CCS_MODE			XE_REG(0x14804, XE_REG_OPTION_MASKED)
 #define CCS_MODE_CSLICE_0_3_MASK	REG_GENMASK(11, 0) /* 3 bits per cslice */
 #define CCS_MODE_CSLICE_MASK		0x7 /* CCS0-3 + rsvd */
 #define CCS_MODE_CSLICE_WIDTH		ilog2(CCS_MODE_CSLICE_MASK + 1)
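
Why the mask matters: on Xe2+, CCS_MODE is a masked register, so a write only updates the low bits whose companion bits in the top 16 bits are set. A minimal userspace sketch of that semantic (masked_write() and the example values are illustrative, not xe code):

#include <stdint.h>
#include <stdio.h>

/* Masked register semantics: bits [31:16] of the written value act as
 * a per-bit write enable for bits [15:0]; unmasked bits keep their
 * current hardware value instead of being overwritten. */
static uint32_t masked_write(uint32_t old, uint32_t wr)
{
    uint32_t mask = wr >> 16;     /* which low bits may change */
    uint32_t val = wr & 0xffff;   /* new values for those bits */

    return (old & ~mask) | (val & mask);
}

int main(void)
{
    /* mimic "mode |= CCS_MODE_CSLICE_0_3_MASK << 16": enable writes
     * to the 12 cslice bits before writing an example config 0x249 */
    uint32_t wr = (0xfffu << 16) | 0x249;

    printf("0x%08x -> 0x%08x\n", 0xffffffffu,
           masked_write(0xffffffffu, wr));
    return 0;
}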

drivers/gpu/drm/xe/xe_device.c

@@ -87,10 +87,6 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
 	mutex_init(&xef->exec_queue.lock);
 	xa_init_flags(&xef->exec_queue.xa, XA_FLAGS_ALLOC1);
 
-	spin_lock(&xe->clients.lock);
-	xe->clients.count++;
-	spin_unlock(&xe->clients.lock);
-
 	file->driver_priv = xef;
 	kref_init(&xef->refcount);
 
@@ -107,17 +103,12 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
 static void xe_file_destroy(struct kref *ref)
 {
 	struct xe_file *xef = container_of(ref, struct xe_file, refcount);
-	struct xe_device *xe = xef->xe;
 
 	xa_destroy(&xef->exec_queue.xa);
 	mutex_destroy(&xef->exec_queue.lock);
 	xa_destroy(&xef->vm.xa);
 	mutex_destroy(&xef->vm.lock);
 
-	spin_lock(&xe->clients.lock);
-	xe->clients.count--;
-	spin_unlock(&xe->clients.lock);
-
 	xe_drm_client_put(xef->client);
 	kfree(xef->process_name);
 	kfree(xef);
@@ -333,7 +324,6 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
 	xe->info.force_execlist = xe_modparam.force_execlist;
 
 	spin_lock_init(&xe->irq.lock);
-	spin_lock_init(&xe->clients.lock);
 
 	init_waitqueue_head(&xe->ufence_wq);

drivers/gpu/drm/xe/xe_device.h

@@ -178,4 +178,18 @@ void xe_device_declare_wedged(struct xe_device *xe);
 struct xe_file *xe_file_get(struct xe_file *xef);
 void xe_file_put(struct xe_file *xef);
 
+/*
+ * Occasionally it is seen that the G2H worker starts running after a delay of more than
+ * a second even after being queued and activated by the Linux workqueue subsystem. This
+ * leads to G2H timeout error. The root cause of issue lies with scheduling latency of
+ * Lunarlake Hybrid CPU. Issue disappears if we disable Lunarlake atom cores from BIOS
+ * and this is beyond xe kmd.
+ *
+ * TODO: Drop this change once workqueue scheduling delay issue is fixed on LNL Hybrid CPU.
+ */
+#define LNL_FLUSH_WORKQUEUE(wq__)	\
+	flush_workqueue(wq__)
+#define LNL_FLUSH_WORK(wrk__)	\
+	flush_work(wrk__)
+
 #endif
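
These wrappers simply tag the flush_work()/flush_workqueue() callsites that exist only because of the LNL scheduling latency, so they are greppable and easy to drop later. The shape they support is: timed wait expires, force the pending work to run, recheck before declaring an error. A self-contained pthread sketch of that flush-then-recheck pattern (all names are illustrative, not xe code):

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int done;

/* stand-in for the G2H worker that may be scheduled very late */
static void *worker(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    done = 1;
    pthread_cond_signal(&cond);
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t thr;
    struct timespec deadline;

    pthread_create(&thr, NULL, worker, NULL);

    clock_gettime(CLOCK_REALTIME, &deadline);
    deadline.tv_sec += 1;          /* analog of wait_event_timeout(..., HZ) */

    pthread_mutex_lock(&lock);
    while (!done && pthread_cond_timedwait(&cond, &lock, &deadline) == 0)
        ;
    pthread_mutex_unlock(&lock);

    if (!done) {
        /* "LNL_FLUSH_WORK": force the worker to finish, then recheck
         * the condition before reporting a timeout */
        pthread_join(thr, NULL);
        puts(done ? "work completed late, not a real timeout"
                  : "real timeout");
        return done ? 0 : 1;
    }

    pthread_join(thr, NULL);
    puts("completed in time");
    return 0;
}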

drivers/gpu/drm/xe/xe_device_types.h

@@ -353,15 +353,6 @@ struct xe_device {
 		struct workqueue_struct *wq;
 	} sriov;
 
-	/** @clients: drm clients info */
-	struct {
-		/** @clients.lock: Protects drm clients info */
-		spinlock_t lock;
-
-		/** @clients.count: number of drm clients */
-		u64 count;
-	} clients;
-
 	/** @usm: unified memory state */
 	struct {
 		/** @usm.asid: convert a ASID to VM */

drivers/gpu/drm/xe/xe_exec.c

@@ -132,12 +132,16 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	if (XE_IOCTL_DBG(xe, !q))
 		return -ENOENT;
 
-	if (XE_IOCTL_DBG(xe, q->flags & EXEC_QUEUE_FLAG_VM))
-		return -EINVAL;
+	if (XE_IOCTL_DBG(xe, q->flags & EXEC_QUEUE_FLAG_VM)) {
+		err = -EINVAL;
+		goto err_exec_queue;
+	}
 
 	if (XE_IOCTL_DBG(xe, args->num_batch_buffer &&
-		     q->width != args->num_batch_buffer))
-		return -EINVAL;
+		     q->width != args->num_batch_buffer)) {
+		err = -EINVAL;
+		goto err_exec_queue;
+	}
 
 	if (XE_IOCTL_DBG(xe, q->ops->reset_status(q))) {
 		err = -ECANCELED;
@@ -220,6 +224,7 @@ retry:
 	fence = xe_sync_in_fence_get(syncs, num_syncs, q, vm);
 	if (IS_ERR(fence)) {
 		err = PTR_ERR(fence);
+		xe_vm_unlock(vm);
 		goto err_unlock_list;
 	}
 
 	for (i = 0; i < num_syncs; i++)
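
Both hunks apply the usual kernel unwind idiom: once the ioctl has looked up the exec queue (taking a reference) or locked the VM, every failure must leave through a label that releases it, never through a bare return. A self-contained sketch of the shape (names are illustrative):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct queue {
    int refs;
};

static struct queue *queue_lookup(void)
{
    struct queue *q = calloc(1, sizeof(*q));

    if (q)
        q->refs = 1;          /* lookup hands back a reference */
    return q;
}

static void queue_put(struct queue *q)
{
    if (--q->refs == 0)
        free(q);
}

static int submit(int bad_args)
{
    struct queue *q = queue_lookup();
    int err;

    if (!q)
        return -ENOENT;       /* nothing taken yet: bare return is fine */

    if (bad_args) {
        err = -EINVAL;
        goto out_put;         /* NOT "return -EINVAL": that would leak q */
    }

    /* ... submission work ... */
    err = 0;

out_put:
    queue_put(q);
    return err;
}

int main(void)
{
    printf("good: %d, bad: %d\n", submit(0), submit(1));
    return 0;
}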

drivers/gpu/drm/xe/xe_exec_queue.c

@@ -260,8 +260,14 @@ void xe_exec_queue_fini(struct xe_exec_queue *q)
 {
 	int i;
 
+	/*
+	 * Before releasing our ref to lrc and xef, accumulate our run ticks
+	 */
+	xe_exec_queue_update_run_ticks(q);
+
 	for (i = 0; i < q->width; ++i)
 		xe_lrc_put(q->lrc[i]);
 
 	__xe_exec_queue_free(q);
 }
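
Moving the accumulation here means run_ticks is folded in exactly once, while the queue still holds the lrc and xef references it needs, instead of on every job free where it ran concurrently with the fdinfo reader. A toy sketch of that teardown ordering (illustrative types, not xe code):

#include <stdio.h>
#include <stdlib.h>

struct lrc {
    unsigned long timestamp;   /* HW-maintained context runtime */
};

struct client {
    unsigned long run_ticks;   /* what fdinfo would report */
};

struct queue {
    struct lrc *lrc;
    struct client *client;
};

/* fold the context's runtime into the client stats once, while both
 * objects are still guaranteed alive */
static void queue_update_run_ticks(struct queue *q)
{
    q->client->run_ticks += q->lrc->timestamp;
}

static void queue_fini(struct queue *q)
{
    queue_update_run_ticks(q);   /* before dropping the lrc */
    free(q->lrc);
    free(q);
}

int main(void)
{
    struct client c = { 0 };
    struct queue *q = malloc(sizeof(*q));

    q->lrc = malloc(sizeof(*q->lrc));
    q->lrc->timestamp = 1234;
    q->client = &c;

    queue_fini(q);
    printf("run_ticks=%lu\n", c.run_ticks);
    return 0;
}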

drivers/gpu/drm/xe/xe_gt_ccs_mode.c

@@ -68,6 +68,12 @@ static void __xe_gt_apply_ccs_mode(struct xe_gt *gt, u32 num_engines)
 		}
 	}
 
+	/*
+	 * Mask bits need to be set for the register. Though only Xe2+
+	 * platforms require setting of mask bits, it won't harm for older
+	 * platforms as these bits are unused there.
+	 */
+	mode |= CCS_MODE_CSLICE_0_3_MASK << 16;
 	xe_mmio_write32(gt, CCS_MODE, mode);
 
 	xe_gt_dbg(gt, "CCS_MODE=%x config:%08x, num_engines:%d, num_slices:%d\n",
@@ -133,9 +139,10 @@ ccs_mode_store(struct device *kdev, struct device_attribute *attr,
 	}
 
 	/* CCS mode can only be updated when there are no drm clients */
-	spin_lock(&xe->clients.lock);
-	if (xe->clients.count) {
-		spin_unlock(&xe->clients.lock);
+	mutex_lock(&xe->drm.filelist_mutex);
+	if (!list_empty(&xe->drm.filelist)) {
+		mutex_unlock(&xe->drm.filelist_mutex);
+		xe_gt_dbg(gt, "Rejecting compute mode change as there are active drm clients\n");
 		return -EBUSY;
 	}
 
@@ -146,7 +153,7 @@ ccs_mode_store(struct device *kdev, struct device_attribute *attr,
 		xe_gt_reset_async(gt);
 	}
 
-	spin_unlock(&xe->clients.lock);
+	mutex_unlock(&xe->drm.filelist_mutex);
 
 	return count;
 }
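
Design note: rather than maintaining a private client counter under its own spinlock (racy against file open, hence its removal above), the check now walks the client list the DRM core already keeps, under the same filelist_mutex the open/release paths take, so a client cannot appear between the check and the mode change. A self-contained sketch of deciding on list emptiness under the producer's lock (illustrative names):

#include <pthread.h>
#include <stdio.h>

struct client {
    struct client *next;
};

static pthread_mutex_t filelist_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct client *filelist;        /* head of the open-clients list */

/* the open path mutates the list under filelist_mutex */
static void client_open(struct client *c)
{
    pthread_mutex_lock(&filelist_mutex);
    c->next = filelist;
    filelist = c;
    pthread_mutex_unlock(&filelist_mutex);
}

/* the reconfiguration path decides under the same lock, so no client
 * can race in between the emptiness check and the change */
static int reconfigure(void)
{
    pthread_mutex_lock(&filelist_mutex);
    if (filelist) {
        pthread_mutex_unlock(&filelist_mutex);
        return -1;                     /* -EBUSY: active clients */
    }
    /* ... apply the new mode while the list is provably empty ... */
    pthread_mutex_unlock(&filelist_mutex);
    return 0;
}

int main(void)
{
    struct client c;

    printf("no clients: %d\n", reconfigure());
    client_open(&c);
    printf("one client: %d\n", reconfigure());
    return 0;
}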

drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c

@@ -387,6 +387,8 @@ static void pf_release_ggtt(struct xe_tile *tile, struct xe_ggtt_node *node)
 		 * the xe_ggtt_clear() called by below xe_ggtt_remove_node().
 		 */
 		xe_ggtt_node_remove(node, false);
+	} else {
+		xe_ggtt_node_fini(node);
 	}
 }
 
@@ -442,7 +444,7 @@ static int pf_provision_vf_ggtt(struct xe_gt *gt, unsigned int vfid, u64 size)
 	config->ggtt_region = node;
 	return 0;
 err:
-	xe_ggtt_node_fini(node);
+	pf_release_ggtt(tile, node);
 	return err;
 }
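
The leak fix funnels the error path through pf_release_ggtt(), which alone decides whether the node was already inserted into the GGTT (needs removal) or merely allocated (needs only fini). A self-contained sketch of that single-release-path pattern (illustrative, not xe code):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
    bool inserted;   /* has the node been added to the address space? */
};

static void node_fini(struct node *n)
{
    free(n);
}

static void node_remove(struct node *n)
{
    n->inserted = false;   /* undo the insertion, then free */
    node_fini(n);
}

/* one release helper for every path: it alone decides whether the
 * node needs removal first or just freeing */
static void release(struct node *n)
{
    if (!n)
        return;
    if (n->inserted)
        node_remove(n);
    else
        node_fini(n);
}

static struct node *provision(bool fail_after_insert)
{
    struct node *n = calloc(1, sizeof(*n));

    if (!n)
        return NULL;

    n->inserted = true;        /* insertion into the address space */
    if (fail_after_insert) {
        /* the unwind goes through release(), not node_fini(), so an
         * inserted range is actually removed instead of leaked */
        release(n);
        return NULL;
    }
    return n;                  /* success: caller owns the node */
}

int main(void)
{
    struct node *n = provision(false);

    release(n);                /* normal teardown */
    if (!provision(true))
        puts("error path unwound cleanly");
    return 0;
}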

drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c

@@ -72,6 +72,8 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work)
 	struct xe_device *xe = gt_to_xe(gt);
 	struct xe_gt_tlb_invalidation_fence *fence, *next;
 
+	LNL_FLUSH_WORK(&gt->uc.guc.ct.g2h_worker);
+
 	spin_lock_irq(&gt->tlb_invalidation.pending_lock);
 	list_for_each_entry_safe(fence, next,
 				 &gt->tlb_invalidation.pending_fences, link) {

drivers/gpu/drm/xe/xe_guc_ct.c

@@ -897,17 +897,8 @@ retry_same_fence:
 
 	ret = wait_event_timeout(ct->g2h_fence_wq, g2h_fence.done, HZ);
 
-	/*
-	 * Occasionally it is seen that the G2H worker starts running after a delay of more than
-	 * a second even after being queued and activated by the Linux workqueue subsystem. This
-	 * leads to G2H timeout error. The root cause of issue lies with scheduling latency of
-	 * Lunarlake Hybrid CPU. Issue dissappears if we disable Lunarlake atom cores from BIOS
-	 * and this is beyond xe kmd.
-	 *
-	 * TODO: Drop this change once workqueue scheduling delay issue is fixed on LNL Hybrid CPU.
-	 */
 	if (!ret) {
-		flush_work(&ct->g2h_worker);
+		LNL_FLUSH_WORK(&ct->g2h_worker);
 		if (g2h_fence.done) {
 			xe_gt_warn(gt, "G2H fence %u, action %04x, done\n",
 				   g2h_fence.seqno, action[0]);

drivers/gpu/drm/xe/xe_guc_submit.c

@@ -745,8 +745,6 @@ static void guc_exec_queue_free_job(struct drm_sched_job *drm_job)
 {
 	struct xe_sched_job *job = to_xe_sched_job(drm_job);
 
-	xe_exec_queue_update_run_ticks(job->q);
-
 	trace_xe_sched_job_free(job);
 	xe_sched_job_put(job);
 }

drivers/gpu/drm/xe/xe_wait_user_fence.c

@@ -155,6 +155,13 @@ int xe_wait_user_fence_ioctl(struct drm_device *dev, void *data,
 		}
 
 		if (!timeout) {
+			LNL_FLUSH_WORKQUEUE(xe->ordered_wq);
+			err = do_compare(addr, args->value, args->mask,
+					 args->op);
+			if (err <= 0) {
+				drm_dbg(&xe->drm, "LNL_FLUSH_WORKQUEUE resolved ufence timeout\n");
+				break;
+			}
 			err = -ETIME;
 			break;
 		}
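
After the flush, the ioctl rechecks the fence through the same masked compare used on normal wakeups; 0 or negative means "resolved or faulted, stop waiting", positive means keep going (here, fall through to -ETIME). A self-contained sketch of the compare semantic (this do_compare() is a hedged re-implementation with illustrative enum names, not the xe source):

#include <stdint.h>
#include <stdio.h>

enum wait_op { OP_EQ, OP_NEQ, OP_GT, OP_GTE, OP_LT, OP_LTE };

/* mask the current fence value, compare per op; 0 means the wait
 * condition is met, positive means keep waiting (the kernel helper
 * can also return -EFAULT on a bad user pointer) */
static int do_compare(const uint64_t *addr, uint64_t value,
                      uint64_t mask, enum wait_op op)
{
    uint64_t rvalue = *addr & mask;
    int passed;

    switch (op) {
    case OP_EQ:  passed = rvalue == value; break;
    case OP_NEQ: passed = rvalue != value; break;
    case OP_GT:  passed = rvalue > value;  break;
    case OP_GTE: passed = rvalue >= value; break;
    case OP_LT:  passed = rvalue < value;  break;
    case OP_LTE: passed = rvalue <= value; break;
    default:     return -1;
    }
    return passed ? 0 : 1;
}

int main(void)
{
    uint64_t fence = 0xff03;

    /* wait for (fence & 0xff) >= 4: not met yet, returns 1 */
    printf("%d\n", do_compare(&fence, 4, 0xff, OP_GTE));
    fence = 0xff05;   /* as if the flushed worker just signalled it */
    printf("%d\n", do_compare(&fence, 4, 0xff, OP_GTE));
    return 0;
}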