accel/ivpu: Improve stability of ivpu_submit_ioctl()

- Wake up the device as late as possible
- Remove job reference counting in order to simplify the code
- Don't put jobs that are not fully submitted on submitted_jobs_xa in
  order to avoid potential races with reset/recovery

Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Reviewed-by: Wachowski, Karol <karol.wachowski@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240122120945.1150728-3-jacek.lawrynowicz@linux.intel.com
This commit is contained in:
Jacek Lawrynowicz 2024-01-22 13:09:44 +01:00
parent f1cc6aceec
commit 264b271d12
2 changed files with 62 additions and 78 deletions

View File

@ -125,7 +125,7 @@ void ivpu_cmdq_release_all_locked(struct ivpu_file_priv *file_priv)
/*
* Mark the doorbell as unregistered and reset job queue pointers.
* This function needs to be called when the VPU hardware is restarted
* and FW looses job queue state. The next time job queue is used it
* and FW loses job queue state. The next time job queue is used it
* will be registered again.
*/
static void ivpu_cmdq_reset_locked(struct ivpu_file_priv *file_priv, u16 engine)
@ -239,60 +239,32 @@ static struct dma_fence *ivpu_fence_create(struct ivpu_device *vdev)
return &fence->base;
}
static void job_get(struct ivpu_job *job, struct ivpu_job **link)
static void ivpu_job_destroy(struct ivpu_job *job)
{
struct ivpu_device *vdev = job->vdev;
kref_get(&job->ref);
*link = job;
ivpu_dbg(vdev, KREF, "Job get: id %u refcount %u\n", job->job_id, kref_read(&job->ref));
}
static void job_release(struct kref *ref)
{
struct ivpu_job *job = container_of(ref, struct ivpu_job, ref);
struct ivpu_device *vdev = job->vdev;
u32 i;
ivpu_dbg(vdev, JOB, "Job destroyed: id %3u ctx %2d engine %d",
job->job_id, job->file_priv->ctx.id, job->engine_idx);
for (i = 0; i < job->bo_count; i++)
if (job->bos[i])
drm_gem_object_put(&job->bos[i]->base.base);
dma_fence_put(job->done_fence);
ivpu_file_priv_put(&job->file_priv);
ivpu_dbg(vdev, KREF, "Job released: id %u\n", job->job_id);
kfree(job);
/* Allow the VPU to get suspended, must be called after ivpu_file_priv_put() */
ivpu_rpm_put(vdev);
}
static void job_put(struct ivpu_job *job)
{
struct ivpu_device *vdev = job->vdev;
ivpu_dbg(vdev, KREF, "Job put: id %u refcount %u\n", job->job_id, kref_read(&job->ref));
kref_put(&job->ref, job_release);
}
static struct ivpu_job *
ivpu_create_job(struct ivpu_file_priv *file_priv, u32 engine_idx, u32 bo_count)
ivpu_job_create(struct ivpu_file_priv *file_priv, u32 engine_idx, u32 bo_count)
{
struct ivpu_device *vdev = file_priv->vdev;
struct ivpu_job *job;
int ret;
ret = ivpu_rpm_get(vdev);
if (ret < 0)
return NULL;
job = kzalloc(struct_size(job, bos, bo_count), GFP_KERNEL);
if (!job)
goto err_rpm_put;
kref_init(&job->ref);
return NULL;
job->vdev = vdev;
job->engine_idx = engine_idx;
@ -306,17 +278,14 @@ ivpu_create_job(struct ivpu_file_priv *file_priv, u32 engine_idx, u32 bo_count)
job->file_priv = ivpu_file_priv_get(file_priv);
ivpu_dbg(vdev, JOB, "Job created: ctx %2d engine %d", file_priv->ctx.id, job->engine_idx);
return job;
err_free_job:
kfree(job);
err_rpm_put:
ivpu_rpm_put(vdev);
return NULL;
}
static int ivpu_job_done(struct ivpu_device *vdev, u32 job_id, u32 job_status)
static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32 job_status)
{
struct ivpu_job *job;
@ -333,9 +302,10 @@ static int ivpu_job_done(struct ivpu_device *vdev, u32 job_id, u32 job_status)
ivpu_dbg(vdev, JOB, "Job complete: id %3u ctx %2d engine %d status 0x%x\n",
job->job_id, job->file_priv->ctx.id, job->engine_idx, job_status);
ivpu_job_destroy(job);
ivpu_stop_job_timeout_detection(vdev);
job_put(job);
ivpu_rpm_put(vdev);
return 0;
}
@ -345,10 +315,10 @@ void ivpu_jobs_abort_all(struct ivpu_device *vdev)
unsigned long id;
xa_for_each(&vdev->submitted_jobs_xa, id, job)
ivpu_job_done(vdev, id, VPU_JSM_STATUS_ABORTED);
ivpu_job_signal_and_destroy(vdev, id, VPU_JSM_STATUS_ABORTED);
}
static int ivpu_direct_job_submission(struct ivpu_job *job)
static int ivpu_job_submit(struct ivpu_job *job)
{
struct ivpu_file_priv *file_priv = job->file_priv;
struct ivpu_device *vdev = job->vdev;
@ -356,53 +326,65 @@ static int ivpu_direct_job_submission(struct ivpu_job *job)
struct ivpu_cmdq *cmdq;
int ret;
ret = ivpu_rpm_get(vdev);
if (ret < 0)
return ret;
mutex_lock(&file_priv->lock);
cmdq = ivpu_cmdq_acquire(job->file_priv, job->engine_idx);
if (!cmdq) {
ivpu_warn(vdev, "Failed get job queue, ctx %d engine %d\n",
file_priv->ctx.id, job->engine_idx);
ivpu_warn_ratelimited(vdev, "Failed get job queue, ctx %d engine %d\n",
file_priv->ctx.id, job->engine_idx);
ret = -EINVAL;
goto err_unlock;
goto err_unlock_file_priv;
}
job_id_range.min = FIELD_PREP(JOB_ID_CONTEXT_MASK, (file_priv->ctx.id - 1));
job_id_range.max = job_id_range.min | JOB_ID_JOB_MASK;
job_get(job, &job);
ret = xa_alloc(&vdev->submitted_jobs_xa, &job->job_id, job, job_id_range, GFP_KERNEL);
xa_lock(&vdev->submitted_jobs_xa);
ret = __xa_alloc(&vdev->submitted_jobs_xa, &job->job_id, job, job_id_range, GFP_KERNEL);
if (ret) {
ivpu_warn_ratelimited(vdev, "Failed to allocate job id: %d\n", ret);
goto err_job_put;
ivpu_dbg(vdev, JOB, "Too many active jobs in ctx %d\n",
file_priv->ctx.id);
ret = -EBUSY;
goto err_unlock_submitted_jobs_xa;
}
ret = ivpu_cmdq_push_job(cmdq, job);
if (ret)
goto err_xa_erase;
goto err_erase_xa;
ivpu_start_job_timeout_detection(vdev);
ivpu_dbg(vdev, JOB, "Job submitted: id %3u addr 0x%llx ctx %2d engine %d next %d\n",
job->job_id, job->cmd_buf_vpu_addr, file_priv->ctx.id,
job->engine_idx, cmdq->jobq->header.tail);
if (ivpu_test_mode & IVPU_TEST_MODE_NULL_HW) {
ivpu_job_done(vdev, job->job_id, VPU_JSM_STATUS_SUCCESS);
if (unlikely(ivpu_test_mode & IVPU_TEST_MODE_NULL_HW)) {
cmdq->jobq->header.head = cmdq->jobq->header.tail;
wmb(); /* Flush WC buffer for jobq header */
} else {
ivpu_cmdq_ring_db(vdev, cmdq);
}
ivpu_dbg(vdev, JOB, "Job submitted: id %3u ctx %2d engine %d addr 0x%llx next %d\n",
job->job_id, file_priv->ctx.id, job->engine_idx,
job->cmd_buf_vpu_addr, cmdq->jobq->header.tail);
xa_unlock(&vdev->submitted_jobs_xa);
mutex_unlock(&file_priv->lock);
if (unlikely(ivpu_test_mode & IVPU_TEST_MODE_NULL_HW))
ivpu_job_signal_and_destroy(vdev, job->job_id, VPU_JSM_STATUS_SUCCESS);
return 0;
err_xa_erase:
xa_erase(&vdev->submitted_jobs_xa, job->job_id);
err_job_put:
job_put(job);
err_unlock:
err_erase_xa:
__xa_erase(&vdev->submitted_jobs_xa, job->job_id);
err_unlock_submitted_jobs_xa:
xa_unlock(&vdev->submitted_jobs_xa);
err_unlock_file_priv:
mutex_unlock(&file_priv->lock);
ivpu_rpm_put(vdev);
return ret;
}
@ -508,44 +490,47 @@ int ivpu_submit_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
params->buffer_count * sizeof(u32));
if (ret) {
ret = -EFAULT;
goto free_handles;
goto err_free_handles;
}
if (!drm_dev_enter(&vdev->drm, &idx)) {
ret = -ENODEV;
goto free_handles;
goto err_free_handles;
}
ivpu_dbg(vdev, JOB, "Submit ioctl: ctx %u buf_count %u\n",
file_priv->ctx.id, params->buffer_count);
job = ivpu_create_job(file_priv, params->engine, params->buffer_count);
job = ivpu_job_create(file_priv, params->engine, params->buffer_count);
if (!job) {
ivpu_err(vdev, "Failed to create job\n");
ret = -ENOMEM;
goto dev_exit;
goto err_exit_dev;
}
ret = ivpu_job_prepare_bos_for_submit(file, job, buf_handles, params->buffer_count,
params->commands_offset);
if (ret) {
ivpu_err(vdev, "Failed to prepare job, ret %d\n", ret);
goto job_put;
ivpu_err(vdev, "Failed to prepare job: %d\n", ret);
goto err_destroy_job;
}
ret = ivpu_direct_job_submission(job);
if (ret) {
dma_fence_signal(job->done_fence);
ivpu_err(vdev, "Failed to submit job to the HW, ret %d\n", ret);
}
ret = ivpu_job_submit(job);
if (ret)
goto err_signal_fence;
job_put:
job_put(job);
dev_exit:
drm_dev_exit(idx);
free_handles:
kfree(buf_handles);
return ret;
err_signal_fence:
dma_fence_signal(job->done_fence);
err_destroy_job:
ivpu_job_destroy(job);
err_exit_dev:
drm_dev_exit(idx);
err_free_handles:
kfree(buf_handles);
return ret;
}
@ -567,7 +552,7 @@ ivpu_job_done_callback(struct ivpu_device *vdev, struct ivpu_ipc_hdr *ipc_hdr,
}
payload = (struct vpu_ipc_msg_payload_job_done *)&jsm_msg->payload;
ret = ivpu_job_done(vdev, payload->job_id, payload->job_status);
ret = ivpu_job_signal_and_destroy(vdev, payload->job_id, payload->job_status);
if (!ret && !xa_empty(&vdev->submitted_jobs_xa))
ivpu_start_job_timeout_detection(vdev);
}

View File

@ -43,7 +43,6 @@ struct ivpu_cmdq {
will update the job status
*/
struct ivpu_job {
struct kref ref;
struct ivpu_device *vdev;
struct ivpu_file_priv *file_priv;
struct dma_fence *done_fence;