drm/amdgpu: cleanup job reset routine (v2)
Merge setting the guilty flag on the context into this function to avoid implementing an extra routine.

v2: go through the entity list and compare the fence_ctx before operating on the entity; otherwise the entity may be just a wild pointer.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Chunming Zhou <David1.Zhou@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
commit a8a51a7041 (parent 7716ea564f)
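To make the v2 note concrete, the sketch below is a minimal, stand-alone model of the same pattern: the bad job's recorded fence context is searched for in the scheduler's run queues, and the entity is only dereferenced (karma bumped, context flagged guilty once the hang limit is exceeded) after a match is found. The type and function names here (struct entity, struct job, set_guilty, mark_guilty_if_entity_alive) are simplified stand-ins for the kernel structures touched by this patch; atomics, locking and the kernel list helpers are deliberately left out, so this is an illustration of the idea rather than the driver code itself.

/*
 * Simplified user-space model of the v2 logic in this patch. Field names
 * (fence_context, guilty, karma, hang_limit, sched_rq) mirror the patch,
 * but everything here is a plain-C stand-in for the kernel structures.
 */
#include <stdio.h>

#define NUM_PRIORITIES 4
#define MAX_ENTITIES   8

struct entity {
        unsigned long fence_context;    /* identifies the context that owns the job */
        int *guilty;                    /* optional flag shared with the context */
};

struct rq {
        struct entity *entities[MAX_ENTITIES];
        int count;
};

struct scheduler {
        struct rq sched_rq[NUM_PRIORITIES];
        int hang_limit;
};

struct job {
        struct scheduler *sched;
        struct entity *entity;          /* may already be stale: treat as untrusted */
        unsigned long fence_context;    /* captured when the job was scheduled */
        int karma;
};

/* Models amd_sched_set_guilty(): bump karma, flag the context once over the limit. */
static void set_guilty(struct job *bad)
{
        if (++bad->karma > bad->sched->hang_limit)
                if (bad->entity->guilty)
                        *bad->entity->guilty = 1;
}

/*
 * Models the new tail of amd_sched_hw_job_reset(): dereference the entity
 * only after finding a run-queue entity with a matching fence context, so
 * a stale (wild) entity pointer is never touched.
 */
static void mark_guilty_if_entity_alive(struct scheduler *sched, struct job *bad)
{
        for (int prio = 0; prio < NUM_PRIORITIES; prio++) {
                struct rq *rq = &sched->sched_rq[prio];

                for (int i = 0; i < rq->count; i++) {
                        if (bad->fence_context == rq->entities[i]->fence_context) {
                                set_guilty(bad);
                                return;
                        }
                }
        }
}

int main(void)
{
        int guilty_flag = 0;
        struct entity e = { .fence_context = 42, .guilty = &guilty_flag };
        struct scheduler sched = { .hang_limit = 0 };
        struct job bad = { .sched = &sched, .entity = &e, .fence_context = 42 };

        sched.sched_rq[0].entities[0] = &e;
        sched.sched_rq[0].count = 1;

        mark_guilty_if_entity_alive(&sched, &bad);
        printf("guilty=%d karma=%d\n", guilty_flag, bad.karma);
        return 0;
}

The run-queue search is exactly the v2 fix: a job can outlive its entity, so the entity pointer stored in the job is treated as untrusted until the matching entity is found again in a run queue.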
@@ -2869,7 +2869,7 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
                 amd_sched_job_kickout(&job->base);
 
         /* only do job_reset on the hang ring if @job not NULL */
-        amd_sched_hw_job_reset(&ring->sched);
+        amd_sched_hw_job_reset(&ring->sched, NULL);
 
         /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
         amdgpu_fence_driver_force_completion(ring);
@@ -2990,7 +2990,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
                 if (!ring || !ring->sched.thread)
                         continue;
                 kthread_park(ring->sched.thread);
-                amd_sched_hw_job_reset(&ring->sched);
+                amd_sched_hw_job_reset(&ring->sched, NULL);
                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
                 amdgpu_fence_driver_force_completion(ring);
         }
@@ -443,9 +443,18 @@ static void amd_sched_job_timedout(struct work_struct *work)
         job->sched->ops->timedout_job(job);
 }
 
-void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
+static void amd_sched_set_guilty(struct amd_sched_job *s_job)
+{
+        if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
+                if (s_job->s_entity->guilty)
+                        atomic_set(s_job->s_entity->guilty, 1);
+}
+
+void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
 {
         struct amd_sched_job *s_job;
+        struct amd_sched_entity *entity, *tmp;
+        int i;;
 
         spin_lock(&sched->job_list_lock);
         list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
@@ -458,6 +467,26 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
                 }
         }
         spin_unlock(&sched->job_list_lock);
+
+        if (bad) {
+                bool found = false;
+
+                for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) {
+                        struct amd_sched_rq *rq = &sched->sched_rq[i];
+
+                        spin_lock(&rq->lock);
+                        list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
+                                if (bad->s_fence->scheduled.context == entity->fence_context) {
+                                        found = true;
+                                        amd_sched_set_guilty(bad);
+                                        break;
+                                }
+                        }
+                        spin_unlock(&rq->lock);
+                        if (found)
+                                break;
+                }
+        }
 }
 
 void amd_sched_job_kickout(struct amd_sched_job *s_job)
@@ -174,7 +174,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
                        struct amd_gpu_scheduler *sched,
                        struct amd_sched_entity *entity,
                        void *owner);
-void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched);
+void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *job);
 void amd_sched_job_recovery(struct amd_gpu_scheduler *sched);
 bool amd_sched_dependency_optimized(struct dma_fence* fence,
                                     struct amd_sched_entity *entity);
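Note on the call sites: both callers converted in this patch pass NULL as the new second argument, so the added guilty pass, which is guarded by the if (bad) check, does not run there and their reset behaviour is unchanged. Only a caller that can identify the hung job, presumably a timeout/TDR path, would pass it (for example &job->base, as already done for amd_sched_job_kickout() above) so that the job's karma is incremented and, once it exceeds hang_limit, its context is marked guilty.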