forked from Minki/linux
drm/amdgpu: improve HMM error -ENOMEM and -EBUSY handling
Under memory pressure, hmm_range_fault may return error code -ENOMEM or -EBUSY, change pr_info to pr_debug to remove unnecessary kernel log message because we will retry restore again. Call get_user_pages_done if TTM get user pages failed will have WARN_ONCE kernel calling stack dump log. Signed-off-by: Philip Yang <Philip.Yang@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
c1d827d62f
commit
e82fdb16a0
@ -1731,37 +1731,19 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
|
||||
ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm,
|
||||
bo->tbo.ttm->pages);
|
||||
if (ret) {
|
||||
bo->tbo.ttm->pages[0] = NULL;
|
||||
pr_info("%s: Failed to get user pages: %d\n",
|
||||
pr_debug("%s: Failed to get user pages: %d\n",
|
||||
__func__, ret);
|
||||
/* Pretend it succeeded. It will fail later
|
||||
* with a VM fault if the GPU tries to access
|
||||
* it. Better than hanging indefinitely with
|
||||
* stalled user mode queues.
|
||||
*/
|
||||
|
||||
/* Return error -EBUSY or -ENOMEM, retry restore */
|
||||
return ret;
|
||||
}
|
||||
|
||||
amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Remove invalid userptr BOs from hmm track list
|
||||
*
|
||||
* Stop HMM track the userptr update
|
||||
*/
|
||||
static void untrack_invalid_user_pages(struct amdkfd_process_info *process_info)
|
||||
{
|
||||
struct kgd_mem *mem, *tmp_mem;
|
||||
struct amdgpu_bo *bo;
|
||||
|
||||
list_for_each_entry_safe(mem, tmp_mem,
|
||||
&process_info->userptr_inval_list,
|
||||
validate_list.head) {
|
||||
bo = mem->bo;
|
||||
amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
|
||||
}
|
||||
}
|
||||
|
||||
/* Validate invalid userptr BOs
|
||||
*
|
||||
* Validates BOs on the userptr_inval_list, and moves them back to the
|
||||
@ -1841,13 +1823,6 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
|
||||
list_move_tail(&mem->validate_list.head,
|
||||
&process_info->userptr_valid_list);
|
||||
|
||||
/* Stop HMM track the userptr update. We dont check the return
|
||||
* value for concurrent CPU page table update because we will
|
||||
* reschedule the restore worker if process_info->evicted_bos
|
||||
* is updated.
|
||||
*/
|
||||
amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
|
||||
|
||||
/* Update mapping. If the BO was not validated
|
||||
* (because we couldn't get user pages), this will
|
||||
* clear the page table entries, which will result in
|
||||
@ -1946,7 +1921,6 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
|
||||
}
|
||||
|
||||
unlock_out:
|
||||
untrack_invalid_user_pages(process_info);
|
||||
mutex_unlock(&process_info->lock);
|
||||
mmput(mm);
|
||||
put_task_struct(usertask);
|
||||
|
Loading…
Reference in New Issue
Block a user