drm/amdgpu: fix system hang issue during GPU reset

When the GPU hangs, the driver has multiple paths into
amdgpu_device_gpu_recover; the atomic adev->in_gpu_reset and
hive->in_reset are used to avoid re-entering GPU recovery.

During GPU reset and resume, it is unsafe for other threads to access
the GPU, as this may cause the GPU reset to fail. Therefore a new
rw_semaphore, adev->reset_sem, is introduced to protect the GPU from
being accessed by external threads during recovery.

v2:
1. add rwlock for some ioctls, debugfs and the file-close function.
2. change to use dqm->is_resetting and dqm_lock for protection in the
kfd driver.
3. remove try_lock and change adev->in_gpu_reset to an atomic, to avoid
re-entering GPU recovery for the same GPU hang.

v3:
1. change back to using adev->reset_sem to protect the kfd callback
functions, because dqm_lock cannot protect all of the code; for example,
free_mqd must be called outside of dqm_lock:

[ 1230.176199] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 05/23/2019
[ 1230.177221] Call Trace:
[ 1230.178249]  dump_stack+0x98/0xd5
[ 1230.179443]  amdgpu_virt_kiq_reg_write_reg_wait+0x181/0x190 [amdgpu]
[ 1230.180673]  gmc_v9_0_flush_gpu_tlb+0xcc/0x310 [amdgpu]
[ 1230.181882]  amdgpu_gart_unbind+0xa9/0xe0 [amdgpu]
[ 1230.183098]  amdgpu_ttm_backend_unbind+0x46/0x180 [amdgpu]
[ 1230.184239]  ? ttm_bo_put+0x171/0x5f0 [ttm]
[ 1230.185394]  ttm_tt_unbind+0x21/0x40 [ttm]
[ 1230.186558]  ttm_tt_destroy.part.12+0x12/0x60 [ttm]
[ 1230.187707]  ttm_tt_destroy+0x13/0x20 [ttm]
[ 1230.188832]  ttm_bo_cleanup_memtype_use+0x36/0x80 [ttm]
[ 1230.189979]  ttm_bo_put+0x1be/0x5f0 [ttm]
[ 1230.191230]  amdgpu_bo_unref+0x1e/0x30 [amdgpu]
[ 1230.192522]  amdgpu_amdkfd_free_gtt_mem+0xaf/0x140 [amdgpu]
[ 1230.193833]  free_mqd+0x25/0x40 [amdgpu]
[ 1230.195143]  destroy_queue_cpsch+0x1a7/0x270 [amdgpu]
[ 1230.196475]  pqm_destroy_queue+0x105/0x260 [amdgpu]
[ 1230.197819]  kfd_ioctl_destroy_queue+0x37/0x70 [amdgpu]
[ 1230.199154]  kfd_ioctl+0x277/0x500 [amdgpu]
[ 1230.200458]  ? kfd_ioctl_get_clock_counters+0x60/0x60 [amdgpu]
[ 1230.201656]  ? tomoyo_file_ioctl+0x19/0x20
[ 1230.202831]  ksys_ioctl+0x98/0xb0
[ 1230.204004]  __x64_sys_ioctl+0x1a/0x20
[ 1230.205174]  do_syscall_64+0x5f/0x250
[ 1230.206339]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

2. remove try_lock and introduce the atomic hive->in_reset, to avoid
re-entering GPU recovery.

v4:
1. remove an unnecessary whitespace change in kfd_chardev.c
2. remove commented-out code in amdgpu_device.c
3. add more detailed comments to the commit message
4. define a wrapper function, amdgpu_in_reset

v5:
1. Fix some style issues.

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Suggested-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Suggested-by: Christian König <christian.koenig@amd.com>
Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
Suggested-by: Lijo Lazar <Lijo.Lazar@amd.com>
Suggested-by: Luben Tuikov <luben.tuikov@amd.com>
Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Dennis Li
2020-07-08 15:07:13 +08:00
committed by Alex Deucher
parent c5079f35c0
commit df9c8d1aa2
39 changed files with 463 additions and 183 deletions

View File

@@ -961,9 +961,9 @@ struct amdgpu_device {
bool in_suspend; bool in_suspend;
bool in_hibernate; bool in_hibernate;
bool in_gpu_reset; atomic_t in_gpu_reset;
enum pp_mp1_state mp1_state; enum pp_mp1_state mp1_state;
struct mutex lock_reset; struct rw_semaphore reset_sem;
struct amdgpu_doorbell_index doorbell_index; struct amdgpu_doorbell_index doorbell_index;
struct mutex notifier_lock; struct mutex notifier_lock;
@@ -1278,4 +1278,9 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
return adev->gmc.tmz_enabled; return adev->gmc.tmz_enabled;
} }
static inline bool amdgpu_in_reset(struct amdgpu_device *adev)
{
return atomic_read(&adev->in_gpu_reset) ? true : false;
}
#endif #endif

View File

@@ -244,11 +244,14 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
if (cp_mqd_gfx9) if (cp_mqd_gfx9)
bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9; bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9;
if (!down_read_trylock(&adev->reset_sem))
return -EIO;
r = amdgpu_bo_create(adev, &bp, &bo); r = amdgpu_bo_create(adev, &bp, &bo);
if (r) { if (r) {
dev_err(adev->dev, dev_err(adev->dev,
"failed to allocate BO for amdkfd (%d)\n", r); "failed to allocate BO for amdkfd (%d)\n", r);
return r; goto err;
} }
/* map the buffer */ /* map the buffer */
@@ -283,6 +286,7 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
amdgpu_bo_unreserve(bo); amdgpu_bo_unreserve(bo);
up_read(&adev->reset_sem);
return 0; return 0;
allocate_mem_kmap_bo_failed: allocate_mem_kmap_bo_failed:
@@ -291,19 +295,25 @@ allocate_mem_pin_bo_failed:
amdgpu_bo_unreserve(bo); amdgpu_bo_unreserve(bo);
allocate_mem_reserve_bo_failed: allocate_mem_reserve_bo_failed:
amdgpu_bo_unref(&bo); amdgpu_bo_unref(&bo);
err:
up_read(&adev->reset_sem);
return r; return r;
} }
void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, void *mem_obj) void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
{ {
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj; struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;
down_read(&adev->reset_sem);
amdgpu_bo_reserve(bo, true); amdgpu_bo_reserve(bo, true);
amdgpu_bo_kunmap(bo); amdgpu_bo_kunmap(bo);
amdgpu_bo_unpin(bo); amdgpu_bo_unpin(bo);
amdgpu_bo_unreserve(bo); amdgpu_bo_unreserve(bo);
amdgpu_bo_unref(&(bo)); amdgpu_bo_unref(&(bo));
up_read(&adev->reset_sem);
} }
int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size, int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size,
@@ -335,9 +345,14 @@ int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size,
void amdgpu_amdkfd_free_gws(struct kgd_dev *kgd, void *mem_obj) void amdgpu_amdkfd_free_gws(struct kgd_dev *kgd, void *mem_obj)
{ {
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
struct amdgpu_bo *bo = (struct amdgpu_bo *)mem_obj; struct amdgpu_bo *bo = (struct amdgpu_bo *)mem_obj;
down_read(&adev->reset_sem);
amdgpu_bo_unref(&bo); amdgpu_bo_unref(&bo);
up_read(&adev->reset_sem);
} }
uint32_t amdgpu_amdkfd_get_fw_version(struct kgd_dev *kgd, uint32_t amdgpu_amdkfd_get_fw_version(struct kgd_dev *kgd,
@@ -611,12 +626,19 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
/* This works for NO_HWS. TODO: need to handle without knowing VMID */ /* This works for NO_HWS. TODO: need to handle without knowing VMID */
job->vmid = vmid; job->vmid = vmid;
if (!down_read_trylock(&adev->reset_sem)) {
ret = -EIO;
goto err_ib_sched;
}
ret = amdgpu_ib_schedule(ring, 1, ib, job, &f); ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
if (ret) { if (ret) {
DRM_ERROR("amdgpu: failed to schedule IB.\n"); DRM_ERROR("amdgpu: failed to schedule IB.\n");
goto err_ib_sched; goto err_ib_sched;
} }
up_read(&adev->reset_sem);
ret = dma_fence_wait(f, false); ret = dma_fence_wait(f, false);
err_ib_sched: err_ib_sched:
@@ -647,6 +669,9 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
{ {
struct amdgpu_device *adev = (struct amdgpu_device *)kgd; struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
if (!down_read_trylock(&adev->reset_sem))
return -EIO;
if (adev->family == AMDGPU_FAMILY_AI) { if (adev->family == AMDGPU_FAMILY_AI) {
int i; int i;
@@ -656,6 +681,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0); amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
} }
up_read(&adev->reset_sem);
return 0; return 0;
} }
@@ -664,11 +691,18 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, uint16_t pasid)
struct amdgpu_device *adev = (struct amdgpu_device *)kgd; struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
const uint32_t flush_type = 0; const uint32_t flush_type = 0;
bool all_hub = false; bool all_hub = false;
int ret = -EIO;
if (adev->family == AMDGPU_FAMILY_AI) if (adev->family == AMDGPU_FAMILY_AI)
all_hub = true; all_hub = true;
return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub); if (down_read_trylock(&adev->reset_sem)) {
ret = amdgpu_gmc_flush_gpu_tlb_pasid(adev,
pasid, flush_type, all_hub);
up_read(&adev->reset_sem);
}
return ret;
} }
bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd) bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)

View File

@@ -542,7 +542,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp; uint32_t temp;
struct v10_compute_mqd *m = get_mqd(mqd); struct v10_compute_mqd *m = get_mqd(mqd);
if (adev->in_gpu_reset) if (amdgpu_in_reset(adev))
return -EIO; return -EIO;
#if 0 #if 0

View File

@@ -423,7 +423,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
unsigned long flags, end_jiffies; unsigned long flags, end_jiffies;
int retry; int retry;
if (adev->in_gpu_reset) if (amdgpu_in_reset(adev))
return -EIO; return -EIO;
acquire_queue(kgd, pipe_id, queue_id); acquire_queue(kgd, pipe_id, queue_id);

View File

@@ -419,7 +419,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
int retry; int retry;
struct vi_mqd *m = get_mqd(mqd); struct vi_mqd *m = get_mqd(mqd);
if (adev->in_gpu_reset) if (amdgpu_in_reset(adev))
return -EIO; return -EIO;
acquire_queue(kgd, pipe_id, queue_id); acquire_queue(kgd, pipe_id, queue_id);

View File

@@ -539,7 +539,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp; uint32_t temp;
struct v9_mqd *m = get_mqd(mqd); struct v9_mqd *m = get_mqd(mqd);
if (adev->in_gpu_reset) if (amdgpu_in_reset(adev))
return -EIO; return -EIO;
acquire_queue(kgd, pipe_id, queue_id); acquire_queue(kgd, pipe_id, queue_id);

View File

@@ -1190,6 +1190,9 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
return -EINVAL; return -EINVAL;
} }
if (!down_read_trylock(&adev->reset_sem))
return -EIO;
*mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
if (!*mem) { if (!*mem) {
ret = -ENOMEM; ret = -ENOMEM;
@@ -1256,6 +1259,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
if (offset) if (offset)
*offset = amdgpu_bo_mmap_offset(bo); *offset = amdgpu_bo_mmap_offset(bo);
up_read(&adev->reset_sem);
return 0; return 0;
allocate_init_user_pages_failed: allocate_init_user_pages_failed:
@@ -1273,6 +1277,9 @@ err:
sg_free_table(sg); sg_free_table(sg);
kfree(sg); kfree(sg);
} }
up_read(&adev->reset_sem);
return ret; return ret;
} }

View File

@@ -1292,6 +1292,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
parser.adev = adev; parser.adev = adev;
parser.filp = filp; parser.filp = filp;
down_read(&adev->reset_sem);
r = amdgpu_cs_parser_init(&parser, data); r = amdgpu_cs_parser_init(&parser, data);
if (r) { if (r) {
DRM_ERROR("Failed to initialize parser %d!\n", r); DRM_ERROR("Failed to initialize parser %d!\n", r);
@@ -1331,6 +1333,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
out: out:
amdgpu_cs_parser_fini(&parser, r, reserved_buffers); amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
up_read(&adev->reset_sem);
return r; return r;
} }

View File

@@ -358,6 +358,8 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
if (atomic_read(&ctx->guilty)) if (atomic_read(&ctx->guilty))
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY; out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
down_read(&adev->reset_sem);
/*query ue count*/ /*query ue count*/
ras_counter = amdgpu_ras_query_error_count(adev, false); ras_counter = amdgpu_ras_query_error_count(adev, false);
/*ras counter is monotonic increasing*/ /*ras counter is monotonic increasing*/
@@ -373,6 +375,8 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
ctx->ras_counter_ce = ras_counter; ctx->ras_counter_ce = ras_counter;
} }
up_read(&adev->reset_sem);
mutex_unlock(&mgr->lock); mutex_unlock(&mgr->lock);
return 0; return 0;
} }

View File

@@ -100,14 +100,14 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
file->private_data = adev; file->private_data = adev;
mutex_lock(&adev->lock_reset); down_read(&adev->reset_sem);
if (adev->autodump.dumping.done) { if (adev->autodump.dumping.done) {
reinit_completion(&adev->autodump.dumping); reinit_completion(&adev->autodump.dumping);
ret = 0; ret = 0;
} else { } else {
ret = -EBUSY; ret = -EBUSY;
} }
mutex_unlock(&adev->lock_reset); up_read(&adev->reset_sem);
return ret; return ret;
} }
@@ -126,7 +126,7 @@ static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_
poll_wait(file, &adev->autodump.gpu_hang, poll_table); poll_wait(file, &adev->autodump.gpu_hang, poll_table);
if (adev->in_gpu_reset) if (amdgpu_in_reset(adev))
return POLLIN | POLLRDNORM | POLLWRNORM; return POLLIN | POLLRDNORM | POLLWRNORM;
return 0; return 0;
@@ -1241,7 +1241,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
} }
/* Avoid accidently unparking the sched thread during GPU reset */ /* Avoid accidently unparking the sched thread during GPU reset */
mutex_lock(&adev->lock_reset); down_read(&adev->reset_sem);
/* hold on the scheduler */ /* hold on the scheduler */
for (i = 0; i < AMDGPU_MAX_RINGS; i++) { for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
@@ -1268,7 +1268,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
kthread_unpark(ring->sched.thread); kthread_unpark(ring->sched.thread);
} }
mutex_unlock(&adev->lock_reset); up_read(&adev->reset_sem);
pm_runtime_mark_last_busy(dev->dev); pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev); pm_runtime_put_autosuspend(dev->dev);
@@ -1458,7 +1458,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM; return -ENOMEM;
/* Avoid accidently unparking the sched thread during GPU reset */ /* Avoid accidently unparking the sched thread during GPU reset */
mutex_lock(&adev->lock_reset); down_read(&adev->reset_sem);
/* stop the scheduler */ /* stop the scheduler */
kthread_park(ring->sched.thread); kthread_park(ring->sched.thread);
@@ -1499,7 +1499,7 @@ failure:
/* restart the scheduler */ /* restart the scheduler */
kthread_unpark(ring->sched.thread); kthread_unpark(ring->sched.thread);
mutex_unlock(&adev->lock_reset); up_read(&adev->reset_sem);
ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched); ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);

View File

@@ -1935,7 +1935,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
if (adev->ip_blocks[i].status.hw == true) if (adev->ip_blocks[i].status.hw == true)
break; break;
if (adev->in_gpu_reset || adev->in_suspend) { if (amdgpu_in_reset(adev) || adev->in_suspend) {
r = adev->ip_blocks[i].version->funcs->resume(adev); r = adev->ip_blocks[i].version->funcs->resume(adev);
if (r) { if (r) {
DRM_ERROR("resume of IP block <%s> failed %d\n", DRM_ERROR("resume of IP block <%s> failed %d\n",
@@ -2106,7 +2106,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
AMDGPU_RESET_MAGIC_NUM)) AMDGPU_RESET_MAGIC_NUM))
return true; return true;
if (!adev->in_gpu_reset) if (!amdgpu_in_reset(adev))
return false; return false;
/* /*
@@ -3036,7 +3036,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->mn_lock); mutex_init(&adev->mn_lock);
mutex_init(&adev->virt.vf_errors.lock); mutex_init(&adev->virt.vf_errors.lock);
hash_init(adev->mn_hash); hash_init(adev->mn_hash);
mutex_init(&adev->lock_reset); init_rwsem(&adev->reset_sem);
atomic_set(&adev->in_gpu_reset, 0);
mutex_init(&adev->psp.mutex); mutex_init(&adev->psp.mutex);
mutex_init(&adev->notifier_lock); mutex_init(&adev->notifier_lock);
@@ -4064,8 +4065,11 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
if (need_full_reset) { if (need_full_reset) {
/* post card */ /* post card */
if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) {
DRM_WARN("asic atom init failed!"); dev_warn(tmp_adev->dev, "asic atom init failed!");
r = -EAGAIN;
goto out;
}
if (!r) { if (!r) {
dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
@@ -4141,16 +4145,14 @@ end:
return r; return r;
} }
static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock) static bool amdgpu_device_lock_adev(struct amdgpu_device *adev)
{ {
if (trylock) { if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
if (!mutex_trylock(&adev->lock_reset)) return false;
return false;
} else down_write(&adev->reset_sem);
mutex_lock(&adev->lock_reset);
atomic_inc(&adev->gpu_reset_counter); atomic_inc(&adev->gpu_reset_counter);
adev->in_gpu_reset = true;
switch (amdgpu_asic_reset_method(adev)) { switch (amdgpu_asic_reset_method(adev)) {
case AMD_RESET_METHOD_MODE1: case AMD_RESET_METHOD_MODE1:
adev->mp1_state = PP_MP1_STATE_SHUTDOWN; adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
@@ -4170,8 +4172,8 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
{ {
amdgpu_vf_error_trans_all(adev); amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE; adev->mp1_state = PP_MP1_STATE_NONE;
adev->in_gpu_reset = false; atomic_set(&adev->in_gpu_reset, 0);
mutex_unlock(&adev->lock_reset); up_write(&adev->reset_sem);
} }
static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
@@ -4281,12 +4283,14 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
* We always reset all schedulers for device and all devices for XGMI * We always reset all schedulers for device and all devices for XGMI
* hive so that should take care of them too. * hive so that should take care of them too.
*/ */
hive = amdgpu_get_xgmi_hive(adev, true); hive = amdgpu_get_xgmi_hive(adev, false);
if (hive && !mutex_trylock(&hive->reset_lock)) { if (hive) {
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
job ? job->base.id : -1, hive->hive_id); DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
mutex_unlock(&hive->hive_lock); job ? job->base.id : -1, hive->hive_id);
return 0; return 0;
}
mutex_lock(&hive->hive_lock);
} }
/* /*
@@ -4308,11 +4312,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* block all schedulers and reset given job's ring */ /* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
if (!amdgpu_device_lock_adev(tmp_adev, !hive)) { if (!amdgpu_device_lock_adev(tmp_adev)) {
DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress", DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
job ? job->base.id : -1); job ? job->base.id : -1);
mutex_unlock(&hive->hive_lock); r = 0;
return 0; goto skip_recovery;
} }
/* /*
@@ -4445,8 +4449,9 @@ skip_sched_resume:
amdgpu_device_unlock_adev(tmp_adev); amdgpu_device_unlock_adev(tmp_adev);
} }
skip_recovery:
if (hive) { if (hive) {
mutex_unlock(&hive->reset_lock); atomic_set(&hive->in_reset, 0);
mutex_unlock(&hive->hive_lock); mutex_unlock(&hive->hive_lock);
} }

View File

@@ -671,6 +671,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
bo_va = NULL; bo_va = NULL;
} }
down_read(&adev->reset_sem);
switch (args->operation) { switch (args->operation) {
case AMDGPU_VA_OP_MAP: case AMDGPU_VA_OP_MAP:
va_flags = amdgpu_gem_va_map_flags(adev, args->flags); va_flags = amdgpu_gem_va_map_flags(adev, args->flags);
@@ -700,6 +702,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
amdgpu_gem_va_update_vm(adev, &fpriv->vm, bo_va, amdgpu_gem_va_update_vm(adev, &fpriv->vm, bo_va,
args->operation); args->operation);
up_read(&adev->reset_sem);
error_backoff: error_backoff:
ttm_eu_backoff_reservation(&ticket, &list); ttm_eu_backoff_reservation(&ticket, &list);

View File

@@ -724,7 +724,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
* *
* also don't wait anymore for IRQ context * also don't wait anymore for IRQ context
* */ * */
if (r < 1 && (adev->in_gpu_reset || in_interrupt())) if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt()))
goto failed_kiq_read; goto failed_kiq_read;
might_sleep(); might_sleep();
@@ -782,7 +782,7 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
* *
* also don't wait anymore for IRQ context * also don't wait anymore for IRQ context
* */ * */
if (r < 1 && (adev->in_gpu_reset || in_interrupt())) if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt()))
goto failed_kiq_write; goto failed_kiq_write;
might_sleep(); might_sleep();
@@ -801,5 +801,5 @@ failed_undo:
amdgpu_ring_undo(ring); amdgpu_ring_undo(ring);
spin_unlock_irqrestore(&kiq->ring_lock, flags); spin_unlock_irqrestore(&kiq->ring_lock, flags);
failed_kiq_write: failed_kiq_write:
pr_err("failed to write reg:%x\n", reg); dev_warn(adev->dev, "failed to write reg:%x\n", reg);
} }

View File

@@ -220,17 +220,17 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
trace_amdgpu_sched_run_job(job); trace_amdgpu_sched_run_job(job);
if (job->vram_lost_counter != atomic_read(&ring->adev->vram_lost_counter)) if (down_read_trylock(&ring->adev->reset_sem)) {
dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
if (finished->error < 0) {
DRM_INFO("Skip scheduling IBs!\n");
} else {
r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job, r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
&fence); &fence);
up_read(&ring->adev->reset_sem);
if (r) if (r)
DRM_ERROR("Error scheduling IBs (%d)\n", r); DRM_ERROR("Error scheduling IBs (%d)\n", r);
} else {
dma_fence_set_error(finished, -ECANCELED);
DRM_INFO("Skip scheduling IBs!\n");
} }
/* if gpu reset, hw fence will be replaced here */ /* if gpu reset, hw fence will be replaced here */
dma_fence_put(job->fence); dma_fence_put(job->fence);
job->fence = dma_fence_get(fence); job->fence = dma_fence_get(fence);

View File

@@ -1084,6 +1084,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
if (!fpriv) if (!fpriv)
return; return;
down_read(&adev->reset_sem);
pm_runtime_get_sync(dev->dev); pm_runtime_get_sync(dev->dev);
if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_UVD) != NULL) if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_UVD) != NULL)
@@ -1122,6 +1124,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
pm_runtime_mark_last_busy(dev->dev); pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev); pm_runtime_put_autosuspend(dev->dev);
up_read(&adev->reset_sem);
} }
/* /*

File diff suppressed because it is too large Load Diff

View File

@@ -1684,7 +1684,7 @@ static int psp_load_smu_fw(struct psp_context *psp)
return 0; return 0;
if (adev->in_gpu_reset && ras && ras->supported) { if (amdgpu_in_reset(adev) && ras && ras->supported) {
ret = amdgpu_dpm_set_mp1_state(adev, PP_MP1_STATE_UNLOAD); ret = amdgpu_dpm_set_mp1_state(adev, PP_MP1_STATE_UNLOAD);
if (ret) { if (ret) {
DRM_WARN("Failed to set MP1 state prepare for reload\n"); DRM_WARN("Failed to set MP1 state prepare for reload\n");
@@ -1799,7 +1799,7 @@ static int psp_load_fw(struct amdgpu_device *adev)
int ret; int ret;
struct psp_context *psp = &adev->psp; struct psp_context *psp = &adev->psp;
if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset) { if (amdgpu_sriov_vf(adev) && amdgpu_in_reset(adev)) {
psp_ring_stop(psp, PSP_RING_TYPE__KM); /* should not destroy ring, only stop */ psp_ring_stop(psp, PSP_RING_TYPE__KM); /* should not destroy ring, only stop */
goto skip_memalloc; goto skip_memalloc;
} }

View File

@@ -1978,7 +1978,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
amdgpu_ras_request_reset_on_boot(adev, amdgpu_ras_request_reset_on_boot(adev,
ras_block->block); ras_block->block);
return 0; return 0;
} else if (adev->in_suspend || adev->in_gpu_reset) { } else if (adev->in_suspend || amdgpu_in_reset(adev)) {
/* in resume phase, if fail to enable ras, /* in resume phase, if fail to enable ras,
* clean up all ras fs nodes, and disable ras */ * clean up all ras fs nodes, and disable ras */
goto cleanup; goto cleanup;
@@ -1987,7 +1987,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
} }
/* in resume phase, no need to create ras fs node */ /* in resume phase, no need to create ras fs node */
if (adev->in_suspend || adev->in_gpu_reset) if (adev->in_suspend || amdgpu_in_reset(adev))
return 0; return 0;
if (ih_info->cb) { if (ih_info->cb) {

View File

@@ -2088,7 +2088,7 @@ void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable)
uint64_t size; uint64_t size;
int r; int r;
if (!adev->mman.initialized || adev->in_gpu_reset || if (!adev->mman.initialized || amdgpu_in_reset(adev) ||
adev->mman.buffer_funcs_enabled == enable) adev->mman.buffer_funcs_enabled == enable)
return; return;

View File

@@ -628,7 +628,8 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
struct amdgpu_firmware_info *ucode = NULL; struct amdgpu_firmware_info *ucode = NULL;
/* for baremetal, the ucode is allocated in gtt, so don't need to fill the bo when reset/suspend */ /* for baremetal, the ucode is allocated in gtt, so don't need to fill the bo when reset/suspend */
if (!amdgpu_sriov_vf(adev) && (adev->in_gpu_reset || adev->in_suspend)) if (!amdgpu_sriov_vf(adev) &&
(amdgpu_in_reset(adev) || adev->in_suspend))
return 0; return 0;
/* /*
* if SMU loaded firmware, it needn't add SMC, UVD, and VCE * if SMU loaded firmware, it needn't add SMC, UVD, and VCE

View File

@@ -93,7 +93,7 @@ failed_undo:
amdgpu_ring_undo(ring); amdgpu_ring_undo(ring);
spin_unlock_irqrestore(&kiq->ring_lock, flags); spin_unlock_irqrestore(&kiq->ring_lock, flags);
failed_kiq: failed_kiq:
pr_err("failed to write reg %x wait reg %x\n", reg0, reg1); dev_warn(adev->dev, "failed to write reg %x wait reg %x\n", reg0, reg1);
} }
/** /**

View File

@@ -325,9 +325,9 @@ static inline bool is_virtual_machine(void)
#define amdgpu_sriov_is_pp_one_vf(adev) \ #define amdgpu_sriov_is_pp_one_vf(adev) \
((adev)->virt.gim_feature & AMDGIM_FEATURE_PP_ONE_VF) ((adev)->virt.gim_feature & AMDGIM_FEATURE_PP_ONE_VF)
#define amdgpu_sriov_is_debug(adev) \ #define amdgpu_sriov_is_debug(adev) \
((!adev->in_gpu_reset) && adev->virt.tdr_debug) ((!amdgpu_in_reset(adev)) && adev->virt.tdr_debug)
#define amdgpu_sriov_is_normal(adev) \ #define amdgpu_sriov_is_normal(adev) \
((!adev->in_gpu_reset) && (!adev->virt.tdr_debug)) ((!amdgpu_in_reset(adev)) && (!adev->virt.tdr_debug))
bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev); bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev);
void amdgpu_virt_init_setting(struct amdgpu_device *adev); void amdgpu_virt_init_setting(struct amdgpu_device *adev);

View File

@@ -372,7 +372,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
tmp->hive_id = adev->gmc.xgmi.hive_id; tmp->hive_id = adev->gmc.xgmi.hive_id;
INIT_LIST_HEAD(&tmp->device_list); INIT_LIST_HEAD(&tmp->device_list);
mutex_init(&tmp->hive_lock); mutex_init(&tmp->hive_lock);
mutex_init(&tmp->reset_lock); atomic_set(&tmp->in_reset, 0);
task_barrier_init(&tmp->tb); task_barrier_init(&tmp->tb);
if (lock) if (lock)
@@ -397,6 +397,7 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
hive->hi_req_gpu : adev; hive->hi_req_gpu : adev;
bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20; bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
bool init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN; bool init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
bool locked;
/* fw bug so temporarily disable pstate switching */ /* fw bug so temporarily disable pstate switching */
return 0; return 0;
@@ -404,7 +405,9 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
if (!hive || adev->asic_type != CHIP_VEGA20) if (!hive || adev->asic_type != CHIP_VEGA20)
return 0; return 0;
mutex_lock(&hive->hive_lock); locked = atomic_read(&hive->in_reset) ? false : true;
if (locked)
mutex_lock(&hive->hive_lock);
if (is_hi_req) if (is_hi_req)
hive->hi_req_count++; hive->hi_req_count++;
@@ -439,7 +442,8 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
adev : NULL; adev : NULL;
} }
out: out:
mutex_unlock(&hive->hive_lock); if (locked)
mutex_unlock(&hive->hive_lock);
return ret; return ret;
} }
@@ -594,7 +598,6 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
if(!(--hive->number_devices)){ if(!(--hive->number_devices)){
amdgpu_xgmi_sysfs_destroy(adev, hive); amdgpu_xgmi_sysfs_destroy(adev, hive);
mutex_destroy(&hive->hive_lock); mutex_destroy(&hive->hive_lock);
mutex_destroy(&hive->reset_lock);
} }
return psp_xgmi_terminate(&adev->psp); return psp_xgmi_terminate(&adev->psp);

View File

@@ -30,7 +30,8 @@ struct amdgpu_hive_info {
uint64_t hive_id; uint64_t hive_id;
struct list_head device_list; struct list_head device_list;
int number_devices; int number_devices;
struct mutex hive_lock, reset_lock; struct mutex hive_lock;
atomic_t in_reset;
struct kobject *kobj; struct kobject *kobj;
struct device_attribute dev_attr; struct device_attribute dev_attr;
struct amdgpu_device *adev; struct amdgpu_device *adev;

View File

@@ -755,6 +755,7 @@ static void atom_op_jump(atom_exec_context *ctx, int *ptr, int arg)
/* jiffies wrap around we will just wait a little longer */ /* jiffies wrap around we will just wait a little longer */
ctx->last_jump_jiffies = jiffies; ctx->last_jump_jiffies = jiffies;
} }
schedule();
} else { } else {
ctx->last_jump = ctx->start + target; ctx->last_jump = ctx->start + target;
ctx->last_jump_jiffies = jiffies; ctx->last_jump_jiffies = jiffies;

View File

@@ -6180,7 +6180,7 @@ static int gfx_v10_0_gfx_init_queue(struct amdgpu_ring *ring)
struct v10_gfx_mqd *mqd = ring->mqd_ptr; struct v10_gfx_mqd *mqd = ring->mqd_ptr;
int mqd_idx = ring - &adev->gfx.gfx_ring[0]; int mqd_idx = ring - &adev->gfx.gfx_ring[0];
if (!adev->in_gpu_reset && !adev->in_suspend) { if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
memset((void *)mqd, 0, sizeof(*mqd)); memset((void *)mqd, 0, sizeof(*mqd));
mutex_lock(&adev->srbm_mutex); mutex_lock(&adev->srbm_mutex);
nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0); nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
@@ -6192,7 +6192,7 @@ static int gfx_v10_0_gfx_init_queue(struct amdgpu_ring *ring)
mutex_unlock(&adev->srbm_mutex); mutex_unlock(&adev->srbm_mutex);
if (adev->gfx.me.mqd_backup[mqd_idx]) if (adev->gfx.me.mqd_backup[mqd_idx])
memcpy(adev->gfx.me.mqd_backup[mqd_idx], mqd, sizeof(*mqd)); memcpy(adev->gfx.me.mqd_backup[mqd_idx], mqd, sizeof(*mqd));
} else if (adev->in_gpu_reset) { } else if (amdgpu_in_reset(adev)) {
/* reset mqd with the backup copy */ /* reset mqd with the backup copy */
if (adev->gfx.me.mqd_backup[mqd_idx]) if (adev->gfx.me.mqd_backup[mqd_idx])
memcpy(mqd, adev->gfx.me.mqd_backup[mqd_idx], sizeof(*mqd)); memcpy(mqd, adev->gfx.me.mqd_backup[mqd_idx], sizeof(*mqd));
@@ -6541,7 +6541,7 @@ static int gfx_v10_0_kiq_init_queue(struct amdgpu_ring *ring)
gfx_v10_0_kiq_setting(ring); gfx_v10_0_kiq_setting(ring);
if (adev->in_gpu_reset) { /* for GPU_RESET case */ if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
/* reset MQD to a clean status */ /* reset MQD to a clean status */
if (adev->gfx.mec.mqd_backup[mqd_idx]) if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd)); memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd));
@@ -6577,7 +6577,7 @@ static int gfx_v10_0_kcq_init_queue(struct amdgpu_ring *ring)
struct v10_compute_mqd *mqd = ring->mqd_ptr; struct v10_compute_mqd *mqd = ring->mqd_ptr;
int mqd_idx = ring - &adev->gfx.compute_ring[0]; int mqd_idx = ring - &adev->gfx.compute_ring[0];
if (!adev->in_gpu_reset && !adev->in_suspend) { if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
memset((void *)mqd, 0, sizeof(*mqd)); memset((void *)mqd, 0, sizeof(*mqd));
mutex_lock(&adev->srbm_mutex); mutex_lock(&adev->srbm_mutex);
nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0); nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
@@ -6587,7 +6587,7 @@ static int gfx_v10_0_kcq_init_queue(struct amdgpu_ring *ring)
if (adev->gfx.mec.mqd_backup[mqd_idx]) if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(*mqd)); memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(*mqd));
} else if (adev->in_gpu_reset) { /* for GPU_RESET case */ } else if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
/* reset MQD to a clean status */ /* reset MQD to a clean status */
if (adev->gfx.mec.mqd_backup[mqd_idx]) if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd)); memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd));

View File

@@ -4632,7 +4632,7 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring)
gfx_v8_0_kiq_setting(ring); gfx_v8_0_kiq_setting(ring);
if (adev->in_gpu_reset) { /* for GPU_RESET case */ if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
/* reset MQD to a clean status */ /* reset MQD to a clean status */
if (adev->gfx.mec.mqd_backup[mqd_idx]) if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation)); memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation));
@@ -4669,7 +4669,7 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring)
struct vi_mqd *mqd = ring->mqd_ptr; struct vi_mqd *mqd = ring->mqd_ptr;
int mqd_idx = ring - &adev->gfx.compute_ring[0]; int mqd_idx = ring - &adev->gfx.compute_ring[0];
if (!adev->in_gpu_reset && !adev->in_suspend) { if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
memset((void *)mqd, 0, sizeof(struct vi_mqd_allocation)); memset((void *)mqd, 0, sizeof(struct vi_mqd_allocation));
((struct vi_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF; ((struct vi_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF;
((struct vi_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF; ((struct vi_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF;
@@ -4681,7 +4681,7 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring)
if (adev->gfx.mec.mqd_backup[mqd_idx]) if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct vi_mqd_allocation)); memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct vi_mqd_allocation));
} else if (adev->in_gpu_reset) { /* for GPU_RESET case */ } else if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
/* reset MQD to a clean status */ /* reset MQD to a clean status */
if (adev->gfx.mec.mqd_backup[mqd_idx]) if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation)); memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation));

View File

@@ -3684,7 +3684,7 @@ static int gfx_v9_0_kiq_init_queue(struct amdgpu_ring *ring)
gfx_v9_0_kiq_setting(ring); gfx_v9_0_kiq_setting(ring);
if (adev->in_gpu_reset) { /* for GPU_RESET case */ if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
/* reset MQD to a clean status */ /* reset MQD to a clean status */
if (adev->gfx.mec.mqd_backup[mqd_idx]) if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation)); memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation));
@@ -3722,7 +3722,7 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring)
struct v9_mqd *mqd = ring->mqd_ptr; struct v9_mqd *mqd = ring->mqd_ptr;
int mqd_idx = ring - &adev->gfx.compute_ring[0]; int mqd_idx = ring - &adev->gfx.compute_ring[0];
if (!adev->in_gpu_reset && !adev->in_suspend) { if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
memset((void *)mqd, 0, sizeof(struct v9_mqd_allocation)); memset((void *)mqd, 0, sizeof(struct v9_mqd_allocation));
((struct v9_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF; ((struct v9_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF;
((struct v9_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF; ((struct v9_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF;
@@ -3734,7 +3734,7 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring)
if (adev->gfx.mec.mqd_backup[mqd_idx]) if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct v9_mqd_allocation)); memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct v9_mqd_allocation));
} else if (adev->in_gpu_reset) { /* for GPU_RESET case */ } else if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
/* reset MQD to a clean status */ /* reset MQD to a clean status */
if (adev->gfx.mec.mqd_backup[mqd_idx]) if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation)); memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation));
@@ -3928,7 +3928,7 @@ static int gfx_v9_0_hw_fini(void *handle)
/* Use deinitialize sequence from CAIL when unbinding device from driver, /* Use deinitialize sequence from CAIL when unbinding device from driver,
* otherwise KIQ is hanging when binding back * otherwise KIQ is hanging when binding back
*/ */
if (!adev->in_gpu_reset && !adev->in_suspend) { if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
mutex_lock(&adev->srbm_mutex); mutex_lock(&adev->srbm_mutex);
soc15_grbm_select(adev, adev->gfx.kiq.ring.me, soc15_grbm_select(adev, adev->gfx.kiq.ring.me,
adev->gfx.kiq.ring.pipe, adev->gfx.kiq.ring.pipe,
@@ -4086,7 +4086,7 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
* *
* also don't wait anymore for IRQ context * also don't wait anymore for IRQ context
* */ * */
if (r < 1 && (adev->in_gpu_reset || in_interrupt())) if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt()))
goto failed_kiq_read; goto failed_kiq_read;
might_sleep(); might_sleep();

View File

@@ -268,7 +268,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
*/ */
if (adev->gfx.kiq.ring.sched.ready && if (adev->gfx.kiq.ring.sched.ready &&
(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) && (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
!adev->in_gpu_reset) { !amdgpu_in_reset(adev)) {
struct amdgpu_vmhub *hub = &adev->vmhub[vmhub]; struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
const unsigned eng = 17; const unsigned eng = 17;
@@ -293,7 +293,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
if (!adev->mman.buffer_funcs_enabled || if (!adev->mman.buffer_funcs_enabled ||
!adev->ib_pool_ready || !adev->ib_pool_ready ||
adev->in_gpu_reset || amdgpu_in_reset(adev) ||
ring->sched.ready == false) { ring->sched.ready == false) {
gmc_v10_0_flush_vm_hub(adev, vmid, AMDGPU_GFXHUB_0, 0); gmc_v10_0_flush_vm_hub(adev, vmid, AMDGPU_GFXHUB_0, 0);
mutex_unlock(&adev->mman.gtt_window_lock); mutex_unlock(&adev->mman.gtt_window_lock);

View File

@@ -434,7 +434,7 @@ static int gmc_v7_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
int vmid; int vmid;
unsigned int tmp; unsigned int tmp;
if (adev->in_gpu_reset) if (amdgpu_in_reset(adev))
return -EIO; return -EIO;
for (vmid = 1; vmid < 16; vmid++) { for (vmid = 1; vmid < 16; vmid++) {

View File

@@ -635,7 +635,7 @@ static int gmc_v8_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
int vmid; int vmid;
unsigned int tmp; unsigned int tmp;
if (adev->in_gpu_reset) if (amdgpu_in_reset(adev))
return -EIO; return -EIO;
for (vmid = 1; vmid < 16; vmid++) { for (vmid = 1; vmid < 16; vmid++) {

View File

@@ -501,7 +501,7 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
*/ */
if (adev->gfx.kiq.ring.sched.ready && if (adev->gfx.kiq.ring.sched.ready &&
(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) && (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
!adev->in_gpu_reset) { !amdgpu_in_reset(adev)) {
uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng; uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng; uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
@@ -596,7 +596,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
struct amdgpu_ring *ring = &adev->gfx.kiq.ring; struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
struct amdgpu_kiq *kiq = &adev->gfx.kiq; struct amdgpu_kiq *kiq = &adev->gfx.kiq;
if (adev->in_gpu_reset) if (amdgpu_in_reset(adev))
return -EIO; return -EIO;
if (ring->sched.ready) { if (ring->sched.ready) {
@@ -633,7 +633,8 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
spin_unlock(&adev->gfx.kiq.ring_lock); spin_unlock(&adev->gfx.kiq.ring_lock);
r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout); r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
if (r < 1) { if (r < 1) {
DRM_ERROR("wait for kiq fence error: %ld.\n", r); dev_info(adev->dev,
"wait for kiq fence error: %ld\n", r);
return -ETIME; return -ETIME;
} }

View File

@@ -238,20 +238,16 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
int locked;
/* block amdgpu_gpu_recover till msg FLR COMPLETE received, /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
* otherwise the mailbox msg will be ruined/reseted by * otherwise the mailbox msg will be ruined/reseted by
* the VF FLR. * the VF FLR.
* *
* we can unlock the lock_reset to allow "amdgpu_job_timedout" * we can unlock the reset_sem to allow "amdgpu_job_timedout"
* to run gpu_recover() after FLR_NOTIFICATION_CMPL received * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
* which means host side had finished this VF's FLR. * which means host side had finished this VF's FLR.
*/ */
locked = mutex_trylock(&adev->lock_reset); down_read(&adev->reset_sem);
if (locked)
adev->in_gpu_reset = true;
do { do {
if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL) if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
goto flr_done; goto flr_done;
@@ -261,10 +257,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
} while (timeout > 1); } while (timeout > 1);
flr_done: flr_done:
if (locked) { up_read(&adev->reset_sem);
adev->in_gpu_reset = false;
mutex_unlock(&adev->lock_reset);
}
/* Trigger recovery for world switch failure if no TDR */ /* Trigger recovery for world switch failure if no TDR */
if (amdgpu_device_should_recover_gpu(adev) if (amdgpu_device_should_recover_gpu(adev)

View File

@@ -259,20 +259,16 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
int locked;
/* block amdgpu_gpu_recover till msg FLR COMPLETE received, /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
* otherwise the mailbox msg will be ruined/reseted by * otherwise the mailbox msg will be ruined/reseted by
* the VF FLR. * the VF FLR.
* *
* we can unlock the lock_reset to allow "amdgpu_job_timedout" * we can unlock the reset_sem to allow "amdgpu_job_timedout"
* to run gpu_recover() after FLR_NOTIFICATION_CMPL received * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
* which means host side had finished this VF's FLR. * which means host side had finished this VF's FLR.
*/ */
locked = mutex_trylock(&adev->lock_reset); down_read(&adev->reset_sem);
if (locked)
adev->in_gpu_reset = true;
do { do {
if (xgpu_nv_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL) if (xgpu_nv_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
goto flr_done; goto flr_done;
@@ -282,10 +278,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
} while (timeout > 1); } while (timeout > 1);
flr_done: flr_done:
if (locked) { up_read(&adev->reset_sem);
adev->in_gpu_reset = false;
mutex_unlock(&adev->lock_reset);
}
/* Trigger recovery for world switch failure if no TDR */ /* Trigger recovery for world switch failure if no TDR */
if (amdgpu_device_should_recover_gpu(adev) if (amdgpu_device_should_recover_gpu(adev)

View File

@@ -304,15 +304,17 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
struct qcm_process_device *qpd, struct qcm_process_device *qpd,
struct queue *q) struct queue *q)
{ {
/* On GFX v7, CP doesn't flush TC at dequeue */ if (!dqm->is_resetting) {
if (q->device->device_info->asic_family == CHIP_HAWAII) /* On GFX v7, CP doesn't flush TC at dequeue */
if (flush_texture_cache_nocpsch(q->device, qpd)) if (q->device->device_info->asic_family == CHIP_HAWAII)
pr_err("Failed to flush TC\n"); if (flush_texture_cache_nocpsch(q->device, qpd))
pr_err("Failed to flush TC\n");
kfd_flush_tlb(qpd_to_pdd(qpd)); kfd_flush_tlb(qpd_to_pdd(qpd));
/* Release the vmid mapping */ /* Release the vmid mapping */
set_pasid_vmid_mapping(dqm, 0, qpd->vmid); set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
}
dqm->vmid_pasid[qpd->vmid] = 0; dqm->vmid_pasid[qpd->vmid] = 0;
qpd->vmid = 0; qpd->vmid = 0;

View File

@@ -1551,6 +1551,10 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process,
void kfd_flush_tlb(struct kfd_process_device *pdd) void kfd_flush_tlb(struct kfd_process_device *pdd)
{ {
struct kfd_dev *dev = pdd->dev; struct kfd_dev *dev = pdd->dev;
struct device_queue_manager *dqm = dev->dqm;
if (dqm->is_resetting)
return;
if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) { if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
/* Nothing to flush until a VMID is assigned, which /* Nothing to flush until a VMID is assigned, which

View File

@@ -1647,7 +1647,7 @@ static int dm_suspend(void *handle)
struct amdgpu_display_manager *dm = &adev->dm; struct amdgpu_display_manager *dm = &adev->dm;
int ret = 0; int ret = 0;
if (adev->in_gpu_reset) { if (amdgpu_in_reset(adev)) {
mutex_lock(&dm->dc_lock); mutex_lock(&dm->dc_lock);
dm->cached_dc_state = dc_copy_state(dm->dc->current_state); dm->cached_dc_state = dc_copy_state(dm->dc->current_state);
@@ -1833,7 +1833,7 @@ static int dm_resume(void *handle)
struct dc_state *dc_state; struct dc_state *dc_state;
int i, r, j; int i, r, j;
if (adev->in_gpu_reset) { if (amdgpu_in_reset(adev)) {
dc_state = dm->cached_dc_state; dc_state = dm->cached_dc_state;
r = dm_dmub_hw_init(adev); r = dm_dmub_hw_init(adev);

View File

@@ -992,7 +992,7 @@ static int smu_disable_dpms(struct smu_context *smu)
struct amdgpu_device *adev = smu->adev; struct amdgpu_device *adev = smu->adev;
int ret = 0; int ret = 0;
bool use_baco = !smu->is_apu && bool use_baco = !smu->is_apu &&
((adev->in_gpu_reset && ((amdgpu_in_reset(adev) &&
(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)) || (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)) ||
((adev->in_runpm || adev->in_hibernate) && amdgpu_asic_supports_baco(adev))); ((adev->in_runpm || adev->in_hibernate) && amdgpu_asic_supports_baco(adev)));

View File

@@ -484,7 +484,7 @@ static int vega20_setup_asic_task(struct pp_hwmgr *hwmgr)
{ {
struct amdgpu_device *adev = (struct amdgpu_device *)(hwmgr->adev); struct amdgpu_device *adev = (struct amdgpu_device *)(hwmgr->adev);
int ret = 0; int ret = 0;
bool use_baco = (adev->in_gpu_reset && bool use_baco = (amdgpu_in_reset(adev) &&
(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)) || (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)) ||
(adev->in_runpm && amdgpu_asic_supports_baco(adev)); (adev->in_runpm && amdgpu_asic_supports_baco(adev));