drm/amdgpu: revert "fix system hang issue during GPU reset"

The whole approach wasn't thought through to the end.

We already had a reset lock like this in the past, and it caused the same problems as this one.

Completely revert the patch for now and add individual trylock protection to the hardware access functions as necessary.

This reverts commit df9c8d1aa2.

Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Christian König 2020-08-12 17:48:26 +02:00 committed by Alex Deucher
parent 05f39286ce
commit f1403342eb
39 changed files with 184 additions and 469 deletions

View File

@ -949,9 +949,9 @@ struct amdgpu_device {
bool in_suspend;
bool in_hibernate;
atomic_t in_gpu_reset;
bool in_gpu_reset;
enum pp_mp1_state mp1_state;
struct rw_semaphore reset_sem;
struct mutex lock_reset;
struct amdgpu_doorbell_index doorbell_index;
struct mutex notifier_lock;
@ -1266,9 +1266,4 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
return adev->gmc.tmz_enabled;
}
static inline bool amdgpu_in_reset(struct amdgpu_device *adev)
{
return atomic_read(&adev->in_gpu_reset) ? true : false;
}
#endif

View File

@ -244,14 +244,11 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
if (cp_mqd_gfx9)
bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9;
if (!down_read_trylock(&adev->reset_sem))
return -EIO;
r = amdgpu_bo_create(adev, &bp, &bo);
if (r) {
dev_err(adev->dev,
"failed to allocate BO for amdkfd (%d)\n", r);
goto err;
return r;
}
/* map the buffer */
@ -286,7 +283,6 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
amdgpu_bo_unreserve(bo);
up_read(&adev->reset_sem);
return 0;
allocate_mem_kmap_bo_failed:
@ -295,25 +291,19 @@ allocate_mem_pin_bo_failed:
amdgpu_bo_unreserve(bo);
allocate_mem_reserve_bo_failed:
amdgpu_bo_unref(&bo);
err:
up_read(&adev->reset_sem);
return r;
}
void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
{
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;
down_read(&adev->reset_sem);
amdgpu_bo_reserve(bo, true);
amdgpu_bo_kunmap(bo);
amdgpu_bo_unpin(bo);
amdgpu_bo_unreserve(bo);
amdgpu_bo_unref(&(bo));
up_read(&adev->reset_sem);
}
int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size,
@ -345,14 +335,9 @@ int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size,
void amdgpu_amdkfd_free_gws(struct kgd_dev *kgd, void *mem_obj)
{
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
struct amdgpu_bo *bo = (struct amdgpu_bo *)mem_obj;
down_read(&adev->reset_sem);
amdgpu_bo_unref(&bo);
up_read(&adev->reset_sem);
}
uint32_t amdgpu_amdkfd_get_fw_version(struct kgd_dev *kgd,
@ -626,15 +611,8 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
/* This works for NO_HWS. TODO: need to handle without knowing VMID */
job->vmid = vmid;
if (!down_read_trylock(&adev->reset_sem)) {
ret = -EIO;
goto err_ib_sched;
}
ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
up_read(&adev->reset_sem);
if (ret) {
DRM_ERROR("amdgpu: failed to schedule IB.\n");
goto err_ib_sched;
@ -670,9 +648,6 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
{
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
if (!down_read_trylock(&adev->reset_sem))
return -EIO;
if (adev->family == AMDGPU_FAMILY_AI) {
int i;
@ -682,8 +657,6 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
}
up_read(&adev->reset_sem);
return 0;
}
@ -692,18 +665,11 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, uint16_t pasid)
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
const uint32_t flush_type = 0;
bool all_hub = false;
int ret = -EIO;
if (adev->family == AMDGPU_FAMILY_AI)
all_hub = true;
if (down_read_trylock(&adev->reset_sem)) {
ret = amdgpu_gmc_flush_gpu_tlb_pasid(adev,
pasid, flush_type, all_hub);
up_read(&adev->reset_sem);
}
return ret;
return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
}
bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)

View File

@ -542,7 +542,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v10_compute_mqd *m = get_mqd(mqd);
if (amdgpu_in_reset(adev))
if (adev->in_gpu_reset)
return -EIO;
#if 0

View File

@ -423,7 +423,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
unsigned long flags, end_jiffies;
int retry;
if (amdgpu_in_reset(adev))
if (adev->in_gpu_reset)
return -EIO;
acquire_queue(kgd, pipe_id, queue_id);

View File

@ -419,7 +419,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
int retry;
struct vi_mqd *m = get_mqd(mqd);
if (amdgpu_in_reset(adev))
if (adev->in_gpu_reset)
return -EIO;
acquire_queue(kgd, pipe_id, queue_id);

View File

@ -539,7 +539,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v9_mqd *m = get_mqd(mqd);
if (amdgpu_in_reset(adev))
if (adev->in_gpu_reset)
return -EIO;
acquire_queue(kgd, pipe_id, queue_id);

View File

@ -1194,9 +1194,6 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
return -EINVAL;
}
if (!down_read_trylock(&adev->reset_sem))
return -EIO;
*mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
if (!*mem) {
ret = -ENOMEM;
@ -1263,7 +1260,6 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
if (offset)
*offset = amdgpu_bo_mmap_offset(bo);
up_read(&adev->reset_sem);
return 0;
allocate_init_user_pages_failed:
@ -1281,9 +1277,6 @@ err:
sg_free_table(sg);
kfree(sg);
}
up_read(&adev->reset_sem);
return ret;
}

View File

@ -1292,8 +1292,6 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
parser.adev = adev;
parser.filp = filp;
down_read(&adev->reset_sem);
r = amdgpu_cs_parser_init(&parser, data);
if (r) {
DRM_ERROR("Failed to initialize parser %d!\n", r);
@ -1333,8 +1331,6 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
out:
amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
up_read(&adev->reset_sem);
return r;
}

View File

@ -358,8 +358,6 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
if (atomic_read(&ctx->guilty))
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
down_read(&adev->reset_sem);
/*query ue count*/
ras_counter = amdgpu_ras_query_error_count(adev, false);
/*ras counter is monotonic increasing*/
@ -375,8 +373,6 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
ctx->ras_counter_ce = ras_counter;
}
up_read(&adev->reset_sem);
mutex_unlock(&mgr->lock);
return 0;
}

View File

@ -101,14 +101,14 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
file->private_data = adev;
down_read(&adev->reset_sem);
mutex_lock(&adev->lock_reset);
if (adev->autodump.dumping.done) {
reinit_completion(&adev->autodump.dumping);
ret = 0;
} else {
ret = -EBUSY;
}
up_read(&adev->reset_sem);
mutex_unlock(&adev->lock_reset);
return ret;
}
@ -127,7 +127,7 @@ static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_
poll_wait(file, &adev->autodump.gpu_hang, poll_table);
if (amdgpu_in_reset(adev))
if (adev->in_gpu_reset)
return POLLIN | POLLRDNORM | POLLWRNORM;
return 0;
@ -1242,7 +1242,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
}
/* Avoid accidentally unparking the sched thread during GPU reset */
down_read(&adev->reset_sem);
mutex_lock(&adev->lock_reset);
/* hold on the scheduler */
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
@ -1269,7 +1269,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
kthread_unpark(ring->sched.thread);
}
up_read(&adev->reset_sem);
mutex_unlock(&adev->lock_reset);
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
@ -1459,7 +1459,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
/* Avoid accidentally unparking the sched thread during GPU reset */
down_read(&adev->reset_sem);
mutex_lock(&adev->lock_reset);
/* stop the scheduler */
kthread_park(ring->sched.thread);
@ -1500,7 +1500,7 @@ failure:
/* restart the scheduler */
kthread_unpark(ring->sched.thread);
up_read(&adev->reset_sem);
mutex_unlock(&adev->lock_reset);
ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);

View File

@ -1940,7 +1940,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
if (adev->ip_blocks[i].status.hw == true)
break;
if (amdgpu_in_reset(adev) || adev->in_suspend) {
if (adev->in_gpu_reset || adev->in_suspend) {
r = adev->ip_blocks[i].version->funcs->resume(adev);
if (r) {
DRM_ERROR("resume of IP block <%s> failed %d\n",
@ -2117,7 +2117,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
AMDGPU_RESET_MAGIC_NUM))
return true;
if (!amdgpu_in_reset(adev))
if (!adev->in_gpu_reset)
return false;
/*
@ -3053,8 +3053,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->mn_lock);
mutex_init(&adev->virt.vf_errors.lock);
hash_init(adev->mn_hash);
init_rwsem(&adev->reset_sem);
atomic_set(&adev->in_gpu_reset, 0);
mutex_init(&adev->lock_reset);
mutex_init(&adev->psp.mutex);
mutex_init(&adev->notifier_lock);
@ -4082,11 +4081,8 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
if (need_full_reset) {
/* post card */
if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) {
dev_warn(tmp_adev->dev, "asic atom init failed!");
r = -EAGAIN;
goto out;
}
if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
DRM_WARN("asic atom init failed!");
if (!r) {
dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
@ -4176,18 +4172,16 @@ end:
return r;
}
static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
{
if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
if (trylock) {
if (!mutex_trylock(&adev->lock_reset))
return false;
if (hive) {
down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
} else {
down_write(&adev->reset_sem);
}
} else
mutex_lock(&adev->lock_reset);
atomic_inc(&adev->gpu_reset_counter);
adev->in_gpu_reset = true;
switch (amdgpu_asic_reset_method(adev)) {
case AMD_RESET_METHOD_MODE1:
adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
@ -4207,8 +4201,8 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
{
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
atomic_set(&adev->in_gpu_reset, 0);
up_write(&adev->reset_sem);
adev->in_gpu_reset = false;
mutex_unlock(&adev->lock_reset);
}
static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
@ -4318,15 +4312,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
* We always reset all schedulers for device and all devices for XGMI
* hive so that should take care of them too.
*/
hive = amdgpu_get_xgmi_hive(adev, false);
if (hive) {
if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
hive = amdgpu_get_xgmi_hive(adev, true);
if (hive && !mutex_trylock(&hive->reset_lock)) {
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
job ? job->base.id : -1, hive->hive_id);
mutex_unlock(&hive->hive_lock);
return 0;
}
mutex_lock(&hive->hive_lock);
}
/*
* Build list of devices to reset.
@ -4347,11 +4339,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
job ? job->base.id : -1);
r = 0;
goto skip_recovery;
mutex_unlock(&hive->hive_lock);
return 0;
}
/*
@ -4484,9 +4476,8 @@ skip_sched_resume:
amdgpu_device_unlock_adev(tmp_adev);
}
skip_recovery:
if (hive) {
atomic_set(&hive->in_reset, 0);
mutex_unlock(&hive->reset_lock);
mutex_unlock(&hive->hive_lock);
}

View File

@ -671,8 +671,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
bo_va = NULL;
}
down_read(&adev->reset_sem);
switch (args->operation) {
case AMDGPU_VA_OP_MAP:
va_flags = amdgpu_gem_va_map_flags(adev, args->flags);
@ -702,8 +700,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
amdgpu_gem_va_update_vm(adev, &fpriv->vm, bo_va,
args->operation);
up_read(&adev->reset_sem);
error_backoff:
ttm_eu_backoff_reservation(&ticket, &list);

View File

@ -719,7 +719,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
*
* also don't wait anymore for IRQ context
* */
if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt()))
if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
goto failed_kiq_read;
might_sleep();
@ -777,7 +777,7 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
*
* also don't wait anymore for IRQ context
* */
if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt()))
if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
goto failed_kiq_write;
might_sleep();
@ -796,5 +796,5 @@ failed_undo:
amdgpu_ring_undo(ring);
spin_unlock_irqrestore(&kiq->ring_lock, flags);
failed_kiq_write:
dev_warn(adev->dev, "failed to write reg:%x\n", reg);
pr_err("failed to write reg:%x\n", reg);
}

View File

@ -220,17 +220,17 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
trace_amdgpu_sched_run_job(job);
if (down_read_trylock(&ring->adev->reset_sem)) {
if (job->vram_lost_counter != atomic_read(&ring->adev->vram_lost_counter))
dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
if (finished->error < 0) {
DRM_INFO("Skip scheduling IBs!\n");
} else {
r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
&fence);
up_read(&ring->adev->reset_sem);
if (r)
DRM_ERROR("Error scheduling IBs (%d)\n", r);
} else {
dma_fence_set_error(finished, -ECANCELED);
DRM_INFO("Skip scheduling IBs!\n");
}
/* if gpu reset, hw fence will be replaced here */
dma_fence_put(job->fence);
job->fence = dma_fence_get(fence);

View File

@ -1087,8 +1087,6 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
if (!fpriv)
return;
down_read(&adev->reset_sem);
pm_runtime_get_sync(dev->dev);
if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_UVD) != NULL)
@ -1127,8 +1125,6 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
up_read(&adev->reset_sem);
}
/*

File diff suppressed because it is too large Load Diff

View File

@ -1869,7 +1869,7 @@ static int psp_load_smu_fw(struct psp_context *psp)
return 0;
if (amdgpu_in_reset(adev) && ras && ras->supported) {
if (adev->in_gpu_reset && ras && ras->supported) {
ret = amdgpu_dpm_set_mp1_state(adev, PP_MP1_STATE_UNLOAD);
if (ret) {
DRM_WARN("Failed to set MP1 state prepare for reload\n");
@ -1984,7 +1984,7 @@ static int psp_load_fw(struct amdgpu_device *adev)
int ret;
struct psp_context *psp = &adev->psp;
if (amdgpu_sriov_vf(adev) && amdgpu_in_reset(adev)) {
if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset) {
psp_ring_stop(psp, PSP_RING_TYPE__KM); /* should not destroy ring, only stop */
goto skip_memalloc;
}

View File

@ -2079,7 +2079,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
amdgpu_ras_request_reset_on_boot(adev,
ras_block->block);
return 0;
} else if (adev->in_suspend || amdgpu_in_reset(adev)) {
} else if (adev->in_suspend || adev->in_gpu_reset) {
/* in resume phase, if fail to enable ras,
* clean up all ras fs nodes, and disable ras */
goto cleanup;
@ -2088,7 +2088,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
}
/* in resume phase, no need to create ras fs node */
if (adev->in_suspend || amdgpu_in_reset(adev))
if (adev->in_suspend || adev->in_gpu_reset)
return 0;
if (ih_info->cb) {

View File

@ -2098,7 +2098,7 @@ void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable)
uint64_t size;
int r;
if (!adev->mman.initialized || amdgpu_in_reset(adev) ||
if (!adev->mman.initialized || adev->in_gpu_reset ||
adev->mman.buffer_funcs_enabled == enable)
return;

View File

@ -628,8 +628,7 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
struct amdgpu_firmware_info *ucode = NULL;
/* for baremetal, the ucode is allocated in gtt, so don't need to fill the bo when reset/suspend */
if (!amdgpu_sriov_vf(adev) &&
(amdgpu_in_reset(adev) || adev->in_suspend))
if (!amdgpu_sriov_vf(adev) && (adev->in_gpu_reset || adev->in_suspend))
return 0;
/*
* if SMU loaded firmware, it needn't add SMC, UVD, and VCE

View File

@ -93,7 +93,7 @@ failed_undo:
amdgpu_ring_undo(ring);
spin_unlock_irqrestore(&kiq->ring_lock, flags);
failed_kiq:
dev_warn(adev->dev, "failed to write reg %x wait reg %x\n", reg0, reg1);
pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);
}
/**

View File

@ -325,9 +325,9 @@ static inline bool is_virtual_machine(void)
#define amdgpu_sriov_is_pp_one_vf(adev) \
((adev)->virt.gim_feature & AMDGIM_FEATURE_PP_ONE_VF)
#define amdgpu_sriov_is_debug(adev) \
((!amdgpu_in_reset(adev)) && adev->virt.tdr_debug)
((!adev->in_gpu_reset) && adev->virt.tdr_debug)
#define amdgpu_sriov_is_normal(adev) \
((!amdgpu_in_reset(adev)) && (!adev->virt.tdr_debug))
((!adev->in_gpu_reset) && (!adev->virt.tdr_debug))
bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev);
void amdgpu_virt_init_setting(struct amdgpu_device *adev);

View File

@ -372,7 +372,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
tmp->hive_id = adev->gmc.xgmi.hive_id;
INIT_LIST_HEAD(&tmp->device_list);
mutex_init(&tmp->hive_lock);
atomic_set(&tmp->in_reset, 0);
mutex_init(&tmp->reset_lock);
task_barrier_init(&tmp->tb);
if (lock)
@ -397,7 +397,6 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
hive->hi_req_gpu : adev;
bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
bool init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
bool locked;
/* fw bug so temporarily disable pstate switching */
return 0;
@ -405,8 +404,6 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
if (!hive || adev->asic_type != CHIP_VEGA20)
return 0;
locked = atomic_read(&hive->in_reset) ? false : true;
if (locked)
mutex_lock(&hive->hive_lock);
if (is_hi_req)
@ -442,7 +439,6 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
adev : NULL;
}
out:
if (locked)
mutex_unlock(&hive->hive_lock);
return ret;
}
@ -598,6 +594,7 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
if(!(--hive->number_devices)){
amdgpu_xgmi_sysfs_destroy(adev, hive);
mutex_destroy(&hive->hive_lock);
mutex_destroy(&hive->reset_lock);
}
return psp_xgmi_terminate(&adev->psp);

View File

@ -30,8 +30,7 @@ struct amdgpu_hive_info {
uint64_t hive_id;
struct list_head device_list;
int number_devices;
struct mutex hive_lock;
atomic_t in_reset;
struct mutex hive_lock, reset_lock;
struct kobject *kobj;
struct device_attribute dev_attr;
struct amdgpu_device *adev;

View File

@ -755,7 +755,6 @@ static void atom_op_jump(atom_exec_context *ctx, int *ptr, int arg)
/* jiffies wrap around we will just wait a little longer */
ctx->last_jump_jiffies = jiffies;
}
schedule();
} else {
ctx->last_jump = ctx->start + target;
ctx->last_jump_jiffies = jiffies;

View File

@ -6201,7 +6201,7 @@ static int gfx_v10_0_gfx_init_queue(struct amdgpu_ring *ring)
struct v10_gfx_mqd *mqd = ring->mqd_ptr;
int mqd_idx = ring - &adev->gfx.gfx_ring[0];
if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
if (!adev->in_gpu_reset && !adev->in_suspend) {
memset((void *)mqd, 0, sizeof(*mqd));
mutex_lock(&adev->srbm_mutex);
nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
@ -6213,7 +6213,7 @@ static int gfx_v10_0_gfx_init_queue(struct amdgpu_ring *ring)
mutex_unlock(&adev->srbm_mutex);
if (adev->gfx.me.mqd_backup[mqd_idx])
memcpy(adev->gfx.me.mqd_backup[mqd_idx], mqd, sizeof(*mqd));
} else if (amdgpu_in_reset(adev)) {
} else if (adev->in_gpu_reset) {
/* reset mqd with the backup copy */
if (adev->gfx.me.mqd_backup[mqd_idx])
memcpy(mqd, adev->gfx.me.mqd_backup[mqd_idx], sizeof(*mqd));
@ -6566,7 +6566,7 @@ static int gfx_v10_0_kiq_init_queue(struct amdgpu_ring *ring)
gfx_v10_0_kiq_setting(ring);
if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
if (adev->in_gpu_reset) { /* for GPU_RESET case */
/* reset MQD to a clean status */
if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd));
@ -6602,7 +6602,7 @@ static int gfx_v10_0_kcq_init_queue(struct amdgpu_ring *ring)
struct v10_compute_mqd *mqd = ring->mqd_ptr;
int mqd_idx = ring - &adev->gfx.compute_ring[0];
if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
if (!adev->in_gpu_reset && !adev->in_suspend) {
memset((void *)mqd, 0, sizeof(*mqd));
mutex_lock(&adev->srbm_mutex);
nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
@ -6612,7 +6612,7 @@ static int gfx_v10_0_kcq_init_queue(struct amdgpu_ring *ring)
if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(*mqd));
} else if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
} else if (adev->in_gpu_reset) { /* for GPU_RESET case */
/* reset MQD to a clean status */
if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd));

View File

@ -4633,7 +4633,7 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring)
gfx_v8_0_kiq_setting(ring);
if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
if (adev->in_gpu_reset) { /* for GPU_RESET case */
/* reset MQD to a clean status */
if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation));
@ -4670,7 +4670,7 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring)
struct vi_mqd *mqd = ring->mqd_ptr;
int mqd_idx = ring - &adev->gfx.compute_ring[0];
if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
if (!adev->in_gpu_reset && !adev->in_suspend) {
memset((void *)mqd, 0, sizeof(struct vi_mqd_allocation));
((struct vi_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF;
((struct vi_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF;
@ -4682,7 +4682,7 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring)
if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct vi_mqd_allocation));
} else if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
} else if (adev->in_gpu_reset) { /* for GPU_RESET case */
/* reset MQD to a clean status */
if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation));

View File

@ -3686,7 +3686,7 @@ static int gfx_v9_0_kiq_init_queue(struct amdgpu_ring *ring)
gfx_v9_0_kiq_setting(ring);
if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
if (adev->in_gpu_reset) { /* for GPU_RESET case */
/* reset MQD to a clean status */
if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation));
@ -3724,7 +3724,7 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring)
struct v9_mqd *mqd = ring->mqd_ptr;
int mqd_idx = ring - &adev->gfx.compute_ring[0];
if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
if (!adev->in_gpu_reset && !adev->in_suspend) {
memset((void *)mqd, 0, sizeof(struct v9_mqd_allocation));
((struct v9_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF;
((struct v9_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF;
@ -3736,7 +3736,7 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring)
if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct v9_mqd_allocation));
} else if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
} else if (adev->in_gpu_reset) { /* for GPU_RESET case */
/* reset MQD to a clean status */
if (adev->gfx.mec.mqd_backup[mqd_idx])
memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation));
@ -3930,7 +3930,7 @@ static int gfx_v9_0_hw_fini(void *handle)
/* Use deinitialize sequence from CAIL when unbinding device from driver,
* otherwise KIQ is hanging when binding back
*/
if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
if (!adev->in_gpu_reset && !adev->in_suspend) {
mutex_lock(&adev->srbm_mutex);
soc15_grbm_select(adev, adev->gfx.kiq.ring.me,
adev->gfx.kiq.ring.pipe,
@ -4088,7 +4088,7 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
*
* also don't wait anymore for IRQ context
* */
if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt()))
if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
goto failed_kiq_read;
might_sleep();

View File

@ -287,7 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
*/
if (adev->gfx.kiq.ring.sched.ready &&
(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
!amdgpu_in_reset(adev)) {
!adev->in_gpu_reset) {
struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
const unsigned eng = 17;
@ -312,7 +312,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
if (!adev->mman.buffer_funcs_enabled ||
!adev->ib_pool_ready ||
amdgpu_in_reset(adev) ||
adev->in_gpu_reset ||
ring->sched.ready == false) {
gmc_v10_0_flush_vm_hub(adev, vmid, AMDGPU_GFXHUB_0, 0);
mutex_unlock(&adev->mman.gtt_window_lock);

View File

@ -434,7 +434,7 @@ static int gmc_v7_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
int vmid;
unsigned int tmp;
if (amdgpu_in_reset(adev))
if (adev->in_gpu_reset)
return -EIO;
for (vmid = 1; vmid < 16; vmid++) {

View File

@ -635,7 +635,7 @@ static int gmc_v8_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
int vmid;
unsigned int tmp;
if (amdgpu_in_reset(adev))
if (adev->in_gpu_reset)
return -EIO;
for (vmid = 1; vmid < 16; vmid++) {

View File

@ -501,7 +501,7 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
*/
if (adev->gfx.kiq.ring.sched.ready &&
(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
!amdgpu_in_reset(adev)) {
!adev->in_gpu_reset) {
uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
@ -596,7 +596,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
struct amdgpu_kiq *kiq = &adev->gfx.kiq;
if (amdgpu_in_reset(adev))
if (adev->in_gpu_reset)
return -EIO;
if (ring->sched.ready) {
@ -633,8 +633,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
spin_unlock(&adev->gfx.kiq.ring_lock);
r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
if (r < 1) {
dev_info(adev->dev,
"wait for kiq fence error: %ld\n", r);
DRM_ERROR("wait for kiq fence error: %ld.\n", r);
return -ETIME;
}

View File

@ -238,16 +238,20 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
int locked;
/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
* otherwise the mailbox msg will be ruined/reset by
* the VF FLR.
*
* we can unlock the reset_sem to allow "amdgpu_job_timedout"
* we can unlock the lock_reset to allow "amdgpu_job_timedout"
* to run gpu_recover() after FLR_NOTIFICATION_CMPL received
* which means host side had finished this VF's FLR.
*/
down_read(&adev->reset_sem);
locked = mutex_trylock(&adev->lock_reset);
if (locked)
adev->in_gpu_reset = true;
do {
if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
goto flr_done;
@ -257,7 +261,10 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
} while (timeout > 1);
flr_done:
up_read(&adev->reset_sem);
if (locked) {
adev->in_gpu_reset = false;
mutex_unlock(&adev->lock_reset);
}
/* Trigger recovery for world switch failure if no TDR */
if (amdgpu_device_should_recover_gpu(adev)

View File

@ -259,16 +259,20 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
int locked;
/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
* otherwise the mailbox msg will be ruined/reset by
* the VF FLR.
*
* we can unlock the reset_sem to allow "amdgpu_job_timedout"
* we can unlock the lock_reset to allow "amdgpu_job_timedout"
* to run gpu_recover() after FLR_NOTIFICATION_CMPL received
* which means host side had finished this VF's FLR.
*/
down_read(&adev->reset_sem);
locked = mutex_trylock(&adev->lock_reset);
if (locked)
adev->in_gpu_reset = true;
do {
if (xgpu_nv_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
goto flr_done;
@ -278,7 +282,10 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
} while (timeout > 1);
flr_done:
up_read(&adev->reset_sem);
if (locked) {
adev->in_gpu_reset = false;
mutex_unlock(&adev->lock_reset);
}
/* Trigger recovery for world switch failure if no TDR */
if (amdgpu_device_should_recover_gpu(adev)

View File

@ -304,7 +304,6 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
struct queue *q)
{
if (!dqm->is_resetting) {
/* On GFX v7, CP doesn't flush TC at dequeue */
if (q->device->device_info->asic_family == CHIP_HAWAII)
if (flush_texture_cache_nocpsch(q->device, qpd))
@ -314,7 +313,6 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
/* Release the vmid mapping */
set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
}
dqm->vmid_pasid[qpd->vmid] = 0;
qpd->vmid = 0;

View File

@ -1551,10 +1551,6 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process,
void kfd_flush_tlb(struct kfd_process_device *pdd)
{
struct kfd_dev *dev = pdd->dev;
struct device_queue_manager *dqm = dev->dqm;
if (dqm->is_resetting)
return;
if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
/* Nothing to flush until a VMID is assigned, which

View File

@ -1658,7 +1658,7 @@ static int dm_suspend(void *handle)
struct amdgpu_display_manager *dm = &adev->dm;
int ret = 0;
if (amdgpu_in_reset(adev)) {
if (adev->in_gpu_reset) {
mutex_lock(&dm->dc_lock);
dm->cached_dc_state = dc_copy_state(dm->dc->current_state);
@ -1844,7 +1844,7 @@ static int dm_resume(void *handle)
struct dc_state *dc_state;
int i, r, j;
if (amdgpu_in_reset(adev)) {
if (adev->in_gpu_reset) {
dc_state = dm->cached_dc_state;
r = dm_dmub_hw_init(adev);

View File

@ -1110,7 +1110,7 @@ static int smu_disable_dpms(struct smu_context *smu)
struct amdgpu_device *adev = smu->adev;
int ret = 0;
bool use_baco = !smu->is_apu &&
((amdgpu_in_reset(adev) &&
((adev->in_gpu_reset &&
(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)) ||
((adev->in_runpm || adev->in_hibernate) && amdgpu_asic_supports_baco(adev)));

View File

@ -489,7 +489,7 @@ static int vega20_setup_asic_task(struct pp_hwmgr *hwmgr)
{
struct amdgpu_device *adev = (struct amdgpu_device *)(hwmgr->adev);
int ret = 0;
bool use_baco = (amdgpu_in_reset(adev) &&
bool use_baco = (adev->in_gpu_reset &&
(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)) ||
(adev->in_runpm && amdgpu_asic_supports_baco(adev));