drm/amdkfd: add RAS ECC event support (v3)
RAS ECC events are combined with the GPU reset event, because ECC interrupts are caused by uncorrectable errors that trigger a GPU reset. v2: Fix misleading-indentation warning. v3: Fix build with CONFIG_HSA_AMD disabled. Signed-off-by: Eric Huang <JinhuiEric.Huang@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
0dee45a25a
commit
9b54d20176
@ -640,4 +640,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
|
||||
/*
 * Empty kgd2kfd_interrupt() stub: accepts the KFD device and an IH ring
 * entry and does nothing with either.
 * NOTE(review): presumably the variant built when CONFIG_HSA_AMD is
 * disabled - the #if that selects it is outside this hunk; confirm.
 */
void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
|
||||
{
|
||||
}
|
||||
|
||||
/*
 * No-op variant of kgd2kfd_set_sram_ecc_flag(): the body is empty, so
 * calling it has no effect on @kfd.
 * NOTE(review): the #if that selects this stub is outside the visible hunk;
 * the v3 note in the commit message about building with CONFIG_HSA_AMD
 * disabled suggests this is the !CONFIG_HSA_AMD build - confirm.
 */
void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
@ -229,5 +229,6 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm);
|
||||
int kgd2kfd_resume_mm(struct mm_struct *mm);
|
||||
int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
|
||||
struct dma_fence *fence);
|
||||
void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
|
||||
|
||||
#endif /* AMDGPU_AMDKFD_H_INCLUDED */
|
||||
|
@ -4805,6 +4805,7 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
|
||||
struct amdgpu_iv_entry *entry)
|
||||
{
|
||||
/* TODO ue will trigger an interrupt. */
|
||||
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
|
||||
amdgpu_ras_reset_gpu(adev, 0);
|
||||
return AMDGPU_RAS_UE;
|
||||
}
|
||||
|
@ -354,6 +354,7 @@ static int gmc_v9_0_ecc_interrupt_state(struct amdgpu_device *adev,
|
||||
/*
 * RAS data callback for GMC v9.0.
 *
 * Flags the KFD device attached to @adev via kgd2kfd_set_sram_ecc_flag(),
 * then calls amdgpu_ras_reset_gpu(adev, 0) and always returns AMDGPU_RAS_UE.
 * @entry is not referenced in the body.
 */
static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
|
||||
struct amdgpu_iv_entry *entry)
|
||||
{
|
||||
/* Set the flag before the reset call so it is already raised when the reset runs. */
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
|
||||
amdgpu_ras_reset_gpu(adev, 0);
|
||||
return AMDGPU_RAS_UE;
|
||||
}
|
||||
|
@ -1851,6 +1851,8 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
|
||||
return 0;
|
||||
}
|
||||
|
||||
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
|
||||
|
||||
amdgpu_ras_reset_gpu(adev, 0);
|
||||
|
||||
return AMDGPU_RAS_UE;
|
||||
|
@ -466,6 +466,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
|
||||
memset(&kfd->doorbell_available_index, 0,
|
||||
sizeof(kfd->doorbell_available_index));
|
||||
|
||||
atomic_set(&kfd->sram_ecc_flag, 0);
|
||||
|
||||
return kfd;
|
||||
}
|
||||
|
||||
@ -661,6 +663,9 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
|
||||
return ret;
|
||||
count = atomic_dec_return(&kfd_locked);
|
||||
WARN_ONCE(count != 0, "KFD reset ref. error");
|
||||
|
||||
atomic_set(&kfd->sram_ecc_flag, 0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1024,6 +1029,12 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * kgd2kfd_set_sram_ecc_flag - note that an SRAM ECC error was reported.
 *
 * @kfd: KFD device to mark; a NULL pointer is accepted and ignored.
 *
 * Atomically increments kfd->sram_ecc_flag. In this commit the counter is
 * only tested for non-zero (kfd_signal_reset_event() picks
 * KFD_HW_EXCEPTION_ECC when it is set) and is reset to 0 by
 * kgd2kfd_post_reset().
 */
void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
|
||||
{
|
||||
/* The RAS callbacks pass adev->kfd.dev without checking it, so NULL is skipped here. */
if (kfd)
|
||||
atomic_inc(&kfd->sram_ecc_flag);
|
||||
}
|
||||
|
||||
#if defined(CONFIG_DEBUG_FS)
|
||||
|
||||
/* This function will send a package to HIQ to hang the HWS
|
||||
|
@ -1011,25 +1011,41 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
|
||||
void kfd_signal_reset_event(struct kfd_dev *dev)
|
||||
{
|
||||
struct kfd_hsa_hw_exception_data hw_exception_data;
|
||||
struct kfd_hsa_memory_exception_data memory_exception_data;
|
||||
struct kfd_process *p;
|
||||
struct kfd_event *ev;
|
||||
unsigned int temp;
|
||||
uint32_t id, idx;
|
||||
int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
|
||||
KFD_HW_EXCEPTION_ECC :
|
||||
KFD_HW_EXCEPTION_GPU_HANG;
|
||||
|
||||
/* Whole GPU reset, caused by a GPU hang or an uncorrectable ECC error; memory is lost */
|
||||
memset(&hw_exception_data, 0, sizeof(hw_exception_data));
|
||||
hw_exception_data.gpu_id = dev->id;
|
||||
hw_exception_data.memory_lost = 1;
|
||||
hw_exception_data.reset_cause = reset_cause;
|
||||
|
||||
memset(&memory_exception_data, 0, sizeof(memory_exception_data));
|
||||
memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
|
||||
memory_exception_data.gpu_id = dev->id;
|
||||
memory_exception_data.failure.imprecise = true;
|
||||
|
||||
idx = srcu_read_lock(&kfd_processes_srcu);
|
||||
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
|
||||
mutex_lock(&p->event_mutex);
|
||||
id = KFD_FIRST_NONSIGNAL_EVENT_ID;
|
||||
idr_for_each_entry_continue(&p->event_idr, ev, id)
|
||||
idr_for_each_entry_continue(&p->event_idr, ev, id) {
|
||||
if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
|
||||
ev->hw_exception_data = hw_exception_data;
|
||||
set_event(ev);
|
||||
}
|
||||
if (ev->type == KFD_EVENT_TYPE_MEMORY &&
|
||||
reset_cause == KFD_HW_EXCEPTION_ECC) {
|
||||
ev->memory_exception_data = memory_exception_data;
|
||||
set_event(ev);
|
||||
}
|
||||
}
|
||||
mutex_unlock(&p->event_mutex);
|
||||
}
|
||||
srcu_read_unlock(&kfd_processes_srcu, idx);
|
||||
|
@ -276,6 +276,9 @@ struct kfd_dev {
|
||||
uint64_t hive_id;
|
||||
|
||||
bool pci_atomic_requested;
|
||||
|
||||
/* SRAM ECC flag */
|
||||
atomic_t sram_ecc_flag;
|
||||
};
|
||||
|
||||
enum kfd_mempool {
|
||||
|
@ -211,6 +211,11 @@ struct kfd_ioctl_dbg_wave_control_args {
|
||||
#define KFD_HW_EXCEPTION_GPU_HANG 0
|
||||
#define KFD_HW_EXCEPTION_ECC 1
|
||||
|
||||
/* For kfd_hsa_memory_exception_data.ErrorType */
|
||||
#define KFD_MEM_ERR_NO_RAS 0
|
||||
#define KFD_MEM_ERR_SRAM_ECC 1
|
||||
#define KFD_MEM_ERR_POISON_CONSUMED 2
|
||||
#define KFD_MEM_ERR_GPU_HANG 3
|
||||
|
||||
struct kfd_ioctl_create_event_args {
|
||||
__u64 event_page_offset; /* from KFD */
|
||||
@ -250,7 +255,12 @@ struct kfd_hsa_memory_exception_data {
|
||||
struct kfd_memory_exception_failure failure;
|
||||
__u64 va;
|
||||
__u32 gpu_id;
|
||||
__u32 pad;
|
||||
__u32 ErrorType; /* 0 = no RAS error,
|
||||
* 1 = ECC_SRAM,
|
||||
* 2 = Link_SYNFLOOD (poison),
|
||||
* 3 = GPU hang (not attributable to a specific cause),
|
||||
* other values reserved
|
||||
*/
|
||||
};
|
||||
|
||||
/* hw exception data */
|
||||
|
Loading…
Reference in New Issue
Block a user