drm/amdkfd: add RAS ECC event support (v3)

RAS ECC event will combine with GPU reset event, due to ECC interrupts are caused by uncorrectable error that triggers GPU reset. v2: Fix misleading-indentation warning v3: fix build with CONFIG_HSA_AMD disabled Signed-off-by: Eric Huang <JinhuiEric.Huang@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
2019-01-11 14:38:51 -05:00 · 2019-01-11 14:38:51 -05:00 · 9b54d20176
commit 9b54d20176
parent 0dee45a25a
9 changed files with 51 additions and 2 deletions
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@ -640,4 +640,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
 void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
 {
 }
+
+void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
+{
+}
 #endif
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@ -229,5 +229,6 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm);
 int kgd2kfd_resume_mm(struct mm_struct *mm);
 int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
 					       struct dma_fence *fence);
+void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);

 #endif /* AMDGPU_AMDKFD_H_INCLUDED */
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@ -4805,6 +4805,7 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
 		struct amdgpu_iv_entry *entry)
 {
 	/* TODO ue will trigger an interrupt. */
+	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
 	amdgpu_ras_reset_gpu(adev, 0);
 	return AMDGPU_RAS_UE;
 }
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@ -354,6 +354,7 @@ static int gmc_v9_0_ecc_interrupt_state(struct amdgpu_device *adev,
 static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
 		struct amdgpu_iv_entry *entry)
 {
+	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
 	amdgpu_ras_reset_gpu(adev, 0);
 	return AMDGPU_RAS_UE;
 }
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@ -1851,6 +1851,8 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
 		return 0;
 	}

+	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+
 	amdgpu_ras_reset_gpu(adev, 0);

 	return AMDGPU_RAS_UE;
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@ -466,6 +466,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
 	memset(&kfd->doorbell_available_index, 0,
 		sizeof(kfd->doorbell_available_index));

+	atomic_set(&kfd->sram_ecc_flag, 0);
+
 	return kfd;
 }

@ -661,6 +663,9 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
 		return ret;
 	count = atomic_dec_return(&kfd_locked);
 	WARN_ONCE(count != 0, "KFD reset ref. error");
+
+	atomic_set(&kfd->sram_ecc_flag, 0);
+
 	return 0;
 }

@ -1024,6 +1029,12 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj)
 	return 0;
 }

+void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
+{
+	if (kfd)
+		atomic_inc(&kfd->sram_ecc_flag);
+}
+
 #if defined(CONFIG_DEBUG_FS)

 /* This function will send a package to HIQ to hang the HWS
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@ -1011,25 +1011,41 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
 void kfd_signal_reset_event(struct kfd_dev *dev)
 {
 	struct kfd_hsa_hw_exception_data hw_exception_data;
+	struct kfd_hsa_memory_exception_data memory_exception_data;
 	struct kfd_process *p;
 	struct kfd_event *ev;
 	unsigned int temp;
 	uint32_t id, idx;
+	int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
+			KFD_HW_EXCEPTION_ECC :
+			KFD_HW_EXCEPTION_GPU_HANG;

 	/* Whole gpu reset caused by GPU hang and memory is lost */
 	memset(&hw_exception_data, 0, sizeof(hw_exception_data));
 	hw_exception_data.gpu_id = dev->id;
 	hw_exception_data.memory_lost = 1;
+	hw_exception_data.reset_cause = reset_cause;
+
+	memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+	memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
+	memory_exception_data.gpu_id = dev->id;
+	memory_exception_data.failure.imprecise = true;

 	idx = srcu_read_lock(&kfd_processes_srcu);
 	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
 		mutex_lock(&p->event_mutex);
 		id = KFD_FIRST_NONSIGNAL_EVENT_ID;
-		idr_for_each_entry_continue(&p->event_idr, ev, id)
+		idr_for_each_entry_continue(&p->event_idr, ev, id) {
 			if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
 				ev->hw_exception_data = hw_exception_data;
 				set_event(ev);
 			}
+			if (ev->type == KFD_EVENT_TYPE_MEMORY &&
+			    reset_cause == KFD_HW_EXCEPTION_ECC) {
+				ev->memory_exception_data = memory_exception_data;
+				set_event(ev);
+			}
+		}
 		mutex_unlock(&p->event_mutex);
 	}
 	srcu_read_unlock(&kfd_processes_srcu, idx);
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@ -276,6 +276,9 @@ struct kfd_dev {
 	uint64_t hive_id;

 	bool pci_atomic_requested;
+
+	/* SRAM ECC flag */
+	atomic_t sram_ecc_flag;
 };

 enum kfd_mempool {
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@ -211,6 +211,11 @@ struct kfd_ioctl_dbg_wave_control_args {
 #define KFD_HW_EXCEPTION_GPU_HANG	0
 #define KFD_HW_EXCEPTION_ECC		1

+/* For kfd_hsa_memory_exception_data.ErrorType */
+#define KFD_MEM_ERR_NO_RAS		0
+#define KFD_MEM_ERR_SRAM_ECC		1
+#define KFD_MEM_ERR_POISON_CONSUMED	2
+#define KFD_MEM_ERR_GPU_HANG		3

 struct kfd_ioctl_create_event_args {
 	__u64 event_page_offset;	/* from KFD */
@ -250,7 +255,12 @@ struct kfd_hsa_memory_exception_data {
 	struct kfd_memory_exception_failure failure;
 	__u64 va;
 	__u32 gpu_id;
-	__u32 pad;
+	__u32 ErrorType; /* 0 = no RAS error,
+			  * 1 = ECC_SRAM,
+			  * 2 = Link_SYNFLOOD (poison),
+			  * 3 = GPU hang (not attributable to a specific cause),
+			  * other values reserved
+			  */
 };

 /* hw exception data */