drm/amdgpu: Implement DPC recovery
Add PCI Downstream Port Containment (DPC) with basic recovery functionality.

v2: remove pci_save_state to avoid breaking suspend/resume
v3: Fix style comments
v4: Improve description.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
		
							parent
							
								
									2a9787dcf5
								
							
						
					
					
						commit
						c9a6b82f45
					
				| @ -49,6 +49,8 @@ | ||||
| #include <linux/rbtree.h> | ||||
| #include <linux/hashtable.h> | ||||
| #include <linux/dma-fence.h> | ||||
| #include <linux/pci.h> | ||||
| #include <linux/aer.h> | ||||
| 
 | ||||
| #include <drm/ttm/ttm_bo_api.h> | ||||
| #include <drm/ttm/ttm_bo_driver.h> | ||||
| @ -1260,6 +1262,12 @@ static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev) { return | ||||
| void amdgpu_register_gpu_instance(struct amdgpu_device *adev); | ||||
| void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev); | ||||
| 
 | ||||
| pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, | ||||
| 					   pci_channel_state_t state); | ||||
| pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev); | ||||
| pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev); | ||||
| void amdgpu_pci_resume(struct pci_dev *pdev); | ||||
| 
 | ||||
| #include "amdgpu_object.h" | ||||
| 
 | ||||
| /* used by df_v3_6.c and amdgpu_pmu.c */ | ||||
|  | ||||
| @ -2999,6 +2999,7 @@ static const struct attribute *amdgpu_dev_attributes[] = { | ||||
| 	NULL | ||||
| }; | ||||
| 
 | ||||
| 
 | ||||
| /**
 | ||||
|  * amdgpu_device_init - initialize the driver | ||||
|  * | ||||
| @ -3217,6 +3218,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	pci_enable_pcie_error_reporting(adev->ddev.pdev); | ||||
| 
 | ||||
| 	/* Post card if necessary */ | ||||
| 	if (amdgpu_device_need_post(adev)) { | ||||
| 		if (!adev->bios) { | ||||
| @ -4705,3 +4708,161 @@ int amdgpu_device_baco_exit(struct drm_device *dev) | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * amdgpu_pci_error_detected - Called when a PCI error is detected. | ||||
|  * @pdev: PCI device struct | ||||
|  * @state: PCI channel state | ||||
|  * | ||||
|  * Description: Called when a PCI error is detected. | ||||
|  * | ||||
|  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. | ||||
|  */ | ||||
| pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) | ||||
| { | ||||
| 	struct drm_device *dev = pci_get_drvdata(pdev); | ||||
| 	struct amdgpu_device *adev = drm_to_adev(dev); | ||||
| 
 | ||||
| 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); | ||||
| 
 | ||||
| 	switch (state) { | ||||
| 	case pci_channel_io_normal: | ||||
| 		return PCI_ERS_RESULT_CAN_RECOVER; | ||||
| 	case pci_channel_io_frozen: | ||||
| 		/* Fatal error, prepare for slot reset */ | ||||
| 		amdgpu_device_lock_adev(adev); | ||||
| 		return PCI_ERS_RESULT_NEED_RESET; | ||||
| 	case pci_channel_io_perm_failure: | ||||
| 		/* Permanent error, prepare for device removal */ | ||||
| 		return PCI_ERS_RESULT_DISCONNECT; | ||||
| 	} | ||||
| 
 | ||||
| 	return PCI_ERS_RESULT_NEED_RESET; | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers | ||||
|  * @pdev: pointer to PCI device | ||||
|  */ | ||||
| pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) | ||||
| { | ||||
| 
 | ||||
| 	DRM_INFO("PCI error: mmio enabled callback!!\n"); | ||||
| 
 | ||||
| 	/* TODO - dump whatever for debugging purposes */ | ||||
| 
 | ||||
| 	/* This called only if amdgpu_pci_error_detected returns
 | ||||
| 	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still | ||||
| 	 * works, no need to reset slot. | ||||
| 	 */ | ||||
| 
 | ||||
| 	return PCI_ERS_RESULT_RECOVERED; | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * amdgpu_pci_slot_reset - Called when PCI slot has been reset. | ||||
|  * @pdev: PCI device struct | ||||
|  * | ||||
|  * Description: This routine is called by the pci error recovery | ||||
|  * code after the PCI slot has been reset, just before we | ||||
|  * should resume normal operations. | ||||
|  */ | ||||
| pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) | ||||
| { | ||||
| 	struct drm_device *dev = pci_get_drvdata(pdev); | ||||
| 	struct amdgpu_device *adev = drm_to_adev(dev); | ||||
| 	int r; | ||||
| 	bool vram_lost; | ||||
| 
 | ||||
| 	DRM_INFO("PCI error: slot reset callback!!\n"); | ||||
| 
 | ||||
| 	pci_restore_state(pdev); | ||||
| 
 | ||||
| 	r = amdgpu_device_ip_suspend(adev); | ||||
| 	if (r) | ||||
| 		goto out; | ||||
| 
 | ||||
| 
 | ||||
| 	/* post card */ | ||||
| 	r = amdgpu_atom_asic_init(adev->mode_info.atom_context); | ||||
| 	if (r) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	r = amdgpu_device_ip_resume_phase1(adev); | ||||
| 	if (r) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	vram_lost = amdgpu_device_check_vram_lost(adev); | ||||
| 	if (vram_lost) { | ||||
| 		DRM_INFO("VRAM is lost due to GPU reset!\n"); | ||||
| 		amdgpu_inc_vram_lost(adev); | ||||
| 	} | ||||
| 
 | ||||
| 	r = amdgpu_gtt_mgr_recover( | ||||
| 		&adev->mman.bdev.man[TTM_PL_TT]); | ||||
| 	if (r) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	r = amdgpu_device_fw_loading(adev); | ||||
| 	if (r) | ||||
| 		return r; | ||||
| 
 | ||||
| 	r = amdgpu_device_ip_resume_phase2(adev); | ||||
| 	if (r) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	if (vram_lost) | ||||
| 		amdgpu_device_fill_reset_magic(adev); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Add this ASIC as tracked as reset was already | ||||
| 	 * complete successfully. | ||||
| 	 */ | ||||
| 	amdgpu_register_gpu_instance(adev); | ||||
| 
 | ||||
| 	r = amdgpu_device_ip_late_init(adev); | ||||
| 	if (r) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	amdgpu_fbdev_set_suspend(adev, 0); | ||||
| 
 | ||||
| 	/* must succeed. */ | ||||
| 	amdgpu_ras_resume(adev); | ||||
| 
 | ||||
| 
 | ||||
| 	amdgpu_irq_gpu_reset_resume_helper(adev); | ||||
| 	r = amdgpu_ib_ring_tests(adev); | ||||
| 	if (r) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	r = amdgpu_device_recover_vram(adev); | ||||
| 
 | ||||
| out: | ||||
| 
 | ||||
| 	if (!r) { | ||||
| 		DRM_INFO("PCIe error recovery succeeded\n"); | ||||
| 	} else { | ||||
| 		DRM_ERROR("PCIe error recovery failed, err:%d", r); | ||||
| 		amdgpu_device_unlock_adev(adev); | ||||
| 	} | ||||
| 
 | ||||
| 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; | ||||
| } | ||||
| 
 | ||||
/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation. Releases the device with
 * amdgpu_device_unlock_adev() and logs that the callback ran.
 *
 * NOTE(review): amdgpu_pci_error_detected() only calls
 * amdgpu_device_lock_adev() for pci_channel_io_frozen. If this callback
 * can also run after the PCI_ERS_RESULT_CAN_RECOVER path, the unlock
 * below has no matching lock - confirm against the PCI error recovery
 * flow.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *ddev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	amdgpu_device_unlock_adev(adev);

	DRM_INFO("PCI error: resume callback!!\n");
}
|  | ||||
| @ -32,7 +32,6 @@ | ||||
| #include <drm/drm_pciids.h> | ||||
| #include <linux/console.h> | ||||
| #include <linux/module.h> | ||||
| #include <linux/pci.h> | ||||
| #include <linux/pm_runtime.h> | ||||
| #include <linux/vga_switcheroo.h> | ||||
| #include <drm/drm_probe_helper.h> | ||||
| @ -1528,6 +1527,13 @@ static struct drm_driver kms_driver = { | ||||
| 	.patchlevel = KMS_DRIVER_PATCHLEVEL, | ||||
| }; | ||||
| 
 | ||||
| static struct pci_error_handlers amdgpu_pci_err_handler = { | ||||
| 	.error_detected	= amdgpu_pci_error_detected, | ||||
| 	.mmio_enabled	= amdgpu_pci_mmio_enabled, | ||||
| 	.slot_reset	= amdgpu_pci_slot_reset, | ||||
| 	.resume		= amdgpu_pci_resume, | ||||
| }; | ||||
| 
 | ||||
| static struct pci_driver amdgpu_kms_pci_driver = { | ||||
| 	.name = DRIVER_NAME, | ||||
| 	.id_table = pciidlist, | ||||
| @ -1535,6 +1541,7 @@ static struct pci_driver amdgpu_kms_pci_driver = { | ||||
| 	.remove = amdgpu_pci_remove, | ||||
| 	.shutdown = amdgpu_pci_shutdown, | ||||
| 	.driver.pm = &amdgpu_pm_ops, | ||||
| 	.err_handler = &amdgpu_pci_err_handler, | ||||
| }; | ||||
| 
 | ||||
| static int __init amdgpu_init(void) | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user