mirror of
https://github.com/torvalds/linux.git
synced 2024-11-26 22:21:42 +00:00
drm/amdgpu: Adjust removal control flow for smu v13_0_2
Adjust removal control flow for smu v13_0_2: During amdgpu uninstallation, when removing the first device, the kernel needs to first send a mode1reset message to all gpu devices. Otherwise, smu initialization will fail the next time amdgpu is installed. V2: 1. Update commit comments. 2. Remove the global variable amdgpu_device_remove_cnt and add a variable to the structure amdgpu_hive_info. 3. Use hive to detect the first removed device instead of a global variable. V3: 1. Update commit comments. 2. Split a patch into multiple patches. 3. The current patch does: a. Add a work mode of AMDGPU_RESET_FOR_DEVICE_REMOVE into the existing gpu recover path, which makes all devices in the hive list only have a HW reset but no resume (except the base IP). b. Call AMDGPU_RESET_FOR_DEVICE_REMOVE and AMDGPU_NEED_FULL_RESET mode of amdgpu_device_gpu_recover in amdgpu_pci_remove when removing the first device in the hive list. c. When removing the first device, the IP blocks keyword function call sequence is as follows: .suspend->mode1reset->.resume(basic ip)->.hw_fini->.early_fini->.sw_fini. ^ | |-<----------<---------<----| The first three steps of the sequence are due to a call to amdgpu_device_gpu_recover. These three steps will be executed in a loop until all devices in the hive list have been iterated. The steps starting from .hw_fini only apply to the first device. Since .suspend has been called before, except for the resumed phase1 basic ip blocks, the .hw_fini of all other ip blocks of the current device will do nothing. d. When removing other devices, the calling sequence is the same as legacy: .hw_fini -> .early_fini -> .sw_fini. Since .suspend has been called when removing the first device, except for the resumed phase1 basic ip blocks, the .hw_fini of all other ip blocks of the current device will do nothing. Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
e4cf73fdfa
commit
f5c7e77970
@ -4749,6 +4749,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
|
|||||||
struct amdgpu_device *tmp_adev = NULL;
|
struct amdgpu_device *tmp_adev = NULL;
|
||||||
bool need_full_reset, skip_hw_reset, vram_lost = false;
|
bool need_full_reset, skip_hw_reset, vram_lost = false;
|
||||||
int r = 0;
|
int r = 0;
|
||||||
|
bool gpu_reset_for_dev_remove = 0;
|
||||||
|
|
||||||
/* Try reset handler method first */
|
/* Try reset handler method first */
|
||||||
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
|
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
|
||||||
@ -4768,6 +4769,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
|
|||||||
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
|
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
|
||||||
skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
|
skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
|
||||||
|
|
||||||
|
gpu_reset_for_dev_remove =
|
||||||
|
test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
|
||||||
|
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* ASIC reset has to be done on all XGMI hive nodes ASAP
|
* ASIC reset has to be done on all XGMI hive nodes ASAP
|
||||||
* to allow proper links negotiation in FW (within 1 sec)
|
* to allow proper links negotiation in FW (within 1 sec)
|
||||||
@ -4812,6 +4817,18 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
|
|||||||
amdgpu_ras_intr_cleared();
|
amdgpu_ras_intr_cleared();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Since the mode1 reset affects base ip blocks, the
|
||||||
|
* phase1 ip blocks need to be resumed. Otherwise there
|
||||||
|
* will be a BIOS signature error and the psp bootloader
|
||||||
|
* can't load kdb on the next amdgpu install.
|
||||||
|
*/
|
||||||
|
if (gpu_reset_for_dev_remove) {
|
||||||
|
list_for_each_entry(tmp_adev, device_list_handle, reset_list)
|
||||||
|
amdgpu_device_ip_resume_phase1(tmp_adev);
|
||||||
|
|
||||||
|
goto end;
|
||||||
|
}
|
||||||
|
|
||||||
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
|
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
|
||||||
if (need_full_reset) {
|
if (need_full_reset) {
|
||||||
/* post card */
|
/* post card */
|
||||||
@ -5134,6 +5151,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
|||||||
bool need_emergency_restart = false;
|
bool need_emergency_restart = false;
|
||||||
bool audio_suspended = false;
|
bool audio_suspended = false;
|
||||||
int tmp_vram_lost_counter;
|
int tmp_vram_lost_counter;
|
||||||
|
bool gpu_reset_for_dev_remove = false;
|
||||||
|
|
||||||
|
gpu_reset_for_dev_remove =
|
||||||
|
test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
|
||||||
|
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Special case: RAS triggered and full reset isn't supported
|
* Special case: RAS triggered and full reset isn't supported
|
||||||
@ -5253,6 +5275,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
|||||||
|
|
||||||
retry: /* Rest of adevs pre asic reset from XGMI hive. */
|
retry: /* Rest of adevs pre asic reset from XGMI hive. */
|
||||||
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
|
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
|
||||||
|
if (gpu_reset_for_dev_remove) {
|
||||||
|
/* Workaround for ASICs that need to disable SMC first */
|
||||||
|
amdgpu_device_smu_fini_early(tmp_adev);
|
||||||
|
}
|
||||||
r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
|
r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
|
||||||
/*TODO Should we stop ?*/
|
/*TODO Should we stop ?*/
|
||||||
if (r) {
|
if (r) {
|
||||||
@ -5286,6 +5312,9 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
|
|||||||
adev->asic_reset_res = 0;
|
adev->asic_reset_res = 0;
|
||||||
goto retry;
|
goto retry;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!r && gpu_reset_for_dev_remove)
|
||||||
|
goto recover_end;
|
||||||
}
|
}
|
||||||
|
|
||||||
skip_hw_reset:
|
skip_hw_reset:
|
||||||
@ -5359,6 +5388,7 @@ skip_sched_resume:
|
|||||||
amdgpu_device_unset_mp1_state(tmp_adev);
|
amdgpu_device_unset_mp1_state(tmp_adev);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
recover_end:
|
||||||
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
|
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
|
||||||
reset_list);
|
reset_list);
|
||||||
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
|
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
|
||||||
|
@ -2186,6 +2186,36 @@ amdgpu_pci_remove(struct pci_dev *pdev)
|
|||||||
pm_runtime_forbid(dev->dev);
|
pm_runtime_forbid(dev->dev);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)) {
|
||||||
|
bool need_to_reset_gpu = false;
|
||||||
|
|
||||||
|
if (adev->gmc.xgmi.num_physical_nodes > 1) {
|
||||||
|
struct amdgpu_hive_info *hive;
|
||||||
|
|
||||||
|
hive = amdgpu_get_xgmi_hive(adev);
|
||||||
|
if (hive->device_remove_count == 0)
|
||||||
|
need_to_reset_gpu = true;
|
||||||
|
hive->device_remove_count++;
|
||||||
|
amdgpu_put_xgmi_hive(hive);
|
||||||
|
} else {
|
||||||
|
need_to_reset_gpu = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Workaround for ASICs that need to reset SMU.
|
||||||
|
* Called only when the first device is removed.
|
||||||
|
*/
|
||||||
|
if (need_to_reset_gpu) {
|
||||||
|
struct amdgpu_reset_context reset_context;
|
||||||
|
|
||||||
|
memset(&reset_context, 0, sizeof(reset_context));
|
||||||
|
reset_context.method = AMD_RESET_METHOD_NONE;
|
||||||
|
reset_context.reset_req_dev = adev;
|
||||||
|
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||||
|
set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags);
|
||||||
|
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
amdgpu_driver_unload_kms(dev);
|
amdgpu_driver_unload_kms(dev);
|
||||||
|
|
||||||
drm_dev_unplug(dev);
|
drm_dev_unplug(dev);
|
||||||
|
@ -31,6 +31,7 @@ enum AMDGPU_RESET_FLAGS {
|
|||||||
AMDGPU_NEED_FULL_RESET = 0,
|
AMDGPU_NEED_FULL_RESET = 0,
|
||||||
AMDGPU_SKIP_HW_RESET = 1,
|
AMDGPU_SKIP_HW_RESET = 1,
|
||||||
AMDGPU_SKIP_MODE2_RESET = 2,
|
AMDGPU_SKIP_MODE2_RESET = 2,
|
||||||
|
AMDGPU_RESET_FOR_DEVICE_REMOVE = 3,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct amdgpu_reset_context {
|
struct amdgpu_reset_context {
|
||||||
|
@ -43,6 +43,7 @@ struct amdgpu_hive_info {
|
|||||||
} pstate;
|
} pstate;
|
||||||
|
|
||||||
struct amdgpu_reset_domain *reset_domain;
|
struct amdgpu_reset_domain *reset_domain;
|
||||||
|
uint32_t device_remove_count;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct amdgpu_pcs_ras_field {
|
struct amdgpu_pcs_ras_field {
|
||||||
|
Loading…
Reference in New Issue
Block a user