drm/amdgpu: break driver init process when it's bad GPU(v5)
When a bad GPU tag is retrieved from the EEPROM, GPU init should
fail, as the GPU needs to be retired for further checking.
v2: Fix spelling typo, correct the condition to detect
bad gpu tag and refine error message.
v3: Refine function argument name.
v4: Fix the missing check of the return value in the i2c
initialization error case.
v5: Use dev_err to print PCI information in dmesg instead
of DRM_ERROR.
Signed-off-by: Guchun Chen <guchun.chen@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
committed by
Alex Deucher
parent
1d6a9d122d
commit
b82e65a935
@@ -2055,13 +2055,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
|
|||||||
* it should be called after amdgpu_device_ip_hw_init_phase2 since
|
* it should be called after amdgpu_device_ip_hw_init_phase2 since
|
||||||
* for some ASICs the RAS EEPROM code relies on SMU fully functioning
|
* for some ASICs the RAS EEPROM code relies on SMU fully functioning
|
||||||
* for I2C communication which only true at this point.
|
* for I2C communication which only true at this point.
|
||||||
* recovery_init may fail, but it can free all resources allocated by
|
*
|
||||||
* itself and its failure should not stop amdgpu init process.
|
* amdgpu_ras_recovery_init may fail, but the upper only cares the
|
||||||
|
* failure from bad gpu situation and stop amdgpu init process
|
||||||
|
* accordingly. For other failed cases, it will still release all
|
||||||
|
* the resource and print error message, rather than returning one
|
||||||
|
* negative value to upper level.
|
||||||
*
|
*
|
||||||
* Note: theoretically, this should be called before all vram allocations
|
* Note: theoretically, this should be called before all vram allocations
|
||||||
* to protect retired page from abusing
|
* to protect retired page from abusing
|
||||||
*/
|
*/
|
||||||
amdgpu_ras_recovery_init(adev);
|
r = amdgpu_ras_recovery_init(adev);
|
||||||
|
if (r)
|
||||||
|
goto init_failed;
|
||||||
|
|
||||||
if (adev->gmc.xgmi.num_physical_nodes > 1)
|
if (adev->gmc.xgmi.num_physical_nodes > 1)
|
||||||
amdgpu_xgmi_add_device(adev);
|
amdgpu_xgmi_add_device(adev);
|
||||||
|
|||||||
@@ -1821,6 +1821,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
|||||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||||
struct ras_err_handler_data **data;
|
struct ras_err_handler_data **data;
|
||||||
uint32_t max_eeprom_records_len = 0;
|
uint32_t max_eeprom_records_len = 0;
|
||||||
|
bool exc_err_limit = false;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
if (con)
|
if (con)
|
||||||
@@ -1842,8 +1843,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
|||||||
max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
|
max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
|
||||||
amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
|
amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
|
||||||
|
|
||||||
ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
|
ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
|
||||||
if (ret)
|
/*
|
||||||
|
* This calling fails when exc_err_limit is true or
|
||||||
|
* ret != 0.
|
||||||
|
*/
|
||||||
|
if (exc_err_limit || ret)
|
||||||
goto free;
|
goto free;
|
||||||
|
|
||||||
if (con->eeprom_control.num_recs) {
|
if (con->eeprom_control.num_recs) {
|
||||||
@@ -1867,6 +1872,15 @@ free:
|
|||||||
out:
|
out:
|
||||||
dev_warn(adev->dev, "Failed to initialize ras recovery!\n");
|
dev_warn(adev->dev, "Failed to initialize ras recovery!\n");
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Except error threshold exceeding case, other failure cases in this
|
||||||
|
* function would not fail amdgpu driver init.
|
||||||
|
*/
|
||||||
|
if (!exc_err_limit)
|
||||||
|
ret = 0;
|
||||||
|
else
|
||||||
|
ret = -EINVAL;
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -241,7 +241,8 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
|
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
|
||||||
|
bool *exceed_err_limit)
|
||||||
{
|
{
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
struct amdgpu_device *adev = to_amdgpu_device(control);
|
struct amdgpu_device *adev = to_amdgpu_device(control);
|
||||||
@@ -254,6 +255,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
|
|||||||
.buf = buff,
|
.buf = buff,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
*exceed_err_limit = false;
|
||||||
|
|
||||||
/* Verify i2c adapter is initialized */
|
/* Verify i2c adapter is initialized */
|
||||||
if (!adev->pm.smu_i2c.algo)
|
if (!adev->pm.smu_i2c.algo)
|
||||||
return -ENOENT;
|
return -ENOENT;
|
||||||
@@ -282,6 +285,11 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
|
|||||||
DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
|
DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
|
||||||
control->num_recs);
|
control->num_recs);
|
||||||
|
|
||||||
|
} else if ((hdr->header == EEPROM_TABLE_HDR_BAD) &&
|
||||||
|
(amdgpu_bad_page_threshold != 0)) {
|
||||||
|
*exceed_err_limit = true;
|
||||||
|
dev_err(adev->dev, "Exceeding the bad_page_threshold parameter, "
|
||||||
|
"disabling the GPU.\n");
|
||||||
} else {
|
} else {
|
||||||
DRM_INFO("Creating new EEPROM table");
|
DRM_INFO("Creating new EEPROM table");
|
||||||
|
|
||||||
|
|||||||
@@ -76,7 +76,8 @@ struct eeprom_table_record {
|
|||||||
unsigned char mcumc_id;
|
unsigned char mcumc_id;
|
||||||
}__attribute__((__packed__));
|
}__attribute__((__packed__));
|
||||||
|
|
||||||
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control);
|
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
|
||||||
|
bool *exceed_err_limit);
|
||||||
int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);
|
int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);
|
||||||
|
|
||||||
int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
|
int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
|
||||||
|
|||||||
Reference in New Issue
Block a user