drm/amdgpu: message smu to update bad channel info
The driver should notify the SMU to update the bad channel info when an uncorrectable error is detected in the UMC block. Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
d510eccfa5
commit
69691c8235
@ -2068,6 +2068,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
||||
mutex_init(&con->recovery_lock);
|
||||
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
|
||||
atomic_set(&con->in_recovery, 0);
|
||||
con->eeprom_control.bad_channel_bitmap = 0;
|
||||
|
||||
max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
|
||||
amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
|
||||
@ -2092,6 +2093,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
||||
goto free;
|
||||
|
||||
amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
|
||||
|
||||
if (con->update_channel_flag == true) {
|
||||
amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
|
||||
con->update_channel_flag = false;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_MCE_AMD
|
||||
@ -2285,6 +2291,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
|
||||
goto release_con;
|
||||
}
|
||||
|
||||
con->update_channel_flag = false;
|
||||
con->features = 0;
|
||||
INIT_LIST_HEAD(&con->head);
|
||||
/* Might need to get this flag from vbios. */
|
||||
|
@ -374,6 +374,9 @@ struct amdgpu_ras {
|
||||
|
||||
/* record umc error info queried from smu */
|
||||
struct umc_ecc_info umc_ecc;
|
||||
|
||||
/* Indicates whether the SMU needs to be sent updated bad channel info */
|
||||
bool update_channel_flag;
|
||||
};
|
||||
|
||||
struct ras_fs_data {
|
||||
|
@ -267,6 +267,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
|
||||
{
|
||||
struct amdgpu_device *adev = to_amdgpu_device(control);
|
||||
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
u8 csum;
|
||||
int res;
|
||||
|
||||
@ -287,6 +288,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
|
||||
|
||||
amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);
|
||||
|
||||
control->bad_channel_bitmap = 0;
|
||||
amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
|
||||
con->update_channel_flag = false;
|
||||
|
||||
amdgpu_ras_debugfs_set_ret_size(control);
|
||||
|
||||
mutex_unlock(&control->ras_tbl_mutex);
|
||||
@ -420,6 +425,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
|
||||
struct eeprom_table_record *record,
|
||||
const u32 num)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
|
||||
u32 a, b, i;
|
||||
u8 *buf, *pp;
|
||||
int res;
|
||||
@ -431,9 +437,16 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
|
||||
/* Encode all of them in one go.
|
||||
*/
|
||||
pp = buf;
|
||||
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
|
||||
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
|
||||
__encode_table_record_to_buf(control, &record[i], pp);
|
||||
|
||||
/* update bad channel bitmap */
|
||||
if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
|
||||
control->bad_channel_bitmap |= 1 << record[i].mem_channel;
|
||||
con->update_channel_flag = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* a, first record index to write into.
|
||||
* b, last record index to write into.
|
||||
* a = first index to read (fri) + number of records in the table,
|
||||
@ -686,6 +699,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
|
||||
const u32 num)
|
||||
{
|
||||
struct amdgpu_device *adev = to_amdgpu_device(control);
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
int i, res;
|
||||
u8 *buf, *pp;
|
||||
u32 g0, g1;
|
||||
@ -753,8 +767,15 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
|
||||
/* Read up everything? Then transform.
|
||||
*/
|
||||
pp = buf;
|
||||
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
|
||||
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
|
||||
__decode_table_record_from_buf(control, &record[i], pp);
|
||||
|
||||
/* update bad channel bitmap */
|
||||
if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
|
||||
control->bad_channel_bitmap |= 1 << record[i].mem_channel;
|
||||
con->update_channel_flag = true;
|
||||
}
|
||||
}
|
||||
Out:
|
||||
kfree(buf);
|
||||
mutex_unlock(&control->ras_tbl_mutex);
|
||||
|
@ -80,6 +80,10 @@ struct amdgpu_ras_eeprom_control {
|
||||
/* Protect table access via this mutex.
|
||||
*/
|
||||
struct mutex ras_tbl_mutex;
|
||||
|
||||
/* Record the channels on which bad pages have occurred
|
||||
*/
|
||||
u32 bad_channel_bitmap;
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -97,6 +97,11 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
|
||||
amdgpu_ras_save_bad_pages(adev);
|
||||
|
||||
amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
|
||||
|
||||
if (con->update_channel_flag == true) {
|
||||
amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
|
||||
con->update_channel_flag = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (reset)
|
||||
|
Loading…
Reference in New Issue
Block a user