drm/amdgpu: validate bad page threshold in ras(v3)
The bad page threshold must lie in the range between -1 and the maximum record length of the EEPROM. It determines when the number of saved bad pages exceeds the threshold value, so that the corresponding actions can be taken. v2: When using the default typical value, use the minimum of the typical value and the EEPROM maximum record length. v3: Drop the case of setting bad_page_cnt_threshold to 0xFFFFFFFF, as it confuses users. Signed-off-by: Guchun Chen <guchun.chen@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
acc0204cdb
commit
c84d46707e
@ -69,6 +69,9 @@ const char *ras_block_string[] = {
|
||||
/* inject address is 52 bits */
|
||||
#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
|
||||
|
||||
/* typical ECC bad page rate(1 bad page per 100MB VRAM) */
|
||||
#define RAS_BAD_PAGE_RATE (100 * 1024 * 1024ULL)
|
||||
|
||||
enum amdgpu_ras_retire_page_reservation {
|
||||
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
|
||||
AMDGPU_RAS_RETIRE_PAGE_PENDING,
|
||||
@ -1699,6 +1702,47 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
|
||||
uint32_t max_length)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
int tmp_threshold = amdgpu_bad_page_threshold;
|
||||
u64 val;
|
||||
|
||||
/*
|
||||
* Justification of value bad_page_cnt_threshold in ras structure
|
||||
*
|
||||
* Generally, -1 <= amdgpu_bad_page_threshold <= max record length
|
||||
* in eeprom, and introduce two scenarios accordingly.
|
||||
*
|
||||
* Bad page retirement enablement:
|
||||
* - If amdgpu_bad_page_threshold = -1,
|
||||
* bad_page_cnt_threshold = typical value by formula.
|
||||
*
|
||||
* - When the value from user is 0 < amdgpu_bad_page_threshold <
|
||||
* max record length in eeprom, use it directly.
|
||||
*
|
||||
* Bad page retirement disablement:
|
||||
* - If amdgpu_bad_page_threshold = 0, bad page retirement
|
||||
* functionality is disabled, and bad_page_cnt_threshold will
|
||||
* take no effect.
|
||||
*/
|
||||
|
||||
if (tmp_threshold < -1)
|
||||
tmp_threshold = -1;
|
||||
else if (tmp_threshold > max_length)
|
||||
tmp_threshold = max_length;
|
||||
|
||||
if (tmp_threshold == -1) {
|
||||
val = adev->gmc.mc_vram_size;
|
||||
do_div(val, RAS_BAD_PAGE_RATE);
|
||||
con->bad_page_cnt_threshold = min(lower_32_bits(val),
|
||||
max_length);
|
||||
} else {
|
||||
con->bad_page_cnt_threshold = tmp_threshold;
|
||||
}
|
||||
}
|
||||
|
||||
/* called in gpu recovery/init */
|
||||
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
|
||||
{
|
||||
@ -1776,6 +1820,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
struct ras_err_handler_data **data;
|
||||
uint32_t max_eeprom_records_len = 0;
|
||||
int ret;
|
||||
|
||||
if (con)
|
||||
@ -1794,6 +1839,9 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
||||
atomic_set(&con->in_recovery, 0);
|
||||
con->adev = adev;
|
||||
|
||||
max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
|
||||
amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
|
||||
|
||||
ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
|
||||
if (ret)
|
||||
goto free;
|
||||
|
@ -336,6 +336,9 @@ struct amdgpu_ras {
|
||||
struct amdgpu_ras_eeprom_control eeprom_control;
|
||||
|
||||
bool error_query_ready;
|
||||
|
||||
/* bad page count threshold */
|
||||
uint32_t bad_page_cnt_threshold;
|
||||
};
|
||||
|
||||
struct ras_fs_data {
|
||||
|
@ -499,6 +499,11 @@ free_buff:
|
||||
return ret == num ? 0 : -EIO;
|
||||
}
|
||||
|
||||
inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void)
|
||||
{
|
||||
return EEPROM_MAX_RECORD_NUM;
|
||||
}
|
||||
|
||||
/* Used for testing if bugs encountered */
|
||||
#if 0
|
||||
void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control)
|
||||
|
@ -84,6 +84,8 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
|
||||
bool write,
|
||||
int num);
|
||||
|
||||
/*
 * Returns the maximum number of records as a uint32_t.
 * Declared without "inline": the body is not visible to includers of this
 * header, so an inline prototype here would only produce
 * "inline function declared but never defined" diagnostics.
 */
uint32_t amdgpu_ras_eeprom_get_record_max_length(void);
|
||||
|
||||
void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control);
|
||||
|
||||
#endif // _AMDGPU_RAS_EEPROM_H
|
||||
|
Loading…
Reference in New Issue
Block a user