drm/amdgpu: move convert_error_address out of umc_ras

The RAS error address translation algorithm is common
across dGPU and A + A platforms as long as the SOC
integrates the same generation of UMC IP.

UMC RAS is managed by x86 MCA on the A + A platform, so
umc_ras in the GPU driver is not initialized at all on
the A + A platform. In such a case, any umc_ras callback
implemented for the dGPU config shouldn't be invoked
from an A + A specific callback.

The change moves convert_error_address out of the dGPU
umc_ras structure and makes it shared between the A + A
and dGPU configs.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Stanley Yang <Stanley.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Hawking Zhang 2022-10-14 15:17:43 +08:00 committed by Alex Deucher
parent 027bf0cee8
commit 6c0ca74820
4 changed files with 18 additions and 12 deletions

View File

@ -36,6 +36,7 @@
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
#include "atom.h"
#include "amdgpu_reset.h"
#include "umc_v6_7.h"
#ifdef CONFIG_X86_MCE_AMD
#include <asm/mce.h>
@ -2899,10 +2900,17 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
/*
* Translate UMC channel address to Physical address
*/
if (adev->umc.ras &&
adev->umc.ras->convert_ras_error_address)
adev->umc.ras->convert_ras_error_address(adev,
&err_data, m->addr, ch_inst, umc_inst);
switch (adev->ip_versions[UMC_HWIP][0]) {
case IP_VERSION(6, 7, 0):
umc_v6_7_convert_error_address(adev,
&err_data, m->addr, ch_inst, umc_inst);
break;
default:
dev_warn(adev->dev,
"UMC address to Physical address translation is not supported\n");
kfree(err_data.err_addr);
return NOTIFY_DONE;
}
if (amdgpu_bad_page_threshold != 0) {
amdgpu_ras_add_bad_pages(adev, err_data.err_addr,

View File

@ -51,9 +51,6 @@ struct amdgpu_umc_ras {
struct amdgpu_ras_block_object ras_block;
void (*err_cnt_init)(struct amdgpu_device *adev);
bool (*query_ras_poison_mode)(struct amdgpu_device *adev);
void (*convert_ras_error_address)(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t err_addr,
uint32_t ch_inst, uint32_t umc_inst);
void (*ecc_info_query_ras_error_count)(struct amdgpu_device *adev,
void *ras_error_status);
void (*ecc_info_query_ras_error_address)(struct amdgpu_device *adev,

View File

@ -187,9 +187,9 @@ static void umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
}
}
static void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t err_addr,
uint32_t ch_inst, uint32_t umc_inst)
void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t err_addr,
uint32_t ch_inst, uint32_t umc_inst)
{
uint32_t channel_index;
uint64_t soc_pa, retired_page, column;
@ -553,5 +553,4 @@ struct amdgpu_umc_ras umc_v6_7_ras = {
.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
.ecc_info_query_ras_error_count = umc_v6_7_ecc_info_query_ras_error_count,
.ecc_info_query_ras_error_address = umc_v6_7_ecc_info_query_ras_error_address,
.convert_ras_error_address = umc_v6_7_convert_error_address,
};

View File

@ -71,5 +71,7 @@ extern const uint32_t
umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM];
extern const uint32_t
umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM];
void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t err_addr,
uint32_t ch_inst, uint32_t umc_inst);
#endif