mirror of
https://github.com/torvalds/linux.git
synced 2024-11-21 19:41:42 +00:00
amd-drm-next-6.10-2024-04-13:
amdgpu: - HDCP fixes - ODM fixes - RAS fixes - Devcoredump improvements - Misc code cleanups - Expose VCN activity via sysfs - SMY 13.0.x updates - Enable fast updates on DCN 3.1.4 - Add dclk and vclk reporting on additional devices - Add ACA RAS infrastructure - Implement TLB flush fence - EEPROM handling fixes - SMUIO 14.0.2 support - SMU 14.0.1 Updates - Sync page table freeing with TLB flushes - DML2 refactor - DC debug improvements - SR-IOV fixes - Suspend and Resume fixes - DCN 3.5.x Updates - Z8 fixes - UMSCH fixes - GPU reset fixes - HDP fix for second GFX pipe on GC 10.x - Enable secondary GFX pipe on GC 10.3 - Refactor and clean up BACO/BOCO/BAMACO handling - VCN partitioning fix - DC DWB fixes - VSC SDP fixes - DCN 3.1.6 fix - GC 11.5 fixes - Remove invalid TTM resource start check - DCN 1.0 fixes amdkfd: - MQD handling cleanup - Preemption handling fixes for XCDs - TLB flush fix for GC 9.4.2 - Properly clean up workqueue during module unload - Fix memory leak process create failure - Range check CP bad op exception targets to avoid reporting invalid exceptions to userspace radeon: - Misc code cleanups -----BEGIN PGP SIGNATURE----- iHUEABYKAB0WIQQgO5Idg2tXNTSZAr293/aFa7yZ2AUCZhr4EAAKCRC93/aFa7yZ 2B8jAP9z1JpOnjSQvc2mhHAooXRYO4Mj5HCQ25ZE8N4c8ZZhjAEAqefmEx5/UyLh lv2pWILL4o597qhq9nA7hJ6tTICLPAU= =HUwY -----END PGP SIGNATURE----- Merge tag 'amd-drm-next-6.10-2024-04-13' of https://gitlab.freedesktop.org/agd5f/linux into drm-next amd-drm-next-6.10-2024-04-13: amdgpu: - HDCP fixes - ODM fixes - RAS fixes - Devcoredump improvements - Misc code cleanups - Expose VCN activity via sysfs - SMY 13.0.x updates - Enable fast updates on DCN 3.1.4 - Add dclk and vclk reporting on additional devices - Add ACA RAS infrastructure - Implement TLB flush fence - EEPROM handling fixes - SMUIO 14.0.2 support - SMU 14.0.1 Updates - Sync page table freeing with TLB flushes - DML2 refactor - DC debug improvements - SR-IOV fixes - Suspend and Resume fixes - DCN 3.5.x Updates - Z8 fixes - UMSCH fixes - GPU reset fixes - HDP fix for second GFX pipe on GC 10.x - Enable secondary GFX pipe on GC 10.3 - Refactor and clean up BACO/BOCO/BAMACO handling - VCN partitioning fix - DC DWB fixes - VSC SDP fixes - DCN 3.1.6 fix - GC 11.5 fixes - Remove invalid TTM resource start check - DCN 1.0 fixes amdkfd: - MQD handling cleanup - Preemption handling fixes for XCDs - TLB flush fix for GC 9.4.2 - Properly clean up workqueue during module unload - Fix memory leak process create failure - Range check CP bad op exception targets to avoid reporting invalid exceptions to userspace radeon: - Misc code cleanups From: Alex Deucher <alexander.deucher@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20240413213708.3427038-1-alexander.deucher@amd.com Signed-off-by: Dave Airlie <airlied@redhat.com>
This commit is contained in:
commit
34633158b8
80
Documentation/gpu/amdgpu/debugging.rst
Normal file
80
Documentation/gpu/amdgpu/debugging.rst
Normal file
@ -0,0 +1,80 @@
|
||||
===============
|
||||
GPU Debugging
|
||||
===============
|
||||
|
||||
GPUVM Debugging
|
||||
===============
|
||||
|
||||
To aid in debugging GPU virtual memory related problems, the driver supports a
|
||||
number of options module parameters:
|
||||
|
||||
`vm_fault_stop` - If non-0, halt the GPU memory controller on a GPU page fault.
|
||||
|
||||
`vm_update_mode` - If non-0, use the CPU to update GPU page tables rather than
|
||||
the GPU.
|
||||
|
||||
|
||||
Decoding a GPUVM Page Fault
|
||||
===========================
|
||||
|
||||
If you see a GPU page fault in the kernel log, you can decode it to figure
|
||||
out what is going wrong in your application. A page fault in your kernel
|
||||
log may look something like this:
|
||||
|
||||
::
|
||||
|
||||
[gfxhub0] no-retry page fault (src_id:0 ring:24 vmid:3 pasid:32777, for process glxinfo pid 2424 thread glxinfo:cs0 pid 2425)
|
||||
in page starting at address 0x0000800102800000 from IH client 0x1b (UTCL2)
|
||||
VM_L2_PROTECTION_FAULT_STATUS:0x00301030
|
||||
Faulty UTCL2 client ID: TCP (0x8)
|
||||
MORE_FAULTS: 0x0
|
||||
WALKER_ERROR: 0x0
|
||||
PERMISSION_FAULTS: 0x3
|
||||
MAPPING_ERROR: 0x0
|
||||
RW: 0x0
|
||||
|
||||
First you have the memory hub, gfxhub and mmhub. gfxhub is the memory
|
||||
hub used for graphics, compute, and sdma on some chips. mmhub is the
|
||||
memory hub used for multi-media and sdma on some chips.
|
||||
|
||||
Next you have the vmid and pasid. If the vmid is 0, this fault was likely
|
||||
caused by the kernel driver or firmware. If the vmid is non-0, it is generally
|
||||
a fault in a user application. The pasid is used to link a vmid to a system
|
||||
process id. If the process is active when the fault happens, the process
|
||||
information will be printed.
|
||||
|
||||
The GPU virtual address that caused the fault comes next.
|
||||
|
||||
The client ID indicates the GPU block that caused the fault.
|
||||
Some common client IDs:
|
||||
|
||||
- CB/DB: The color/depth backend of the graphics pipe
|
||||
- CPF: Command Processor Frontend
|
||||
- CPC: Command Processor Compute
|
||||
- CPG: Command Processor Graphics
|
||||
- TCP/SQC/SQG: Shaders
|
||||
- SDMA: SDMA engines
|
||||
- VCN: Video encode/decode engines
|
||||
- JPEG: JPEG engines
|
||||
|
||||
PERMISSION_FAULTS describe what faults were encountered:
|
||||
|
||||
- bit 0: the PTE was not valid
|
||||
- bit 1: the PTE read bit was not set
|
||||
- bit 2: the PTE write bit was not set
|
||||
- bit 3: the PTE execute bit was not set
|
||||
|
||||
Finally, RW, indicates whether the access was a read (0) or a write (1).
|
||||
|
||||
In the example above, a shader (cliend id = TCP) generated a read (RW = 0x0) to
|
||||
an invalid page (PERMISSION_FAULTS = 0x3) at GPU virtual address
|
||||
0x0000800102800000. The user can then inspect their shader code and resource
|
||||
descriptor state to determine what caused the GPU page fault.
|
||||
|
||||
UMR
|
||||
===
|
||||
|
||||
`umr <https://gitlab.freedesktop.org/tomstdenis/umr>`_ is a general purpose
|
||||
GPU debugging and diagnostics tool. Please see the umr
|
||||
`documentation <https://umr.readthedocs.io/en/main/>`_ for more information
|
||||
about its capabilities.
|
@ -135,7 +135,7 @@ Enable underlay
|
||||
---------------
|
||||
|
||||
AMD display has this feature called underlay (which you can read more about at
|
||||
'Documentation/GPU/amdgpu/display/mpo-overview.rst') which is intended to
|
||||
'Documentation/gpu/amdgpu/display/mpo-overview.rst') which is intended to
|
||||
save power when playing a video. The basic idea is to put a video in the
|
||||
underlay plane at the bottom and the desktop in the plane above it with a hole
|
||||
in the video area. This feature is enabled in ChromeOS, and from our data
|
||||
|
@ -15,4 +15,5 @@ Next (GCN), Radeon DNA (RDNA), and Compute DNA (CDNA) architectures.
|
||||
ras
|
||||
thermal
|
||||
driver-misc
|
||||
debugging
|
||||
amdgpu-glossary
|
||||
|
@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
|
||||
amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \
|
||||
atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \
|
||||
atombios_encoders.o amdgpu_sa.o atombios_i2c.o \
|
||||
amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \
|
||||
amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \
|
||||
amdgpu_ib.o amdgpu_pll.o \
|
||||
amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
|
||||
amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
|
||||
amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
|
||||
@ -80,7 +81,7 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
|
||||
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
|
||||
amdgpu_fw_attestation.o amdgpu_securedisplay.o \
|
||||
amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
|
||||
amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o
|
||||
amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o amdgpu_dev_coredump.o
|
||||
|
||||
amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
|
||||
|
||||
@ -247,7 +248,8 @@ amdgpu-y += \
|
||||
smuio_v11_0_6.o \
|
||||
smuio_v13_0.o \
|
||||
smuio_v13_0_3.o \
|
||||
smuio_v13_0_6.o
|
||||
smuio_v13_0_6.o \
|
||||
smuio_v14_0_2.o
|
||||
|
||||
# add reset block
|
||||
amdgpu-y += \
|
||||
|
@ -210,6 +210,7 @@ extern int amdgpu_async_gfx_ring;
|
||||
extern int amdgpu_mcbp;
|
||||
extern int amdgpu_discovery;
|
||||
extern int amdgpu_mes;
|
||||
extern int amdgpu_mes_log_enable;
|
||||
extern int amdgpu_mes_kiq;
|
||||
extern int amdgpu_noretry;
|
||||
extern int amdgpu_force_asic_type;
|
||||
@ -605,7 +606,7 @@ struct amdgpu_asic_funcs {
|
||||
/* PCIe replay counter */
|
||||
uint64_t (*get_pcie_replay_count)(struct amdgpu_device *adev);
|
||||
/* device supports BACO */
|
||||
bool (*supports_baco)(struct amdgpu_device *adev);
|
||||
int (*supports_baco)(struct amdgpu_device *adev);
|
||||
/* pre asic_init quirks */
|
||||
void (*pre_asic_init)(struct amdgpu_device *adev);
|
||||
/* enter/exit umd stable pstate */
|
||||
@ -1407,7 +1408,7 @@ bool amdgpu_device_supports_atpx(struct drm_device *dev);
|
||||
bool amdgpu_device_supports_px(struct drm_device *dev);
|
||||
bool amdgpu_device_supports_boco(struct drm_device *dev);
|
||||
bool amdgpu_device_supports_smart_shift(struct drm_device *dev);
|
||||
bool amdgpu_device_supports_baco(struct drm_device *dev);
|
||||
int amdgpu_device_supports_baco(struct drm_device *dev);
|
||||
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
|
||||
struct amdgpu_device *peer_adev);
|
||||
int amdgpu_device_baco_enter(struct drm_device *dev);
|
||||
|
@ -28,7 +28,7 @@
|
||||
|
||||
#define ACA_BANK_HWID(type, hwid, mcatype) [ACA_HWIP_TYPE_##type] = {hwid, mcatype}
|
||||
|
||||
typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, void *data);
|
||||
typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);
|
||||
|
||||
struct aca_banks {
|
||||
int nr_banks;
|
||||
@ -86,7 +86,7 @@ static void aca_banks_release(struct aca_banks *banks)
|
||||
}
|
||||
}
|
||||
|
||||
static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_error_type type, u32 *count)
|
||||
static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count)
|
||||
{
|
||||
struct amdgpu_aca *aca = &adev->aca;
|
||||
const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
|
||||
@ -116,20 +116,22 @@ static struct aca_regs_dump {
|
||||
{"CONTROL_MASK", ACA_REG_IDX_CTL_MASK},
|
||||
};
|
||||
|
||||
static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank)
|
||||
static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank,
|
||||
struct ras_query_context *qctx)
|
||||
{
|
||||
u64 event_id = qctx ? qctx->event_id : 0ULL;
|
||||
int i;
|
||||
|
||||
dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events logged\n");
|
||||
RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
|
||||
/* plus 1 for output format, e.g: ACA[08/08]: xxxx */
|
||||
for (i = 0; i < ARRAY_SIZE(aca_regs); i++)
|
||||
dev_info(adev->dev, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n",
|
||||
idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]);
|
||||
RAS_EVENT_LOG(adev, event_id, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n",
|
||||
idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]);
|
||||
}
|
||||
|
||||
static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_error_type type,
|
||||
static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type,
|
||||
int start, int count,
|
||||
struct aca_banks *banks)
|
||||
struct aca_banks *banks, struct ras_query_context *qctx)
|
||||
{
|
||||
struct amdgpu_aca *aca = &adev->aca;
|
||||
const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
|
||||
@ -143,13 +145,12 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_erro
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
switch (type) {
|
||||
case ACA_ERROR_TYPE_UE:
|
||||
case ACA_SMU_TYPE_UE:
|
||||
max_count = smu_funcs->max_ue_bank_count;
|
||||
break;
|
||||
case ACA_ERROR_TYPE_CE:
|
||||
case ACA_SMU_TYPE_CE:
|
||||
max_count = smu_funcs->max_ce_bank_count;
|
||||
break;
|
||||
case ACA_ERROR_TYPE_DEFERRED:
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
@ -164,7 +165,9 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_erro
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
aca_smu_bank_dump(adev, i, count, &bank);
|
||||
bank.type = type;
|
||||
|
||||
aca_smu_bank_dump(adev, i, count, &bank, qctx);
|
||||
|
||||
ret = aca_banks_add_bank(banks, &bank);
|
||||
if (ret)
|
||||
@ -195,7 +198,7 @@ static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type t
|
||||
return hwip->hwid == hwid && hwip->mcatype == mcatype;
|
||||
}
|
||||
|
||||
static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type)
|
||||
static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)
|
||||
{
|
||||
const struct aca_bank_ops *bank_ops = handle->bank_ops;
|
||||
|
||||
@ -273,59 +276,49 @@ static struct aca_bank_error *get_bank_error(struct aca_error *aerr, struct aca_
|
||||
return new_bank_error(aerr, info);
|
||||
}
|
||||
|
||||
static int aca_log_errors(struct aca_handle *handle, enum aca_error_type type,
|
||||
struct aca_bank_report *report)
|
||||
int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info,
|
||||
enum aca_error_type type, u64 count)
|
||||
{
|
||||
struct aca_error_cache *error_cache = &handle->error_cache;
|
||||
struct aca_bank_error *bank_error;
|
||||
struct aca_error *aerr;
|
||||
|
||||
if (!handle || !report)
|
||||
if (!handle || !info || type >= ACA_ERROR_TYPE_COUNT)
|
||||
return -EINVAL;
|
||||
|
||||
if (!report->count[type])
|
||||
if (!count)
|
||||
return 0;
|
||||
|
||||
aerr = &error_cache->errors[type];
|
||||
bank_error = get_bank_error(aerr, &report->info);
|
||||
bank_error = get_bank_error(aerr, info);
|
||||
if (!bank_error)
|
||||
return -ENOMEM;
|
||||
|
||||
bank_error->count[type] += report->count[type];
|
||||
bank_error->count += count;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int aca_generate_bank_report(struct aca_handle *handle, struct aca_bank *bank,
|
||||
enum aca_error_type type, struct aca_bank_report *report)
|
||||
static int aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)
|
||||
{
|
||||
const struct aca_bank_ops *bank_ops = handle->bank_ops;
|
||||
|
||||
if (!bank || !report)
|
||||
if (!bank)
|
||||
return -EINVAL;
|
||||
|
||||
if (!bank_ops->aca_bank_generate_report)
|
||||
if (!bank_ops->aca_bank_parser)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
memset(report, 0, sizeof(*report));
|
||||
return bank_ops->aca_bank_generate_report(handle, bank, type,
|
||||
report, handle->data);
|
||||
return bank_ops->aca_bank_parser(handle, bank, type,
|
||||
handle->data);
|
||||
}
|
||||
|
||||
static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank *bank,
|
||||
enum aca_error_type type, void *data)
|
||||
enum aca_smu_type type, void *data)
|
||||
{
|
||||
struct aca_bank_report report;
|
||||
int ret;
|
||||
|
||||
ret = aca_generate_bank_report(handle, bank, type, &report);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (!report.count[type])
|
||||
return 0;
|
||||
|
||||
ret = aca_log_errors(handle, type, &report);
|
||||
ret = aca_bank_parser(handle, bank, type);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@ -333,7 +326,7 @@ static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank
|
||||
}
|
||||
|
||||
static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *bank,
|
||||
enum aca_error_type type, bank_handler_t handler, void *data)
|
||||
enum aca_smu_type type, bank_handler_t handler, void *data)
|
||||
{
|
||||
struct aca_handle *handle;
|
||||
int ret;
|
||||
@ -354,7 +347,7 @@ static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *ba
|
||||
}
|
||||
|
||||
static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *banks,
|
||||
enum aca_error_type type, bank_handler_t handler, void *data)
|
||||
enum aca_smu_type type, bank_handler_t handler, void *data)
|
||||
{
|
||||
struct aca_bank_node *node;
|
||||
struct aca_bank *bank;
|
||||
@ -378,8 +371,28 @@ static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type,
|
||||
bank_handler_t handler, void *data)
|
||||
static bool aca_bank_should_update(struct amdgpu_device *adev, enum aca_smu_type type)
|
||||
{
|
||||
struct amdgpu_aca *aca = &adev->aca;
|
||||
bool ret = true;
|
||||
|
||||
/*
|
||||
* Because the UE Valid MCA count will only be cleared after reset,
|
||||
* in order to avoid repeated counting of the error count,
|
||||
* the aca bank is only updated once during the gpu recovery stage.
|
||||
*/
|
||||
if (type == ACA_SMU_TYPE_UE) {
|
||||
if (amdgpu_ras_intr_triggered())
|
||||
ret = atomic_cmpxchg(&aca->ue_update_flag, 0, 1) == 0;
|
||||
else
|
||||
atomic_set(&aca->ue_update_flag, 0);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type,
|
||||
bank_handler_t handler, struct ras_query_context *qctx, void *data)
|
||||
{
|
||||
struct amdgpu_aca *aca = &adev->aca;
|
||||
struct aca_banks banks;
|
||||
@ -389,9 +402,8 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type
|
||||
if (list_empty(&aca->mgr.list))
|
||||
return 0;
|
||||
|
||||
/* NOTE: pmfw is only support UE and CE */
|
||||
if (type == ACA_ERROR_TYPE_DEFERRED)
|
||||
type = ACA_ERROR_TYPE_CE;
|
||||
if (!aca_bank_should_update(adev, type))
|
||||
return 0;
|
||||
|
||||
ret = aca_smu_get_valid_aca_count(adev, type, &count);
|
||||
if (ret)
|
||||
@ -402,7 +414,7 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type
|
||||
|
||||
aca_banks_init(&banks);
|
||||
|
||||
ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks);
|
||||
ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks, qctx);
|
||||
if (ret)
|
||||
goto err_release_banks;
|
||||
|
||||
@ -431,7 +443,7 @@ static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_er
|
||||
if (type >= ACA_ERROR_TYPE_COUNT)
|
||||
return -EINVAL;
|
||||
|
||||
count = bank_error->count[type];
|
||||
count = bank_error->count;
|
||||
if (!count)
|
||||
return 0;
|
||||
|
||||
@ -447,6 +459,8 @@ static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_er
|
||||
amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, count);
|
||||
break;
|
||||
case ACA_ERROR_TYPE_DEFERRED:
|
||||
amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, NULL, count);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@ -477,12 +491,25 @@ out_unlock:
|
||||
}
|
||||
|
||||
static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, enum aca_error_type type,
|
||||
struct ras_err_data *err_data)
|
||||
struct ras_err_data *err_data, struct ras_query_context *qctx)
|
||||
{
|
||||
enum aca_smu_type smu_type;
|
||||
int ret;
|
||||
|
||||
switch (type) {
|
||||
case ACA_ERROR_TYPE_UE:
|
||||
smu_type = ACA_SMU_TYPE_UE;
|
||||
break;
|
||||
case ACA_ERROR_TYPE_CE:
|
||||
case ACA_ERROR_TYPE_DEFERRED:
|
||||
smu_type = ACA_SMU_TYPE_CE;
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* udpate aca bank to aca source error_cache first */
|
||||
ret = aca_banks_update(adev, type, handler_aca_log_bank_error, NULL);
|
||||
ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, qctx, NULL);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@ -498,10 +525,9 @@ static bool aca_handle_is_valid(struct aca_handle *handle)
|
||||
}
|
||||
|
||||
int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle,
|
||||
enum aca_error_type type, void *data)
|
||||
enum aca_error_type type, struct ras_err_data *err_data,
|
||||
struct ras_query_context *qctx)
|
||||
{
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)data;
|
||||
|
||||
if (!handle || !err_data)
|
||||
return -EINVAL;
|
||||
|
||||
@ -511,7 +537,7 @@ int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *han
|
||||
if (!(BIT(type) & handle->mask))
|
||||
return 0;
|
||||
|
||||
return __aca_get_error_data(adev, handle, type, err_data);
|
||||
return __aca_get_error_data(adev, handle, type, err_data, qctx);
|
||||
}
|
||||
|
||||
static void aca_error_init(struct aca_error *aerr, enum aca_error_type type)
|
||||
@ -668,6 +694,8 @@ int amdgpu_aca_init(struct amdgpu_device *adev)
|
||||
struct amdgpu_aca *aca = &adev->aca;
|
||||
int ret;
|
||||
|
||||
atomic_set(&aca->ue_update_flag, 0);
|
||||
|
||||
ret = aca_manager_init(&aca->mgr);
|
||||
if (ret)
|
||||
return ret;
|
||||
@ -680,6 +708,8 @@ void amdgpu_aca_fini(struct amdgpu_device *adev)
|
||||
struct amdgpu_aca *aca = &adev->aca;
|
||||
|
||||
aca_manager_fini(&aca->mgr);
|
||||
|
||||
atomic_set(&aca->ue_update_flag, 0);
|
||||
}
|
||||
|
||||
int amdgpu_aca_reset(struct amdgpu_device *adev)
|
||||
@ -784,7 +814,7 @@ static int amdgpu_aca_smu_debug_mode_set(void *data, u64 val)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_error_type type, int idx)
|
||||
static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_smu_type type, int idx)
|
||||
{
|
||||
struct aca_bank_info info;
|
||||
int i, ret;
|
||||
@ -793,7 +823,7 @@ static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_e
|
||||
if (ret)
|
||||
return;
|
||||
|
||||
seq_printf(m, "aca entry[%d].type: %s\n", idx, type == ACA_ERROR_TYPE_UE ? "UE" : "CE");
|
||||
seq_printf(m, "aca entry[%d].type: %s\n", idx, type == ACA_SMU_TYPE_UE ? "UE" : "CE");
|
||||
seq_printf(m, "aca entry[%d].info: socketid:%d aid:%d hwid:0x%03x mcatype:0x%04x\n",
|
||||
idx, info.socket_id, info.die_id, info.hwid, info.mcatype);
|
||||
|
||||
@ -807,7 +837,7 @@ struct aca_dump_context {
|
||||
};
|
||||
|
||||
static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *bank,
|
||||
enum aca_error_type type, void *data)
|
||||
enum aca_smu_type type, void *data)
|
||||
{
|
||||
struct aca_dump_context *ctx = (struct aca_dump_context *)data;
|
||||
|
||||
@ -816,7 +846,7 @@ static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *ban
|
||||
return handler_aca_log_bank_error(handle, bank, type, NULL);
|
||||
}
|
||||
|
||||
static int aca_dump_show(struct seq_file *m, enum aca_error_type type)
|
||||
static int aca_dump_show(struct seq_file *m, enum aca_smu_type type)
|
||||
{
|
||||
struct amdgpu_device *adev = (struct amdgpu_device *)m->private;
|
||||
struct aca_dump_context context = {
|
||||
@ -824,12 +854,12 @@ static int aca_dump_show(struct seq_file *m, enum aca_error_type type)
|
||||
.idx = 0,
|
||||
};
|
||||
|
||||
return aca_banks_update(adev, type, handler_aca_bank_dump, (void *)&context);
|
||||
return aca_banks_update(adev, type, handler_aca_bank_dump, NULL, (void *)&context);
|
||||
}
|
||||
|
||||
static int aca_dump_ce_show(struct seq_file *m, void *unused)
|
||||
{
|
||||
return aca_dump_show(m, ACA_ERROR_TYPE_CE);
|
||||
return aca_dump_show(m, ACA_SMU_TYPE_CE);
|
||||
}
|
||||
|
||||
static int aca_dump_ce_open(struct inode *inode, struct file *file)
|
||||
@ -847,7 +877,7 @@ static const struct file_operations aca_ce_dump_debug_fops = {
|
||||
|
||||
static int aca_dump_ue_show(struct seq_file *m, void *unused)
|
||||
{
|
||||
return aca_dump_show(m, ACA_ERROR_TYPE_UE);
|
||||
return aca_dump_show(m, ACA_SMU_TYPE_UE);
|
||||
}
|
||||
|
||||
static int aca_dump_ue_open(struct inode *inode, struct file *file)
|
||||
|
@ -26,6 +26,9 @@
|
||||
|
||||
#include <linux/list.h>
|
||||
|
||||
struct ras_err_data;
|
||||
struct ras_query_context;
|
||||
|
||||
#define ACA_MAX_REGS_COUNT (16)
|
||||
|
||||
#define ACA_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> l)
|
||||
@ -99,7 +102,14 @@ enum aca_error_type {
|
||||
ACA_ERROR_TYPE_COUNT
|
||||
};
|
||||
|
||||
enum aca_smu_type {
|
||||
ACA_SMU_TYPE_UE = 0,
|
||||
ACA_SMU_TYPE_CE,
|
||||
ACA_SMU_TYPE_COUNT,
|
||||
};
|
||||
|
||||
struct aca_bank {
|
||||
enum aca_smu_type type;
|
||||
u64 regs[ACA_MAX_REGS_COUNT];
|
||||
};
|
||||
|
||||
@ -115,15 +125,10 @@ struct aca_bank_info {
|
||||
int mcatype;
|
||||
};
|
||||
|
||||
struct aca_bank_report {
|
||||
struct aca_bank_info info;
|
||||
u64 count[ACA_ERROR_TYPE_COUNT];
|
||||
};
|
||||
|
||||
struct aca_bank_error {
|
||||
struct list_head node;
|
||||
struct aca_bank_info info;
|
||||
u64 count[ACA_ERROR_TYPE_COUNT];
|
||||
u64 count;
|
||||
};
|
||||
|
||||
struct aca_error {
|
||||
@ -157,9 +162,8 @@ struct aca_handle {
|
||||
};
|
||||
|
||||
struct aca_bank_ops {
|
||||
int (*aca_bank_generate_report)(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
|
||||
struct aca_bank_report *report, void *data);
|
||||
bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
|
||||
int (*aca_bank_parser)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);
|
||||
bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type,
|
||||
void *data);
|
||||
};
|
||||
|
||||
@ -167,13 +171,14 @@ struct aca_smu_funcs {
|
||||
int max_ue_bank_count;
|
||||
int max_ce_bank_count;
|
||||
int (*set_debug_mode)(struct amdgpu_device *adev, bool enable);
|
||||
int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_error_type type, u32 *count);
|
||||
int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_error_type type, int idx, struct aca_bank *bank);
|
||||
int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count);
|
||||
int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_smu_type type, int idx, struct aca_bank *bank);
|
||||
};
|
||||
|
||||
struct amdgpu_aca {
|
||||
struct aca_handle_manager mgr;
|
||||
const struct aca_smu_funcs *smu_funcs;
|
||||
atomic_t ue_update_flag;
|
||||
bool is_enabled;
|
||||
};
|
||||
|
||||
@ -196,7 +201,10 @@ int amdgpu_aca_add_handle(struct amdgpu_device *adev, struct aca_handle *handle,
|
||||
const char *name, const struct aca_info *aca_info, void *data);
|
||||
void amdgpu_aca_remove_handle(struct aca_handle *handle);
|
||||
int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle,
|
||||
enum aca_error_type type, void *data);
|
||||
enum aca_error_type type, struct ras_err_data *err_data,
|
||||
struct ras_query_context *qctx);
|
||||
int amdgpu_aca_smu_set_debug_mode(struct amdgpu_device *adev, bool en);
|
||||
void amdgpu_aca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root);
|
||||
int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info,
|
||||
enum aca_error_type type, u64 count);
|
||||
#endif
|
||||
|
@ -748,7 +748,7 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
|
||||
}
|
||||
|
||||
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, bool reset)
|
||||
enum amdgpu_ras_block block, uint32_t reset)
|
||||
{
|
||||
amdgpu_umc_poison_handler(adev, block, reset);
|
||||
}
|
||||
@ -769,12 +769,20 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
|
||||
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
|
||||
int hub_inst, int hub_type)
|
||||
{
|
||||
if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status)
|
||||
return adev->gfx.ras->query_utcl2_poison_status(adev);
|
||||
else
|
||||
return false;
|
||||
if (!hub_type) {
|
||||
if (adev->gfxhub.funcs->query_utcl2_poison_status)
|
||||
return adev->gfxhub.funcs->query_utcl2_poison_status(adev, hub_inst);
|
||||
else
|
||||
return false;
|
||||
} else {
|
||||
if (adev->mmhub.funcs->query_utcl2_poison_status)
|
||||
return adev->mmhub.funcs->query_utcl2_poison_status(adev, hub_inst);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev)
|
||||
|
@ -336,12 +336,13 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
|
||||
int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
|
||||
struct tile_config *config);
|
||||
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, bool reset);
|
||||
enum amdgpu_ras_block block, uint32_t reset);
|
||||
bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
|
||||
bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
|
||||
void amdgpu_amdkfd_block_mmu_notifications(void *p);
|
||||
int amdgpu_amdkfd_criu_resume(void *p);
|
||||
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);
|
||||
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
|
||||
int hub_inst, int hub_type);
|
||||
int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
|
||||
uint64_t size, u32 alloc_flag, int8_t xcp_id);
|
||||
void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
|
||||
|
@ -34,6 +34,7 @@ union firmware_info {
|
||||
struct atom_firmware_info_v3_2 v32;
|
||||
struct atom_firmware_info_v3_3 v33;
|
||||
struct atom_firmware_info_v3_4 v34;
|
||||
struct atom_firmware_info_v3_5 v35;
|
||||
};
|
||||
|
||||
/*
|
||||
@ -872,6 +873,10 @@ int amdgpu_atomfirmware_get_fw_reserved_fb_size(struct amdgpu_device *adev)
|
||||
fw_reserved_fb_size =
|
||||
(firmware_info->v34.fw_reserved_size_in_kb << 10);
|
||||
break;
|
||||
case 5:
|
||||
fw_reserved_fb_size =
|
||||
(firmware_info->v35.fw_reserved_size_in_kb << 10);
|
||||
break;
|
||||
default:
|
||||
fw_reserved_fb_size = 0;
|
||||
break;
|
||||
|
345
drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
Normal file
345
drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
Normal file
@ -0,0 +1,345 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
/*
|
||||
* Copyright 2024 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <generated/utsrelease.h>
|
||||
#include <linux/devcoredump.h>
|
||||
#include "amdgpu_dev_coredump.h"
|
||||
#include "atom.h"
|
||||
|
||||
#ifndef CONFIG_DEV_COREDUMP
|
||||
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
|
||||
struct amdgpu_reset_context *reset_context)
|
||||
{
|
||||
}
|
||||
#else
|
||||
|
||||
const char *hw_ip_names[MAX_HWIP] = {
|
||||
[GC_HWIP] = "GC",
|
||||
[HDP_HWIP] = "HDP",
|
||||
[SDMA0_HWIP] = "SDMA0",
|
||||
[SDMA1_HWIP] = "SDMA1",
|
||||
[SDMA2_HWIP] = "SDMA2",
|
||||
[SDMA3_HWIP] = "SDMA3",
|
||||
[SDMA4_HWIP] = "SDMA4",
|
||||
[SDMA5_HWIP] = "SDMA5",
|
||||
[SDMA6_HWIP] = "SDMA6",
|
||||
[SDMA7_HWIP] = "SDMA7",
|
||||
[LSDMA_HWIP] = "LSDMA",
|
||||
[MMHUB_HWIP] = "MMHUB",
|
||||
[ATHUB_HWIP] = "ATHUB",
|
||||
[NBIO_HWIP] = "NBIO",
|
||||
[MP0_HWIP] = "MP0",
|
||||
[MP1_HWIP] = "MP1",
|
||||
[UVD_HWIP] = "UVD/JPEG/VCN",
|
||||
[VCN1_HWIP] = "VCN1",
|
||||
[VCE_HWIP] = "VCE",
|
||||
[VPE_HWIP] = "VPE",
|
||||
[DF_HWIP] = "DF",
|
||||
[DCE_HWIP] = "DCE",
|
||||
[OSSSYS_HWIP] = "OSSSYS",
|
||||
[SMUIO_HWIP] = "SMUIO",
|
||||
[PWR_HWIP] = "PWR",
|
||||
[NBIF_HWIP] = "NBIF",
|
||||
[THM_HWIP] = "THM",
|
||||
[CLK_HWIP] = "CLK",
|
||||
[UMC_HWIP] = "UMC",
|
||||
[RSMU_HWIP] = "RSMU",
|
||||
[XGMI_HWIP] = "XGMI",
|
||||
[DCI_HWIP] = "DCI",
|
||||
[PCIE_HWIP] = "PCIE",
|
||||
};
|
||||
|
||||
static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev,
|
||||
struct drm_printer *p)
|
||||
{
|
||||
uint32_t version;
|
||||
uint32_t feature;
|
||||
uint8_t smu_program, smu_major, smu_minor, smu_debug;
|
||||
struct atom_context *ctx = adev->mode_info.atom_context;
|
||||
|
||||
drm_printf(p, "VCE feature version: %u, fw version: 0x%08x\n",
|
||||
adev->vce.fb_version, adev->vce.fw_version);
|
||||
drm_printf(p, "UVD feature version: %u, fw version: 0x%08x\n", 0,
|
||||
adev->uvd.fw_version);
|
||||
drm_printf(p, "GMC feature version: %u, fw version: 0x%08x\n", 0,
|
||||
adev->gmc.fw_version);
|
||||
drm_printf(p, "ME feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.me_feature_version, adev->gfx.me_fw_version);
|
||||
drm_printf(p, "PFP feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.pfp_feature_version, adev->gfx.pfp_fw_version);
|
||||
drm_printf(p, "CE feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.ce_feature_version, adev->gfx.ce_fw_version);
|
||||
drm_printf(p, "RLC feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.rlc_feature_version, adev->gfx.rlc_fw_version);
|
||||
|
||||
drm_printf(p, "RLC SRLC feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.rlc_srlc_feature_version,
|
||||
adev->gfx.rlc_srlc_fw_version);
|
||||
drm_printf(p, "RLC SRLG feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.rlc_srlg_feature_version,
|
||||
adev->gfx.rlc_srlg_fw_version);
|
||||
drm_printf(p, "RLC SRLS feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.rlc_srls_feature_version,
|
||||
adev->gfx.rlc_srls_fw_version);
|
||||
drm_printf(p, "RLCP feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.rlcp_ucode_feature_version,
|
||||
adev->gfx.rlcp_ucode_version);
|
||||
drm_printf(p, "RLCV feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.rlcv_ucode_feature_version,
|
||||
adev->gfx.rlcv_ucode_version);
|
||||
drm_printf(p, "MEC feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.mec_feature_version, adev->gfx.mec_fw_version);
|
||||
|
||||
if (adev->gfx.mec2_fw)
|
||||
drm_printf(p, "MEC2 feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.mec2_feature_version,
|
||||
adev->gfx.mec2_fw_version);
|
||||
|
||||
drm_printf(p, "IMU feature version: %u, fw version: 0x%08x\n", 0,
|
||||
adev->gfx.imu_fw_version);
|
||||
drm_printf(p, "PSP SOS feature version: %u, fw version: 0x%08x\n",
|
||||
adev->psp.sos.feature_version, adev->psp.sos.fw_version);
|
||||
drm_printf(p, "PSP ASD feature version: %u, fw version: 0x%08x\n",
|
||||
adev->psp.asd_context.bin_desc.feature_version,
|
||||
adev->psp.asd_context.bin_desc.fw_version);
|
||||
|
||||
drm_printf(p, "TA XGMI feature version: 0x%08x, fw version: 0x%08x\n",
|
||||
adev->psp.xgmi_context.context.bin_desc.feature_version,
|
||||
adev->psp.xgmi_context.context.bin_desc.fw_version);
|
||||
drm_printf(p, "TA RAS feature version: 0x%08x, fw version: 0x%08x\n",
|
||||
adev->psp.ras_context.context.bin_desc.feature_version,
|
||||
adev->psp.ras_context.context.bin_desc.fw_version);
|
||||
drm_printf(p, "TA HDCP feature version: 0x%08x, fw version: 0x%08x\n",
|
||||
adev->psp.hdcp_context.context.bin_desc.feature_version,
|
||||
adev->psp.hdcp_context.context.bin_desc.fw_version);
|
||||
drm_printf(p, "TA DTM feature version: 0x%08x, fw version: 0x%08x\n",
|
||||
adev->psp.dtm_context.context.bin_desc.feature_version,
|
||||
adev->psp.dtm_context.context.bin_desc.fw_version);
|
||||
drm_printf(p, "TA RAP feature version: 0x%08x, fw version: 0x%08x\n",
|
||||
adev->psp.rap_context.context.bin_desc.feature_version,
|
||||
adev->psp.rap_context.context.bin_desc.fw_version);
|
||||
drm_printf(p,
|
||||
"TA SECURE DISPLAY feature version: 0x%08x, fw version: 0x%08x\n",
|
||||
adev->psp.securedisplay_context.context.bin_desc.feature_version,
|
||||
adev->psp.securedisplay_context.context.bin_desc.fw_version);
|
||||
|
||||
/* SMC firmware */
|
||||
version = adev->pm.fw_version;
|
||||
|
||||
smu_program = (version >> 24) & 0xff;
|
||||
smu_major = (version >> 16) & 0xff;
|
||||
smu_minor = (version >> 8) & 0xff;
|
||||
smu_debug = (version >> 0) & 0xff;
|
||||
drm_printf(p,
|
||||
"SMC feature version: %u, program: %d, fw version: 0x%08x (%d.%d.%d)\n",
|
||||
0, smu_program, version, smu_major, smu_minor, smu_debug);
|
||||
|
||||
/* SDMA firmware */
|
||||
for (int i = 0; i < adev->sdma.num_instances; i++) {
|
||||
drm_printf(p,
|
||||
"SDMA%d feature version: %u, firmware version: 0x%08x\n",
|
||||
i, adev->sdma.instance[i].feature_version,
|
||||
adev->sdma.instance[i].fw_version);
|
||||
}
|
||||
|
||||
drm_printf(p, "VCN feature version: %u, fw version: 0x%08x\n", 0,
|
||||
adev->vcn.fw_version);
|
||||
drm_printf(p, "DMCU feature version: %u, fw version: 0x%08x\n", 0,
|
||||
adev->dm.dmcu_fw_version);
|
||||
drm_printf(p, "DMCUB feature version: %u, fw version: 0x%08x\n", 0,
|
||||
adev->dm.dmcub_fw_version);
|
||||
drm_printf(p, "PSP TOC feature version: %u, fw version: 0x%08x\n",
|
||||
adev->psp.toc.feature_version, adev->psp.toc.fw_version);
|
||||
|
||||
version = adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK;
|
||||
feature = (adev->mes.kiq_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
|
||||
AMDGPU_MES_FEAT_VERSION_SHIFT;
|
||||
drm_printf(p, "MES_KIQ feature version: %u, fw version: 0x%08x\n",
|
||||
feature, version);
|
||||
|
||||
version = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK;
|
||||
feature = (adev->mes.sched_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
|
||||
AMDGPU_MES_FEAT_VERSION_SHIFT;
|
||||
drm_printf(p, "MES feature version: %u, fw version: 0x%08x\n", feature,
|
||||
version);
|
||||
|
||||
drm_printf(p, "VPE feature version: %u, fw version: 0x%08x\n",
|
||||
adev->vpe.feature_version, adev->vpe.fw_version);
|
||||
|
||||
drm_printf(p, "\nVBIOS Information\n");
|
||||
drm_printf(p, "name: %s\n", ctx->name);
|
||||
drm_printf(p, "pn %s\n", ctx->vbios_pn);
|
||||
drm_printf(p, "version: %s\n", ctx->vbios_ver_str);
|
||||
drm_printf(p, "date: %s\n", ctx->date);
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
|
||||
void *data, size_t datalen)
|
||||
{
|
||||
struct drm_printer p;
|
||||
struct amdgpu_coredump_info *coredump = data;
|
||||
struct drm_print_iterator iter;
|
||||
struct amdgpu_vm_fault_info *fault_info;
|
||||
int i, ver;
|
||||
|
||||
iter.data = buffer;
|
||||
iter.offset = 0;
|
||||
iter.start = offset;
|
||||
iter.remain = count;
|
||||
|
||||
p = drm_coredump_printer(&iter);
|
||||
|
||||
drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
|
||||
drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n");
|
||||
drm_printf(&p, "kernel: " UTS_RELEASE "\n");
|
||||
drm_printf(&p, "module: " KBUILD_MODNAME "\n");
|
||||
drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec,
|
||||
coredump->reset_time.tv_nsec);
|
||||
|
||||
if (coredump->reset_task_info.pid)
|
||||
drm_printf(&p, "process_name: %s PID: %d\n",
|
||||
coredump->reset_task_info.process_name,
|
||||
coredump->reset_task_info.pid);
|
||||
|
||||
/* GPU IP's information of the SOC */
|
||||
drm_printf(&p, "\nIP Information\n");
|
||||
drm_printf(&p, "SOC Family: %d\n", coredump->adev->family);
|
||||
drm_printf(&p, "SOC Revision id: %d\n", coredump->adev->rev_id);
|
||||
drm_printf(&p, "SOC External Revision id: %d\n", coredump->adev->external_rev_id);
|
||||
|
||||
for (int i = 1; i < MAX_HWIP; i++) {
|
||||
for (int j = 0; j < HWIP_MAX_INSTANCE; j++) {
|
||||
ver = coredump->adev->ip_versions[i][j];
|
||||
if (ver)
|
||||
drm_printf(&p, "HWIP: %s[%d][%d]: v%d.%d.%d.%d.%d\n",
|
||||
hw_ip_names[i], i, j,
|
||||
IP_VERSION_MAJ(ver),
|
||||
IP_VERSION_MIN(ver),
|
||||
IP_VERSION_REV(ver),
|
||||
IP_VERSION_VARIANT(ver),
|
||||
IP_VERSION_SUBREV(ver));
|
||||
}
|
||||
}
|
||||
|
||||
/* IP firmware information */
|
||||
drm_printf(&p, "\nIP Firmwares\n");
|
||||
amdgpu_devcoredump_fw_info(coredump->adev, &p);
|
||||
|
||||
if (coredump->ring) {
|
||||
drm_printf(&p, "\nRing timed out details\n");
|
||||
drm_printf(&p, "IP Type: %d Ring Name: %s\n",
|
||||
coredump->ring->funcs->type,
|
||||
coredump->ring->name);
|
||||
}
|
||||
|
||||
/* Add page fault information */
|
||||
fault_info = &coredump->adev->vm_manager.fault_info;
|
||||
drm_printf(&p, "\n[%s] Page fault observed\n",
|
||||
fault_info->vmhub ? "mmhub" : "gfxhub");
|
||||
drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
|
||||
drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);
|
||||
|
||||
/* Add ring buffer information */
|
||||
drm_printf(&p, "Ring buffer information\n");
|
||||
for (int i = 0; i < coredump->adev->num_rings; i++) {
|
||||
int j = 0;
|
||||
struct amdgpu_ring *ring = coredump->adev->rings[i];
|
||||
|
||||
drm_printf(&p, "ring name: %s\n", ring->name);
|
||||
drm_printf(&p, "Rptr: 0x%llx Wptr: 0x%llx RB mask: %x\n",
|
||||
amdgpu_ring_get_rptr(ring),
|
||||
amdgpu_ring_get_wptr(ring),
|
||||
ring->buf_mask);
|
||||
drm_printf(&p, "Ring size in dwords: %d\n",
|
||||
ring->ring_size / 4);
|
||||
drm_printf(&p, "Ring contents\n");
|
||||
drm_printf(&p, "Offset \t Value\n");
|
||||
|
||||
while (j < ring->ring_size) {
|
||||
drm_printf(&p, "0x%x \t 0x%x\n", j, ring->ring[j / 4]);
|
||||
j += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (coredump->reset_vram_lost)
|
||||
drm_printf(&p, "VRAM is lost due to GPU reset!\n");
|
||||
if (coredump->adev->reset_info.num_regs) {
|
||||
drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
|
||||
|
||||
for (i = 0; i < coredump->adev->reset_info.num_regs; i++)
|
||||
drm_printf(&p, "0x%08x: 0x%08x\n",
|
||||
coredump->adev->reset_info.reset_dump_reg_list[i],
|
||||
coredump->adev->reset_info.reset_dump_reg_value[i]);
|
||||
}
|
||||
|
||||
return count - iter.remain;
|
||||
}
|
||||
|
||||
static void amdgpu_devcoredump_free(void *data)
|
||||
{
|
||||
kfree(data);
|
||||
}
|
||||
|
||||
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
|
||||
struct amdgpu_reset_context *reset_context)
|
||||
{
|
||||
struct amdgpu_coredump_info *coredump;
|
||||
struct drm_device *dev = adev_to_drm(adev);
|
||||
struct amdgpu_job *job = reset_context->job;
|
||||
struct drm_sched_job *s_job;
|
||||
|
||||
coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
|
||||
|
||||
if (!coredump) {
|
||||
DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
coredump->reset_vram_lost = vram_lost;
|
||||
|
||||
if (reset_context->job && reset_context->job->vm) {
|
||||
struct amdgpu_task_info *ti;
|
||||
struct amdgpu_vm *vm = reset_context->job->vm;
|
||||
|
||||
ti = amdgpu_vm_get_task_info_vm(vm);
|
||||
if (ti) {
|
||||
coredump->reset_task_info = *ti;
|
||||
amdgpu_vm_put_task_info(ti);
|
||||
}
|
||||
}
|
||||
|
||||
if (job) {
|
||||
s_job = &job->base;
|
||||
coredump->ring = to_amdgpu_ring(s_job->sched);
|
||||
}
|
||||
|
||||
coredump->adev = adev;
|
||||
|
||||
ktime_get_ts64(&coredump->reset_time);
|
||||
|
||||
dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
|
||||
amdgpu_devcoredump_read, amdgpu_devcoredump_free);
|
||||
}
|
||||
#endif
|
47
drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
Normal file
47
drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
Normal file
@ -0,0 +1,47 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/*
|
||||
* Copyright 2024 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __AMDGPU_DEV_COREDUMP_H__
|
||||
#define __AMDGPU_DEV_COREDUMP_H__
|
||||
|
||||
#include "amdgpu.h"
|
||||
#include "amdgpu_reset.h"
|
||||
|
||||
#ifdef CONFIG_DEV_COREDUMP
|
||||
|
||||
#define AMDGPU_COREDUMP_VERSION "1"
|
||||
|
||||
struct amdgpu_coredump_info {
|
||||
struct amdgpu_device *adev;
|
||||
struct amdgpu_task_info reset_task_info;
|
||||
struct timespec64 reset_time;
|
||||
bool reset_vram_lost;
|
||||
struct amdgpu_ring *ring;
|
||||
};
|
||||
#endif
|
||||
|
||||
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
|
||||
struct amdgpu_reset_context *reset_context);
|
||||
|
||||
#endif
|
@ -74,6 +74,7 @@
|
||||
#include "amdgpu_fru_eeprom.h"
|
||||
#include "amdgpu_reset.h"
|
||||
#include "amdgpu_virt.h"
|
||||
#include "amdgpu_dev_coredump.h"
|
||||
|
||||
#include <linux/suspend.h>
|
||||
#include <drm/task_barrier.h>
|
||||
@ -143,6 +144,8 @@ const char *amdgpu_asic_name[] = {
|
||||
"LAST",
|
||||
};
|
||||
|
||||
static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
|
||||
|
||||
/**
|
||||
* DOC: pcie_replay_count
|
||||
*
|
||||
@ -335,10 +338,12 @@ bool amdgpu_device_supports_boco(struct drm_device *dev)
|
||||
*
|
||||
* @dev: drm_device pointer
|
||||
*
|
||||
* Returns true if the device supporte BACO,
|
||||
* otherwise return false.
|
||||
* Return:
|
||||
* 1 if the device supporte BACO;
|
||||
* 3 if the device support MACO (only works if BACO is supported)
|
||||
* otherwise return 0.
|
||||
*/
|
||||
bool amdgpu_device_supports_baco(struct drm_device *dev)
|
||||
int amdgpu_device_supports_baco(struct drm_device *dev)
|
||||
{
|
||||
struct amdgpu_device *adev = drm_to_adev(dev);
|
||||
|
||||
@ -4069,6 +4074,13 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
/* Enable TMZ based on IP_VERSION */
|
||||
amdgpu_gmc_tmz_set(adev);
|
||||
|
||||
if (amdgpu_sriov_vf(adev) &&
|
||||
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
|
||||
/* VF MMIO access (except mailbox range) from CPU
|
||||
* will be blocked during sriov runtime
|
||||
*/
|
||||
adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
|
||||
|
||||
amdgpu_gmc_noretry_set(adev);
|
||||
/* Need to get xgmi info early to decide the reset behavior*/
|
||||
if (adev->gmc.xgmi.supported) {
|
||||
@ -4135,18 +4147,22 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
adev->ip_blocks[i].status.hw = true;
|
||||
}
|
||||
}
|
||||
} else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
|
||||
!amdgpu_device_has_display_hardware(adev)) {
|
||||
r = psp_gpu_reset(adev);
|
||||
} else {
|
||||
tmp = amdgpu_reset_method;
|
||||
/* It should do a default reset when loading or reloading the driver,
|
||||
* regardless of the module parameter reset_method.
|
||||
*/
|
||||
amdgpu_reset_method = AMD_RESET_METHOD_NONE;
|
||||
r = amdgpu_asic_reset(adev);
|
||||
amdgpu_reset_method = tmp;
|
||||
if (r) {
|
||||
dev_err(adev->dev, "asic reset on init failed\n");
|
||||
goto failed;
|
||||
}
|
||||
tmp = amdgpu_reset_method;
|
||||
/* It should do a default reset when loading or reloading the driver,
|
||||
* regardless of the module parameter reset_method.
|
||||
*/
|
||||
amdgpu_reset_method = AMD_RESET_METHOD_NONE;
|
||||
r = amdgpu_asic_reset(adev);
|
||||
amdgpu_reset_method = tmp;
|
||||
}
|
||||
|
||||
if (r) {
|
||||
dev_err(adev->dev, "asic reset on init failed\n");
|
||||
goto failed;
|
||||
}
|
||||
}
|
||||
|
||||
@ -4970,12 +4986,15 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
|
||||
retry:
|
||||
amdgpu_amdkfd_pre_reset(adev);
|
||||
|
||||
amdgpu_device_stop_pending_resets(adev);
|
||||
|
||||
if (from_hypervisor)
|
||||
r = amdgpu_virt_request_full_gpu(adev, true);
|
||||
else
|
||||
r = amdgpu_virt_reset_gpu(adev);
|
||||
if (r)
|
||||
return r;
|
||||
amdgpu_ras_set_fed(adev, false);
|
||||
amdgpu_irq_gpu_reset_resume_helper(adev);
|
||||
|
||||
/* some sw clean up VF needs to do before recover */
|
||||
@ -5534,6 +5553,23 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
|
||||
|
||||
}
|
||||
|
||||
static int amdgpu_device_health_check(struct list_head *device_list_handle)
|
||||
{
|
||||
struct amdgpu_device *tmp_adev;
|
||||
int ret = 0;
|
||||
u32 status;
|
||||
|
||||
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
|
||||
pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
|
||||
if (PCI_POSSIBLE_ERROR(status)) {
|
||||
dev_err(tmp_adev->dev, "device lost from bus!");
|
||||
ret = -ENODEV;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_gpu_recover - reset the asic and recover scheduler
|
||||
*
|
||||
@ -5605,6 +5641,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
device_list_handle = &device_list;
|
||||
}
|
||||
|
||||
if (!amdgpu_sriov_vf(adev)) {
|
||||
r = amdgpu_device_health_check(device_list_handle);
|
||||
if (r)
|
||||
goto end_reset;
|
||||
}
|
||||
|
||||
/* We need to lock reset domain only once both for XGMI and single device */
|
||||
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
|
||||
reset_list);
|
||||
@ -5687,11 +5729,12 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
|
||||
tmp_adev->asic_reset_res = r;
|
||||
}
|
||||
|
||||
/*
|
||||
* Drop all pending non scheduler resets. Scheduler resets
|
||||
* were already dropped during drm_sched_stop
|
||||
*/
|
||||
amdgpu_device_stop_pending_resets(tmp_adev);
|
||||
if (!amdgpu_sriov_vf(tmp_adev))
|
||||
/*
|
||||
* Drop all pending non scheduler resets. Scheduler resets
|
||||
* were already dropped during drm_sched_stop
|
||||
*/
|
||||
amdgpu_device_stop_pending_resets(tmp_adev);
|
||||
}
|
||||
|
||||
/* Actual ASIC resets if needed.*/
|
||||
@ -5770,6 +5813,7 @@ skip_sched_resume:
|
||||
reset_list);
|
||||
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
|
||||
|
||||
end_reset:
|
||||
if (hive) {
|
||||
mutex_unlock(&hive->hive_lock);
|
||||
amdgpu_put_xgmi_hive(hive);
|
||||
|
@ -97,6 +97,7 @@
|
||||
#include "smuio_v13_0.h"
|
||||
#include "smuio_v13_0_3.h"
|
||||
#include "smuio_v13_0_6.h"
|
||||
#include "smuio_v14_0_2.h"
|
||||
#include "vcn_v5_0_0.h"
|
||||
#include "jpeg_v5_0_0.h"
|
||||
|
||||
@ -245,12 +246,16 @@ static int amdgpu_discovery_read_binary_from_sysmem(struct amdgpu_device *adev,
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
#define IP_DISCOVERY_V2 2
|
||||
#define IP_DISCOVERY_V4 4
|
||||
|
||||
static int amdgpu_discovery_read_binary_from_mem(struct amdgpu_device *adev,
|
||||
uint8_t *binary)
|
||||
{
|
||||
uint64_t vram_size;
|
||||
u32 msg;
|
||||
int i, ret = 0;
|
||||
int ip_discovery_ver = 0;
|
||||
|
||||
/* It can take up to a second for IFWI init to complete on some dGPUs,
|
||||
* but generally it should be in the 60-100ms range. Normally this starts
|
||||
@ -259,7 +264,11 @@ static int amdgpu_discovery_read_binary_from_mem(struct amdgpu_device *adev,
|
||||
* wait for this to complete. Once the C2PMSG is updated, we can
|
||||
* continue.
|
||||
*/
|
||||
if (dev_is_removable(&adev->pdev->dev)) {
|
||||
|
||||
ip_discovery_ver = RREG32(mmIP_DISCOVERY_VERSION);
|
||||
if ((dev_is_removable(&adev->pdev->dev)) ||
|
||||
(ip_discovery_ver == IP_DISCOVERY_V2) ||
|
||||
(ip_discovery_ver == IP_DISCOVERY_V4)) {
|
||||
for (i = 0; i < 1000; i++) {
|
||||
msg = RREG32(mmMP0_SMN_C2PMSG_33);
|
||||
if (msg & 0x80000000)
|
||||
@ -1896,6 +1905,7 @@ static int amdgpu_discovery_set_smu_ip_blocks(struct amdgpu_device *adev)
|
||||
amdgpu_device_ip_block_add(adev, &smu_v13_0_ip_block);
|
||||
break;
|
||||
case IP_VERSION(14, 0, 0):
|
||||
case IP_VERSION(14, 0, 1):
|
||||
amdgpu_device_ip_block_add(adev, &smu_v14_0_ip_block);
|
||||
break;
|
||||
default:
|
||||
@ -2677,6 +2687,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
|
||||
case IP_VERSION(14, 0, 1):
|
||||
adev->smuio.funcs = &smuio_v13_0_6_funcs;
|
||||
break;
|
||||
case IP_VERSION(14, 0, 2):
|
||||
adev->smuio.funcs = &smuio_v14_0_2_funcs;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
@ -195,6 +195,7 @@ int amdgpu_async_gfx_ring = 1;
|
||||
int amdgpu_mcbp = -1;
|
||||
int amdgpu_discovery = -1;
|
||||
int amdgpu_mes;
|
||||
int amdgpu_mes_log_enable = 0;
|
||||
int amdgpu_mes_kiq;
|
||||
int amdgpu_noretry = -1;
|
||||
int amdgpu_force_asic_type = -1;
|
||||
@ -667,6 +668,15 @@ MODULE_PARM_DESC(mes,
|
||||
"Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)");
|
||||
module_param_named(mes, amdgpu_mes, int, 0444);
|
||||
|
||||
/**
|
||||
* DOC: mes_log_enable (int)
|
||||
* Enable Micro Engine Scheduler log. This is used to enable/disable MES internal log.
|
||||
* (0 = disabled (default), 1 = enabled)
|
||||
*/
|
||||
MODULE_PARM_DESC(mes_log_enable,
|
||||
"Enable Micro Engine Scheduler log (0 = disabled (default), 1 = enabled)");
|
||||
module_param_named(mes_log_enable, amdgpu_mes_log_enable, int, 0444);
|
||||
|
||||
/**
|
||||
* DOC: mes_kiq (int)
|
||||
* Enable Micro Engine Scheduler KIQ. This is a new engine pipe for kiq.
|
||||
@ -2734,7 +2744,8 @@ static int amdgpu_pmops_runtime_suspend(struct device *dev)
|
||||
drm_dev->switch_power_state = DRM_SWITCH_POWER_DYNAMIC_OFF;
|
||||
} else if (adev->pm.rpm_mode == AMDGPU_RUNPM_BOCO) {
|
||||
/* nothing to do */
|
||||
} else if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
|
||||
} else if ((adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) ||
|
||||
(adev->pm.rpm_mode == AMDGPU_RUNPM_BAMACO)) {
|
||||
amdgpu_device_baco_enter(drm_dev);
|
||||
}
|
||||
|
||||
@ -2774,7 +2785,8 @@ static int amdgpu_pmops_runtime_resume(struct device *dev)
|
||||
* PCI core handles it for _PR3.
|
||||
*/
|
||||
pci_set_master(pdev);
|
||||
} else if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
|
||||
} else if ((adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) ||
|
||||
(adev->pm.rpm_mode == AMDGPU_RUNPM_BAMACO)) {
|
||||
amdgpu_device_baco_exit(drm_dev);
|
||||
}
|
||||
ret = amdgpu_device_resume(drm_dev, false);
|
||||
|
@ -259,7 +259,6 @@ struct amdgpu_cu_info {
|
||||
struct amdgpu_gfx_ras {
|
||||
struct amdgpu_ras_block_object ras_block;
|
||||
void (*enable_watchdog_timer)(struct amdgpu_device *adev);
|
||||
bool (*query_utcl2_poison_status)(struct amdgpu_device *adev);
|
||||
int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,
|
||||
struct amdgpu_irq_src *source,
|
||||
struct amdgpu_iv_entry *entry);
|
||||
|
@@ -38,6 +38,8 @@ struct amdgpu_gfxhub_funcs {
void (*mode2_save_regs)(struct amdgpu_device *adev);
void (*mode2_restore_regs)(struct amdgpu_device *adev);
void (*halt)(struct amdgpu_device *adev);
bool (*query_utcl2_poison_status)(struct amdgpu_device *adev,
int xcc_id);
};

struct amdgpu_gfxhub {
@@ -304,12 +304,15 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
dma_fence_set_error(finished, -ECANCELED);

if (finished->error < 0) {
DRM_INFO("Skip scheduling IBs!\n");
dev_dbg(adev->dev, "Skip scheduling IBs in ring(%s)",
ring->name);
} else {
r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
&fence);
if (r)
DRM_ERROR("Error scheduling IBs (%d)\n", r);
dev_err(adev->dev,
"Error scheduling IBs (%d) in ring(%s)", r,
ring->name);
}

job->job_run_counter++;
@@ -133,6 +133,7 @@ void amdgpu_register_gpu_instance(struct amdgpu_device *adev)
int amdgpu_driver_load_kms(struct amdgpu_device *adev, unsigned long flags)
{
struct drm_device *dev;
int bamaco_support = 0;
int r, acpi_status;

dev = adev_to_drm(adev);
@@ -158,8 +159,12 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, unsigned long flags)
(amdgpu_runtime_pm != 0)) { /* enable boco as runtime mode */
adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
dev_info(adev->dev, "Using BOCO for runtime pm\n");
} else if (amdgpu_device_supports_baco(dev) &&
(amdgpu_runtime_pm != 0)) {
} else if (amdgpu_runtime_pm != 0) {
bamaco_support = amdgpu_device_supports_baco(dev);

if (!bamaco_support)
goto no_runtime_pm;

switch (adev->asic_type) {
case CHIP_VEGA20:
case CHIP_ARCTURUS:
@@ -178,10 +183,20 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, unsigned long flags)
break;
}

if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO)
dev_info(adev->dev, "Using BACO for runtime pm\n");
if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
if (bamaco_support & MACO_SUPPORT) {
adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
dev_info(adev->dev, "Using BAMACO for runtime pm\n");
} else {
dev_info(adev->dev, "Using BACO for runtime pm\n");
}
}
}

no_runtime_pm:
if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
dev_info(adev->dev, "NO pm mode for runtime pm\n");

/* Call ACPI methods: require modeset init
* but failure is not fatal
*/
@@ -210,22 +210,26 @@ int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
return -EOPNOTSUPP;
}

static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry)
static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry,
struct ras_query_context *qctx)
{
dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events logged\n");
dev_info(adev->dev, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_STATUS]);
dev_info(adev->dev, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_ADDR]);
dev_info(adev->dev, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_MISC0]);
dev_info(adev->dev, HW_ERR "aca entry[%02d].IPID=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_IPID]);
dev_info(adev->dev, HW_ERR "aca entry[%02d].SYND=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_SYND]);
u64 event_id = qctx->event_id;

RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_STATUS]);
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_ADDR]);
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_MISC0]);
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].IPID=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_IPID]);
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].SYND=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_SYND]);
}

int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data)
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
struct ras_err_data *err_data, struct ras_query_context *qctx)
{
struct amdgpu_smuio_mcm_config_info mcm_info;
struct ras_err_addr err_addr = {0};
@@ -244,7 +248,7 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo
list_for_each_entry(node, &mca_set.list, node) {
entry = &node->entry;

amdgpu_mca_smu_mca_bank_dump(adev, i++, entry);
amdgpu_mca_smu_mca_bank_dump(adev, i++, entry, qctx);

count = 0;
ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count);
@@ -169,6 +169,7 @@ void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root
void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set);
int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_entry *entry);
void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set);
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data);
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
struct ras_err_data *err_data, struct ras_query_context *qctx);

#endif
@@ -40,7 +40,6 @@ int amdgpu_mes_doorbell_process_slice(struct amdgpu_device *adev)
}

static int amdgpu_mes_kernel_doorbell_get(struct amdgpu_device *adev,
struct amdgpu_mes_process *process,
int ip_type, uint64_t *doorbell_index)
{
unsigned int offset, found;
@@ -65,7 +64,6 @@ static int amdgpu_mes_kernel_doorbell_get(struct amdgpu_device *adev,
}

static void amdgpu_mes_kernel_doorbell_free(struct amdgpu_device *adev,
struct amdgpu_mes_process *process,
uint32_t doorbell_index)
{
unsigned int old, rel_index;
@@ -102,7 +100,10 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device *adev)
{
int r;

r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE,
if (!amdgpu_mes_log_enable)
return 0;

r = amdgpu_bo_create_kernel(adev, AMDGPU_MES_LOG_BUFFER_SIZE, PAGE_SIZE,
AMDGPU_GEM_DOMAIN_GTT,
&adev->mes.event_log_gpu_obj,
&adev->mes.event_log_gpu_addr,
@@ -653,7 +654,7 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id,
*queue_id = queue->queue_id = r;

/* allocate a doorbell index for the queue */
r = amdgpu_mes_kernel_doorbell_get(adev, gang->process,
r = amdgpu_mes_kernel_doorbell_get(adev,
qprops->queue_type,
&qprops->doorbell_off);
if (r)
@@ -711,8 +712,7 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id,
return 0;

clean_up_doorbell:
amdgpu_mes_kernel_doorbell_free(adev, gang->process,
qprops->doorbell_off);
amdgpu_mes_kernel_doorbell_free(adev, qprops->doorbell_off);
clean_up_queue_id:
spin_lock_irqsave(&adev->mes.queue_id_lock, flags);
idr_remove(&adev->mes.queue_id_idr, queue->queue_id);
@@ -766,8 +766,7 @@ int amdgpu_mes_remove_hw_queue(struct amdgpu_device *adev, int queue_id)
queue_id);

list_del(&queue->list);
amdgpu_mes_kernel_doorbell_free(adev, gang->process,
queue->doorbell_off);
amdgpu_mes_kernel_doorbell_free(adev, queue->doorbell_off);
amdgpu_mes_unlock(&adev->mes);

amdgpu_mes_queue_free_mqd(queue);
@@ -1471,7 +1470,7 @@ int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe)
const struct mes_firmware_header_v1_0 *mes_hdr;
struct amdgpu_firmware_info *info;
char ucode_prefix[30];
char fw_name[40];
char fw_name[50];
bool need_retry = false;
int r;

@@ -1549,12 +1548,11 @@ static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused)
uint32_t *mem = (uint32_t *)(adev->mes.event_log_cpu_addr);

seq_hex_dump(m, "", DUMP_PREFIX_OFFSET, 32, 4,
mem, PAGE_SIZE, false);
mem, AMDGPU_MES_LOG_BUFFER_SIZE, false);

return 0;
}

DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_mes_event_log);

#endif
@@ -1565,7 +1563,7 @@ void amdgpu_debugfs_mes_event_log_init(struct amdgpu_device *adev)
#if defined(CONFIG_DEBUG_FS)
struct drm_minor *minor = adev_to_drm(adev)->primary;
struct dentry *root = minor->debugfs_root;
if (adev->enable_mes)
if (adev->enable_mes && amdgpu_mes_log_enable)
debugfs_create_file("amdgpu_mes_event_log", 0444, root,
adev, &amdgpu_debugfs_mes_event_log_fops);
@@ -52,6 +52,7 @@ enum amdgpu_mes_priority_level {

#define AMDGPU_MES_PROC_CTX_SIZE 0x1000 /* one page area */
#define AMDGPU_MES_GANG_CTX_SIZE 0x1000 /* one page area */
#define AMDGPU_MES_LOG_BUFFER_SIZE 0x4000 /* Maximu log buffer size for MES */

struct amdgpu_mes_funcs;

@@ -140,6 +141,12 @@ struct amdgpu_mes {

/* ip specific functions */
const struct amdgpu_mes_funcs *funcs;

/* mes resource_1 bo*/
struct amdgpu_bo *resource_1;
uint64_t resource_1_gpu_addr;
void *resource_1_addr;

};

struct amdgpu_mes_process {
@@ -63,6 +63,8 @@ struct amdgpu_mmhub_funcs {
uint64_t page_table_base);
void (*update_power_gating)(struct amdgpu_device *adev,
bool enable);
bool (*query_utcl2_poison_status)(struct amdgpu_device *adev,
int hub_inst);
};

struct amdgpu_mmhub {
@@ -1053,6 +1053,11 @@ static int psp_asd_initialize(struct psp_context *psp)
if (amdgpu_sriov_vf(psp->adev) || !psp->asd_context.bin_desc.size_bytes)
return 0;

/* bypass asd if display hardware is not available */
if (!amdgpu_device_has_display_hardware(psp->adev) &&
amdgpu_ip_version(psp->adev, MP0_HWIP, 0) >= IP_VERSION(13, 0, 10))
return 0;

psp->asd_context.mem_context.shared_mc_addr = 0;
psp->asd_context.mem_context.shared_mem_size = PSP_ASD_SHARED_MEM_SIZE;
psp->asd_context.ta_load_type = GFX_CMD_ID_LOAD_ASD;
@@ -2617,7 +2622,8 @@ static int psp_load_p2s_table(struct psp_context *psp)
struct amdgpu_firmware_info *ucode =
&adev->firmware.ucode[AMDGPU_UCODE_ID_P2S_TABLE];

if (adev->in_runpm && (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO))
if (adev->in_runpm && ((adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) ||
(adev->pm.rpm_mode == AMDGPU_RUNPM_BAMACO)))
return 0;

if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) {
@@ -2647,7 +2653,8 @@ static int psp_load_smu_fw(struct psp_context *psp)
* Skip SMU FW reloading in case of using BACO for runpm only,
* as SMU is always alive.
*/
if (adev->in_runpm && (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO))
if (adev->in_runpm && ((adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) ||
(adev->pm.rpm_mode == AMDGPU_RUNPM_BAMACO)))
return 0;

if (!ucode->fw || amdgpu_sriov_vf(psp->adev))
@ -1045,6 +1045,7 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
|
||||
static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
|
||||
struct ras_manager *ras_mgr,
|
||||
struct ras_err_data *err_data,
|
||||
struct ras_query_context *qctx,
|
||||
const char *blk_name,
|
||||
bool is_ue,
|
||||
bool is_de)
|
||||
@ -1052,27 +1053,28 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
|
||||
struct amdgpu_smuio_mcm_config_info *mcm_info;
|
||||
struct ras_err_node *err_node;
|
||||
struct ras_err_info *err_info;
|
||||
u64 event_id = qctx->event_id;
|
||||
|
||||
if (is_ue) {
|
||||
for_each_ras_error(err_node, err_data) {
|
||||
err_info = &err_node->err_info;
|
||||
mcm_info = &err_info->mcm_info;
|
||||
if (err_info->ue_count) {
|
||||
dev_info(adev->dev, "socket: %d, die: %d, "
|
||||
"%lld new uncorrectable hardware errors detected in %s block\n",
|
||||
mcm_info->socket_id,
|
||||
mcm_info->die_id,
|
||||
err_info->ue_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
|
||||
"%lld new uncorrectable hardware errors detected in %s block\n",
|
||||
mcm_info->socket_id,
|
||||
mcm_info->die_id,
|
||||
err_info->ue_count,
|
||||
blk_name);
|
||||
}
|
||||
}
|
||||
|
||||
for_each_ras_error(err_node, &ras_mgr->err_data) {
|
||||
err_info = &err_node->err_info;
|
||||
mcm_info = &err_info->mcm_info;
|
||||
dev_info(adev->dev, "socket: %d, die: %d, "
|
||||
"%lld uncorrectable hardware errors detected in total in %s block\n",
|
||||
mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
|
||||
"%lld uncorrectable hardware errors detected in total in %s block\n",
|
||||
mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
|
||||
}
|
||||
|
||||
} else {
|
||||
@ -1081,44 +1083,44 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
|
||||
err_info = &err_node->err_info;
|
||||
mcm_info = &err_info->mcm_info;
|
||||
if (err_info->de_count) {
|
||||
dev_info(adev->dev, "socket: %d, die: %d, "
|
||||
"%lld new deferred hardware errors detected in %s block\n",
|
||||
mcm_info->socket_id,
|
||||
mcm_info->die_id,
|
||||
err_info->de_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
|
||||
"%lld new deferred hardware errors detected in %s block\n",
|
||||
mcm_info->socket_id,
|
||||
mcm_info->die_id,
|
||||
err_info->de_count,
|
||||
blk_name);
|
||||
}
|
||||
}
|
||||
|
||||
for_each_ras_error(err_node, &ras_mgr->err_data) {
|
||||
err_info = &err_node->err_info;
|
||||
mcm_info = &err_info->mcm_info;
|
||||
dev_info(adev->dev, "socket: %d, die: %d, "
|
||||
"%lld deferred hardware errors detected in total in %s block\n",
|
||||
mcm_info->socket_id, mcm_info->die_id,
|
||||
err_info->de_count, blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
|
||||
"%lld deferred hardware errors detected in total in %s block\n",
|
||||
mcm_info->socket_id, mcm_info->die_id,
|
||||
err_info->de_count, blk_name);
|
||||
}
|
||||
} else {
|
||||
for_each_ras_error(err_node, err_data) {
|
||||
err_info = &err_node->err_info;
|
||||
mcm_info = &err_info->mcm_info;
|
||||
if (err_info->ce_count) {
|
||||
dev_info(adev->dev, "socket: %d, die: %d, "
|
||||
"%lld new correctable hardware errors detected in %s block\n",
|
||||
mcm_info->socket_id,
|
||||
mcm_info->die_id,
|
||||
err_info->ce_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
|
||||
"%lld new correctable hardware errors detected in %s block\n",
|
||||
mcm_info->socket_id,
|
||||
mcm_info->die_id,
|
||||
err_info->ce_count,
|
||||
blk_name);
|
||||
}
|
||||
}
|
||||
|
||||
for_each_ras_error(err_node, &ras_mgr->err_data) {
|
||||
err_info = &err_node->err_info;
|
||||
mcm_info = &err_info->mcm_info;
|
||||
dev_info(adev->dev, "socket: %d, die: %d, "
|
||||
"%lld correctable hardware errors detected in total in %s block\n",
|
||||
mcm_info->socket_id, mcm_info->die_id,
|
||||
err_info->ce_count, blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
|
||||
"%lld correctable hardware errors detected in total in %s block\n",
|
||||
mcm_info->socket_id, mcm_info->die_id,
|
||||
err_info->ce_count, blk_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1131,77 +1133,79 @@ static inline bool err_data_has_source_info(struct ras_err_data *data)
|
||||
|
||||
static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
|
||||
struct ras_query_if *query_if,
|
||||
struct ras_err_data *err_data)
|
||||
struct ras_err_data *err_data,
|
||||
struct ras_query_context *qctx)
|
||||
{
|
||||
struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
|
||||
const char *blk_name = get_ras_block_str(&query_if->head);
|
||||
u64 event_id = qctx->event_id;
|
||||
|
||||
if (err_data->ce_count) {
|
||||
if (err_data_has_source_info(err_data)) {
|
||||
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
|
||||
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
|
||||
blk_name, false, false);
|
||||
} else if (!adev->aid_mask &&
|
||||
adev->smuio.funcs &&
|
||||
adev->smuio.funcs->get_socket_id &&
|
||||
adev->smuio.funcs->get_die_id) {
|
||||
dev_info(adev->dev, "socket: %d, die: %d "
|
||||
"%ld correctable hardware errors "
|
||||
"detected in %s block\n",
|
||||
adev->smuio.funcs->get_socket_id(adev),
|
||||
adev->smuio.funcs->get_die_id(adev),
|
||||
ras_mgr->err_data.ce_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
|
||||
"%ld correctable hardware errors "
|
||||
"detected in %s block\n",
|
||||
adev->smuio.funcs->get_socket_id(adev),
|
||||
adev->smuio.funcs->get_die_id(adev),
|
||||
ras_mgr->err_data.ce_count,
|
||||
blk_name);
|
||||
} else {
|
||||
dev_info(adev->dev, "%ld correctable hardware errors "
|
||||
"detected in %s block\n",
|
||||
ras_mgr->err_data.ce_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "%ld correctable hardware errors "
|
||||
"detected in %s block\n",
|
||||
ras_mgr->err_data.ce_count,
|
||||
blk_name);
|
||||
}
|
||||
}
|
||||
|
||||
if (err_data->ue_count) {
|
||||
if (err_data_has_source_info(err_data)) {
|
||||
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
|
||||
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
|
||||
blk_name, true, false);
|
||||
} else if (!adev->aid_mask &&
|
||||
adev->smuio.funcs &&
|
||||
adev->smuio.funcs->get_socket_id &&
|
||||
adev->smuio.funcs->get_die_id) {
|
||||
dev_info(adev->dev, "socket: %d, die: %d "
|
||||
"%ld uncorrectable hardware errors "
|
||||
"detected in %s block\n",
|
||||
adev->smuio.funcs->get_socket_id(adev),
|
||||
adev->smuio.funcs->get_die_id(adev),
|
||||
ras_mgr->err_data.ue_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
|
||||
"%ld uncorrectable hardware errors "
|
||||
"detected in %s block\n",
|
||||
adev->smuio.funcs->get_socket_id(adev),
|
||||
adev->smuio.funcs->get_die_id(adev),
|
||||
ras_mgr->err_data.ue_count,
|
||||
blk_name);
|
||||
} else {
|
||||
dev_info(adev->dev, "%ld uncorrectable hardware errors "
|
||||
"detected in %s block\n",
|
||||
ras_mgr->err_data.ue_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "%ld uncorrectable hardware errors "
|
||||
"detected in %s block\n",
|
||||
ras_mgr->err_data.ue_count,
|
||||
blk_name);
|
||||
}
|
||||
}
|
||||
|
||||
if (err_data->de_count) {
|
||||
if (err_data_has_source_info(err_data)) {
|
||||
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
|
||||
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
|
||||
blk_name, false, true);
|
||||
} else if (!adev->aid_mask &&
|
||||
adev->smuio.funcs &&
|
||||
adev->smuio.funcs->get_socket_id &&
|
||||
adev->smuio.funcs->get_die_id) {
|
||||
dev_info(adev->dev, "socket: %d, die: %d "
|
||||
"%ld deferred hardware errors "
|
||||
"detected in %s block\n",
|
||||
adev->smuio.funcs->get_socket_id(adev),
|
||||
adev->smuio.funcs->get_die_id(adev),
|
||||
ras_mgr->err_data.de_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
|
||||
"%ld deferred hardware errors "
|
||||
"detected in %s block\n",
|
||||
adev->smuio.funcs->get_socket_id(adev),
|
||||
adev->smuio.funcs->get_die_id(adev),
|
||||
ras_mgr->err_data.de_count,
|
||||
blk_name);
|
||||
} else {
|
||||
dev_info(adev->dev, "%ld deferred hardware errors "
|
||||
"detected in %s block\n",
|
||||
ras_mgr->err_data.de_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "%ld deferred hardware errors "
|
||||
"detected in %s block\n",
|
||||
ras_mgr->err_data.de_count,
|
||||
blk_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1265,7 +1269,8 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
|
||||
}
|
||||
|
||||
static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
|
||||
enum aca_error_type type, struct ras_err_data *err_data)
|
||||
enum aca_error_type type, struct ras_err_data *err_data,
|
||||
struct ras_query_context *qctx)
|
||||
{
|
||||
struct ras_manager *obj;
|
||||
|
||||
@ -1273,7 +1278,7 @@ static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu
|
||||
if (!obj)
|
||||
return -EINVAL;
|
||||
|
||||
return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data);
|
||||
return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data, qctx);
|
||||
}
|
||||
|
||||
ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
|
||||
@ -1287,13 +1292,14 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *a
|
||||
if (amdgpu_ras_query_error_status(obj->adev, &info))
|
||||
return -EINVAL;
|
||||
|
||||
return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
|
||||
"ce", info.ce_count);
|
||||
return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count,
|
||||
"ce", info.ce_count, "de", info.ue_count);
|
||||
}
|
||||
|
||||
static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
|
||||
struct ras_query_if *info,
|
||||
struct ras_err_data *err_data,
|
||||
struct ras_query_context *qctx,
|
||||
unsigned int error_query_mode)
|
||||
{
|
||||
enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
|
||||
@ -1329,17 +1335,21 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
|
||||
}
|
||||
} else {
|
||||
if (amdgpu_aca_is_enabled(adev)) {
|
||||
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data);
|
||||
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data, qctx);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data);
|
||||
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data, qctx);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_DEFERRED, err_data, qctx);
|
||||
if (ret)
|
||||
return ret;
|
||||
} else {
|
||||
/* FIXME: add code to check return value later */
|
||||
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
|
||||
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
|
||||
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data, qctx);
|
||||
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data, qctx);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1351,6 +1361,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
|
||||
{
|
||||
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
|
||||
struct ras_err_data err_data;
|
||||
struct ras_query_context qctx;
|
||||
unsigned int error_query_mode;
|
||||
int ret;
|
||||
|
||||
@ -1364,8 +1375,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
|
||||
if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
|
||||
return -EINVAL;
|
||||
|
||||
memset(&qctx, 0, sizeof(qctx));
|
||||
qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ?
|
||||
RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID);
|
||||
ret = amdgpu_ras_query_error_status_helper(adev, info,
|
||||
&err_data,
|
||||
&qctx,
|
||||
error_query_mode);
|
||||
if (ret)
|
||||
goto out_fini_err_data;
|
||||
@ -1376,7 +1391,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
|
||||
info->ce_count = obj->err_data.ce_count;
|
||||
info->de_count = obj->err_data.de_count;
|
||||
|
||||
amdgpu_ras_error_generate_report(adev, info, &err_data);
|
||||
amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx);
|
||||
|
||||
out_fini_err_data:
|
||||
amdgpu_ras_error_data_fini(&err_data);
|
||||
@ -2041,7 +2056,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
|
||||
}
|
||||
}
|
||||
|
||||
amdgpu_umc_poison_handler(adev, obj->head.block, false);
|
||||
amdgpu_umc_poison_handler(adev, obj->head.block, 0);
|
||||
|
||||
if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
|
||||
poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
|
||||
@ -2384,6 +2399,19 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,
|
||||
struct amdgpu_hive_info *hive, bool status)
|
||||
{
|
||||
struct amdgpu_device *tmp_adev;
|
||||
|
||||
if (hive) {
|
||||
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
|
||||
amdgpu_ras_set_fed(tmp_adev, status);
|
||||
} else {
|
||||
amdgpu_ras_set_fed(adev, status);
|
||||
}
|
||||
}
|
||||
|
||||
static void amdgpu_ras_do_recovery(struct work_struct *work)
|
||||
{
|
||||
struct amdgpu_ras *ras =
|
||||
@ -2393,8 +2421,21 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
|
||||
struct list_head device_list, *device_list_handle = NULL;
|
||||
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
|
||||
|
||||
if (hive)
|
||||
if (hive) {
|
||||
atomic_set(&hive->ras_recovery, 1);
|
||||
|
||||
/* If any device which is part of the hive received RAS fatal
|
||||
* error interrupt, set fatal error status on all. This
|
||||
* condition will need a recovery, and flag will be cleared
|
||||
* as part of recovery.
|
||||
*/
|
||||
list_for_each_entry(remote_adev, &hive->device_list,
|
||||
gmc.xgmi.head)
|
||||
if (amdgpu_ras_get_fed_status(remote_adev)) {
|
||||
amdgpu_ras_set_fed_all(adev, hive, true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!ras->disable_ras_err_cnt_harvest) {
|
||||
|
||||
/* Build list of devices to query RAS related errors */
|
||||
@ -2439,18 +2480,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
|
||||
ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
|
||||
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||
|
||||
/* For any RAS error that needs a full reset to
|
||||
* recover, set the fatal error status
|
||||
*/
|
||||
if (hive) {
|
||||
list_for_each_entry(remote_adev,
|
||||
&hive->device_list,
|
||||
gmc.xgmi.head)
|
||||
amdgpu_ras_set_fed(remote_adev,
|
||||
true);
|
||||
} else {
|
||||
amdgpu_ras_set_fed(adev, true);
|
||||
}
|
||||
psp_fatal_error_recovery_quirk(&adev->psp);
|
||||
}
|
||||
}
|
||||
@ -2694,7 +2723,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
|
||||
atomic_dec(&con->page_retirement_req_cnt);
|
||||
|
||||
amdgpu_umc_bad_page_polling_timeout(adev,
|
||||
false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
|
||||
0, MAX_UMC_POISON_POLLING_TIME_ASYNC);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -3036,6 +3065,35 @@ static int amdgpu_get_ras_schema(struct amdgpu_device *adev)
|
||||
AMDGPU_RAS_ERROR__PARITY;
|
||||
}
|
||||
|
||||
static void ras_event_mgr_init(struct ras_event_manager *mgr)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(mgr->seqnos); i++)
|
||||
atomic64_set(&mgr->seqnos[i], 0);
|
||||
}
|
||||
|
||||
static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev)
|
||||
{
|
||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||
struct amdgpu_hive_info *hive;
|
||||
|
||||
if (!ras)
|
||||
return;
|
||||
|
||||
hive = amdgpu_get_xgmi_hive(adev);
|
||||
ras->event_mgr = hive ? &hive->event_mgr : &ras->__event_mgr;
|
||||
|
||||
/* init event manager with node 0 on xgmi system */
|
||||
if (!amdgpu_in_reset(adev)) {
|
||||
if (!hive || adev->gmc.xgmi.node_id == 0)
|
||||
ras_event_mgr_init(ras->event_mgr);
|
||||
}
|
||||
|
||||
if (hive)
|
||||
amdgpu_put_xgmi_hive(hive);
|
||||
}
|
||||
|
||||
int amdgpu_ras_init(struct amdgpu_device *adev)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
@ -3356,6 +3414,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
|
||||
if (amdgpu_sriov_vf(adev))
|
||||
return 0;
|
||||
|
||||
amdgpu_ras_event_mgr_init(adev);
|
||||
|
||||
if (amdgpu_aca_is_enabled(adev)) {
|
||||
if (amdgpu_in_reset(adev))
|
||||
r = amdgpu_aca_reset(adev);
|
||||
@ -3472,14 +3532,39 @@ void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
|
||||
atomic_set(&ras->fed, !!status);
|
||||
}
|
||||
|
||||
bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id)
|
||||
{
|
||||
return !(id & BIT_ULL(63));
|
||||
}
|
||||
|
||||
u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type)
|
||||
{
|
||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||
u64 id;
|
||||
|
||||
switch (type) {
|
||||
case RAS_EVENT_TYPE_ISR:
|
||||
id = (u64)atomic64_read(&ras->event_mgr->seqnos[type]);
|
||||
break;
|
||||
case RAS_EVENT_TYPE_INVALID:
|
||||
default:
|
||||
id = BIT_ULL(63) | 0ULL;
|
||||
break;
|
||||
}
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
|
||||
{
|
||||
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
|
||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||
u64 event_id = (u64)atomic64_inc_return(&ras->event_mgr->seqnos[RAS_EVENT_TYPE_ISR]);
|
||||
|
||||
dev_info(adev->dev, "uncorrectable hardware error"
|
||||
"(ERREVENT_ATHUB_INTERRUPT) detected!\n");
|
||||
RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
|
||||
"(ERREVENT_ATHUB_INTERRUPT) detected!\n");
|
||||
|
||||
amdgpu_ras_set_fed(adev, true);
|
||||
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
|
||||
amdgpu_ras_reset_gpu(adev);
|
||||
}
|
||||
|
@ -64,6 +64,14 @@ struct amdgpu_iv_entry;
|
||||
/* The high three bits indicates socketid */
|
||||
#define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
|
||||
|
||||
#define RAS_EVENT_LOG(_adev, _id, _fmt, ...) \
|
||||
do { \
|
||||
if (amdgpu_ras_event_id_is_valid((_adev), (_id))) \
|
||||
dev_info((_adev)->dev, "{%llu}" _fmt, (_id), ##__VA_ARGS__); \
|
||||
else \
|
||||
dev_info((_adev)->dev, _fmt, ##__VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
enum amdgpu_ras_block {
|
||||
AMDGPU_RAS_BLOCK__UMC = 0,
|
||||
AMDGPU_RAS_BLOCK__SDMA,
|
||||
@ -419,6 +427,21 @@ struct umc_ecc_info {
|
||||
int record_ce_addr_supported;
|
||||
};
|
||||
|
||||
enum ras_event_type {
|
||||
RAS_EVENT_TYPE_INVALID = -1,
|
||||
RAS_EVENT_TYPE_ISR = 0,
|
||||
RAS_EVENT_TYPE_COUNT,
|
||||
};
|
||||
|
||||
struct ras_event_manager {
|
||||
atomic64_t seqnos[RAS_EVENT_TYPE_COUNT];
|
||||
};
|
||||
|
||||
struct ras_query_context {
|
||||
enum ras_event_type type;
|
||||
u64 event_id;
|
||||
};
|
||||
|
||||
struct amdgpu_ras {
|
||||
/* ras infrastructure */
|
||||
/* for ras itself. */
|
||||
@ -479,6 +502,11 @@ struct amdgpu_ras {
|
||||
atomic_t page_retirement_req_cnt;
|
||||
/* Fatal error detected flag */
|
||||
atomic_t fed;
|
||||
|
||||
/* RAS event manager */
|
||||
struct ras_event_manager __event_mgr;
|
||||
struct ras_event_manager *event_mgr;
|
||||
|
||||
};
|
||||
|
||||
struct ras_fs_data {
|
||||
@ -879,4 +907,6 @@ void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
|
||||
void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
|
||||
bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
|
||||
|
||||
bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id);
|
||||
u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type);
|
||||
#endif
|
||||
|
@ -404,6 +404,22 @@ static int amdgpu_ras_eeprom_correct_header_tag(
|
||||
return res;
|
||||
}
|
||||
|
||||
static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control *control)
|
||||
{
|
||||
struct amdgpu_device *adev = to_amdgpu_device(control);
|
||||
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
|
||||
|
||||
switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
|
||||
case IP_VERSION(8, 10, 0):
|
||||
case IP_VERSION(12, 0, 0):
|
||||
hdr->version = RAS_TABLE_VER_V2_1;
|
||||
return;
|
||||
default:
|
||||
hdr->version = RAS_TABLE_VER_V1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_ras_eeprom_reset_table -- Reset the RAS EEPROM table
|
||||
* @control: pointer to control structure
|
||||
@ -423,11 +439,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
|
||||
mutex_lock(&control->ras_tbl_mutex);
|
||||
|
||||
hdr->header = RAS_TABLE_HDR_VAL;
|
||||
if (adev->umc.ras &&
|
||||
adev->umc.ras->set_eeprom_table_version)
|
||||
adev->umc.ras->set_eeprom_table_version(hdr);
|
||||
else
|
||||
hdr->version = RAS_TABLE_VER_V1;
|
||||
amdgpu_ras_set_eeprom_table_version(control);
|
||||
|
||||
if (hdr->version == RAS_TABLE_VER_V2_1) {
|
||||
hdr->first_rec_offset = RAS_RECORD_START_V2_1;
|
||||
|
@ -21,9 +21,6 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/devcoredump.h>
|
||||
#include <generated/utsrelease.h>
|
||||
|
||||
#include "amdgpu_reset.h"
|
||||
#include "aldebaran.h"
|
||||
#include "sienna_cichlid.h"
|
||||
@ -161,105 +158,3 @@ void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)
|
||||
atomic_set(&reset_domain->in_gpu_reset, 0);
|
||||
up_write(&reset_domain->sem);
|
||||
}
|
||||
|
||||
#ifndef CONFIG_DEV_COREDUMP
|
||||
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
|
||||
struct amdgpu_reset_context *reset_context)
|
||||
{
|
||||
}
|
||||
#else
|
||||
static ssize_t
|
||||
amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
|
||||
void *data, size_t datalen)
|
||||
{
|
||||
struct drm_printer p;
|
||||
struct amdgpu_coredump_info *coredump = data;
|
||||
struct drm_print_iterator iter;
|
||||
int i;
|
||||
|
||||
iter.data = buffer;
|
||||
iter.offset = 0;
|
||||
iter.start = offset;
|
||||
iter.remain = count;
|
||||
|
||||
p = drm_coredump_printer(&iter);
|
||||
|
||||
drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
|
||||
drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n");
|
||||
drm_printf(&p, "kernel: " UTS_RELEASE "\n");
|
||||
drm_printf(&p, "module: " KBUILD_MODNAME "\n");
|
||||
drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec,
|
||||
coredump->reset_time.tv_nsec);
|
||||
|
||||
if (coredump->reset_task_info.pid)
|
||||
drm_printf(&p, "process_name: %s PID: %d\n",
|
||||
coredump->reset_task_info.process_name,
|
||||
coredump->reset_task_info.pid);
|
||||
|
||||
if (coredump->ring) {
|
||||
drm_printf(&p, "\nRing timed out details\n");
|
||||
drm_printf(&p, "IP Type: %d Ring Name: %s\n",
|
||||
coredump->ring->funcs->type,
|
||||
coredump->ring->name);
|
||||
}
|
||||
|
||||
if (coredump->reset_vram_lost)
|
||||
drm_printf(&p, "VRAM is lost due to GPU reset!\n");
|
||||
if (coredump->adev->reset_info.num_regs) {
|
||||
drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
|
||||
|
||||
for (i = 0; i < coredump->adev->reset_info.num_regs; i++)
|
||||
drm_printf(&p, "0x%08x: 0x%08x\n",
|
||||
coredump->adev->reset_info.reset_dump_reg_list[i],
|
||||
coredump->adev->reset_info.reset_dump_reg_value[i]);
|
||||
}
|
||||
|
||||
return count - iter.remain;
|
||||
}
|
||||
|
||||
static void amdgpu_devcoredump_free(void *data)
|
||||
{
|
||||
kfree(data);
|
||||
}
|
||||
|
||||
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
|
||||
struct amdgpu_reset_context *reset_context)
|
||||
{
|
||||
struct amdgpu_coredump_info *coredump;
|
||||
struct drm_device *dev = adev_to_drm(adev);
|
||||
struct amdgpu_job *job = reset_context->job;
|
||||
struct drm_sched_job *s_job;
|
||||
|
||||
coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
|
||||
|
||||
if (!coredump) {
|
||||
DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
coredump->reset_vram_lost = vram_lost;
|
||||
|
||||
if (reset_context->job && reset_context->job->vm) {
|
||||
struct amdgpu_task_info *ti;
|
||||
struct amdgpu_vm *vm = reset_context->job->vm;
|
||||
|
||||
ti = amdgpu_vm_get_task_info_vm(vm);
|
||||
if (ti) {
|
||||
coredump->reset_task_info = *ti;
|
||||
amdgpu_vm_put_task_info(ti);
|
||||
}
|
||||
}
|
||||
|
||||
if (job) {
|
||||
s_job = &job->base;
|
||||
coredump->ring = to_amdgpu_ring(s_job->sched);
|
||||
}
|
||||
|
||||
coredump->adev = adev;
|
||||
|
||||
ktime_get_ts64(&coredump->reset_time);
|
||||
|
||||
dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
|
||||
amdgpu_devcoredump_read, amdgpu_devcoredump_free);
|
||||
}
|
||||
#endif
|
||||
|
@ -88,19 +88,6 @@ struct amdgpu_reset_domain {
|
||||
atomic_t reset_res;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_DEV_COREDUMP
|
||||
|
||||
#define AMDGPU_COREDUMP_VERSION "1"
|
||||
|
||||
struct amdgpu_coredump_info {
|
||||
struct amdgpu_device *adev;
|
||||
struct amdgpu_task_info reset_task_info;
|
||||
struct timespec64 reset_time;
|
||||
bool reset_vram_lost;
|
||||
struct amdgpu_ring *ring;
|
||||
};
|
||||
#endif
|
||||
|
||||
int amdgpu_reset_init(struct amdgpu_device *adev);
|
||||
int amdgpu_reset_fini(struct amdgpu_device *adev);
|
||||
|
||||
@ -141,9 +128,6 @@ void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
|
||||
|
||||
void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);
|
||||
|
||||
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
|
||||
struct amdgpu_reset_context *reset_context);
|
||||
|
||||
#define for_each_handler(i, handler, reset_ctl) \
|
||||
for (i = 0; (i < AMDGPU_RESET_MAX_HANDLERS) && \
|
||||
(handler = (*reset_ctl->reset_handlers)[i]); \
|
||||
|
@@ -44,6 +44,7 @@ struct amdgpu_smuio_funcs {
u32 (*get_socket_id)(struct amdgpu_device *adev);
enum amdgpu_pkg_type (*get_pkg_type)(struct amdgpu_device *adev);
bool (*is_host_gpu_xgmi_supported)(struct amdgpu_device *adev);
u64 (*get_gpu_clock_counter)(struct amdgpu_device *adev);
};

struct amdgpu_smuio {
@@ -557,7 +557,6 @@ static int amdgpu_ttm_io_mem_reserve(struct ttm_device *bdev,
struct ttm_resource *mem)
{
struct amdgpu_device *adev = amdgpu_ttm_adev(bdev);
size_t bus_size = (size_t)mem->size;

switch (mem->mem_type) {
case TTM_PL_SYSTEM:
@@ -568,9 +567,6 @@ static int amdgpu_ttm_io_mem_reserve(struct ttm_device *bdev,
break;
case TTM_PL_VRAM:
mem->bus.offset = mem->start << PAGE_SHIFT;
/* check if it's visible */
if ((mem->bus.offset + bus_size) > adev->gmc.visible_vram_size)
return -EINVAL;

if (adev->mman.aper_base_kaddr &&
mem->placement & TTM_PL_FLAG_CONTIGUOUS)
@ -177,7 +177,7 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
|
||||
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
|
||||
void *ras_error_status,
|
||||
struct amdgpu_iv_entry *entry,
|
||||
bool reset)
|
||||
uint32_t reset)
|
||||
{
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
@ -186,9 +186,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
|
||||
amdgpu_umc_handle_bad_pages(adev, ras_error_status);
|
||||
|
||||
if (err_data->ue_count && reset) {
|
||||
/* use mode-2 reset for poison consumption */
|
||||
if (!entry)
|
||||
con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
con->gpu_reset_flags |= reset;
|
||||
amdgpu_ras_reset_gpu(adev);
|
||||
}
|
||||
|
||||
@ -196,7 +194,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
|
||||
}
|
||||
|
||||
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
|
||||
bool reset, uint32_t timeout_ms)
|
||||
uint32_t reset, uint32_t timeout_ms)
|
||||
{
|
||||
struct ras_err_data err_data;
|
||||
struct ras_common_if head = {
|
||||
@ -238,8 +236,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
|
||||
if (reset) {
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
|
||||
/* use mode-2 reset for poison consumption */
|
||||
con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
con->gpu_reset_flags |= reset;
|
||||
amdgpu_ras_reset_gpu(adev);
|
||||
}
|
||||
|
||||
@ -247,7 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
|
||||
}
|
||||
|
||||
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, bool reset)
|
||||
enum amdgpu_ras_block block, uint32_t reset)
|
||||
{
|
||||
int ret = AMDGPU_RAS_SUCCESS;
|
||||
|
||||
@ -311,7 +308,8 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
|
||||
void *ras_error_status,
|
||||
struct amdgpu_iv_entry *entry)
|
||||
{
|
||||
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
|
||||
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry,
|
||||
AMDGPU_RAS_GPU_RESET_MODE1_RESET);
|
||||
}
|
||||
|
||||
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
|
||||
|
@ -66,8 +66,6 @@ struct amdgpu_umc_ras {
|
||||
void *ras_error_status);
|
||||
bool (*check_ecc_err_status)(struct amdgpu_device *adev,
|
||||
enum amdgpu_mca_error_type type, void *ras_error_status);
|
||||
/* support different eeprom table version for different asic */
|
||||
void (*set_eeprom_table_version)(struct amdgpu_ras_eeprom_table_header *hdr);
|
||||
};
|
||||
|
||||
struct amdgpu_umc_funcs {
|
||||
@ -103,7 +101,7 @@ struct amdgpu_umc {
|
||||
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);
|
||||
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
|
||||
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, bool reset);
|
||||
enum amdgpu_ras_block block, uint32_t reset);
|
||||
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
|
||||
struct amdgpu_irq_src *source,
|
||||
struct amdgpu_iv_entry *entry);
|
||||
@ -123,5 +121,5 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
|
||||
umc_func func, void *data);
|
||||
|
||||
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
|
||||
bool reset, uint32_t timeout_ms);
|
||||
uint32_t reset, uint32_t timeout_ms);
|
||||
#endif
|
||||
|
@@ -93,7 +93,7 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work);

int amdgpu_vcn_early_init(struct amdgpu_device *adev)
{
char ucode_prefix[30];
char ucode_prefix[25];
char fw_name[40];
int r, i;
@ -32,6 +32,7 @@
|
||||
|
||||
#include "amdgpu.h"
|
||||
#include "amdgpu_ras.h"
|
||||
#include "amdgpu_reset.h"
|
||||
#include "vi.h"
|
||||
#include "soc15.h"
|
||||
#include "nv.h"
|
||||
@ -424,7 +425,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
|
||||
return -EINVAL;
|
||||
|
||||
if (pf2vf_info->size > 1024) {
|
||||
DRM_ERROR("invalid pf2vf message size\n");
|
||||
dev_err(adev->dev, "invalid pf2vf message size: 0x%x\n", pf2vf_info->size);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@ -435,7 +436,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
|
||||
adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
|
||||
adev->virt.fw_reserve.checksum_key, checksum);
|
||||
if (checksum != checkval) {
|
||||
DRM_ERROR("invalid pf2vf message\n");
|
||||
dev_err(adev->dev,
|
||||
"invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n",
|
||||
checksum, checkval);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@ -449,7 +452,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
|
||||
adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
|
||||
0, checksum);
|
||||
if (checksum != checkval) {
|
||||
DRM_ERROR("invalid pf2vf message\n");
|
||||
dev_err(adev->dev,
|
||||
"invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n",
|
||||
checksum, checkval);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@ -485,7 +490,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
|
||||
((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid;
|
||||
break;
|
||||
default:
|
||||
DRM_ERROR("invalid pf2vf version\n");
|
||||
dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", pf2vf_info->version);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@ -571,6 +576,11 @@ static int amdgpu_virt_write_vf2pf_data(struct amdgpu_device *adev)
|
||||
vf2pf_info->decode_usage = 0;
|
||||
|
||||
vf2pf_info->dummy_page_addr = (uint64_t)adev->dummy_page_addr;
|
||||
vf2pf_info->mes_info_addr = (uint64_t)adev->mes.resource_1_gpu_addr;
|
||||
|
||||
if (adev->mes.resource_1) {
|
||||
vf2pf_info->mes_info_size = adev->mes.resource_1->tbo.base.size;
|
||||
}
|
||||
vf2pf_info->checksum =
|
||||
amd_sriov_msg_checksum(
|
||||
vf2pf_info, vf2pf_info->header.size, 0, 0);
|
||||
@ -584,8 +594,22 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
|
||||
int ret;
|
||||
|
||||
ret = amdgpu_virt_read_pf2vf_data(adev);
|
||||
if (ret)
|
||||
if (ret) {
|
||||
adev->virt.vf2pf_update_retry_cnt++;
|
||||
if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
|
||||
amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) {
|
||||
amdgpu_ras_set_fed(adev, true);
|
||||
if (amdgpu_reset_domain_schedule(adev->reset_domain,
|
||||
&adev->virt.flr_work))
|
||||
return;
|
||||
else
|
||||
dev_err(adev->dev, "Failed to queue work! at %s", __func__);
|
||||
}
|
||||
|
||||
goto out;
|
||||
}
|
||||
|
||||
adev->virt.vf2pf_update_retry_cnt = 0;
|
||||
amdgpu_virt_write_vf2pf_data(adev);
|
||||
|
||||
out:
|
||||
@ -606,6 +630,7 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev)
|
||||
adev->virt.fw_reserve.p_pf2vf = NULL;
|
||||
adev->virt.fw_reserve.p_vf2pf = NULL;
|
||||
adev->virt.vf2pf_update_interval_ms = 0;
|
||||
adev->virt.vf2pf_update_retry_cnt = 0;
|
||||
|
||||
if (adev->mman.fw_vram_usage_va && adev->mman.drv_vram_usage_va) {
|
||||
DRM_WARN("Currently fw_vram and drv_vram should not have values at the same time!");
|
||||
@ -705,12 +730,6 @@ void amdgpu_detect_virtualization(struct amdgpu_device *adev)
|
||||
adev->virt.caps |= AMDGPU_PASSTHROUGH_MODE;
|
||||
}
|
||||
|
||||
if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
|
||||
/* VF MMIO access (except mailbox range) from CPU
|
||||
* will be blocked during sriov runtime
|
||||
*/
|
||||
adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
|
||||
|
||||
/* we have the ability to check now */
|
||||
if (amdgpu_sriov_vf(adev)) {
|
||||
switch (adev->asic_type) {
|
||||
|
@ -52,6 +52,8 @@
|
||||
/* tonga/fiji use this offset */
|
||||
#define mmBIF_IOV_FUNC_IDENTIFIER 0x1503
|
||||
|
||||
#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 5
|
||||
|
||||
enum amdgpu_sriov_vf_mode {
|
||||
SRIOV_VF_MODE_BARE_METAL = 0,
|
||||
SRIOV_VF_MODE_ONE_VF,
|
||||
@ -130,6 +132,8 @@ enum AMDGIM_FEATURE_FLAG {
|
||||
AMDGIM_FEATURE_AV1_SUPPORT = (1 << 6),
|
||||
/* VCN RB decouple */
|
||||
AMDGIM_FEATURE_VCN_RB_DECOUPLE = (1 << 7),
|
||||
/* MES info */
|
||||
AMDGIM_FEATURE_MES_INFO_ENABLE = (1 << 8),
|
||||
};
|
||||
|
||||
enum AMDGIM_REG_ACCESS_FLAG {
|
||||
@ -257,6 +261,7 @@ struct amdgpu_virt {
|
||||
/* vf2pf message */
|
||||
struct delayed_work vf2pf_work;
|
||||
uint32_t vf2pf_update_interval_ms;
|
||||
int vf2pf_update_retry_cnt;
|
||||
|
||||
/* multimedia bandwidth config */
|
||||
bool is_mm_bw_enabled;
|
||||
@ -332,6 +337,8 @@ static inline bool is_virtual_machine(void)
|
||||
((adev)->virt.gim_feature & AMDGIM_FEATURE_AV1_SUPPORT)
|
||||
#define amdgpu_sriov_is_vcn_rb_decouple(adev) \
|
||||
((adev)->virt.gim_feature & AMDGIM_FEATURE_VCN_RB_DECOUPLE)
|
||||
#define amdgpu_sriov_is_mes_info_enable(adev) \
|
||||
((adev)->virt.gim_feature & AMDGIM_FEATURE_MES_INFO_ENABLE)
|
||||
bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev);
|
||||
void amdgpu_virt_init_setting(struct amdgpu_device *adev);
|
||||
int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
|
||||
|
@ -885,6 +885,44 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
|
||||
kfree(tlb_cb);
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_vm_tlb_flush - prepare TLB flush
|
||||
*
|
||||
* @params: parameters for update
|
||||
* @fence: input fence to sync TLB flush with
|
||||
* @tlb_cb: the callback structure
|
||||
*
|
||||
* Increments the tlb sequence to make sure that future CS execute a VM flush.
|
||||
*/
|
||||
static void
|
||||
amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params,
|
||||
struct dma_fence **fence,
|
||||
struct amdgpu_vm_tlb_seq_struct *tlb_cb)
|
||||
{
|
||||
struct amdgpu_vm *vm = params->vm;
|
||||
|
||||
if (!fence || !*fence)
|
||||
return;
|
||||
|
||||
tlb_cb->vm = vm;
|
||||
if (!dma_fence_add_callback(*fence, &tlb_cb->cb,
|
||||
amdgpu_vm_tlb_seq_cb)) {
|
||||
dma_fence_put(vm->last_tlb_flush);
|
||||
vm->last_tlb_flush = dma_fence_get(*fence);
|
||||
} else {
|
||||
amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
|
||||
}
|
||||
|
||||
/* Prepare a TLB flush fence to be attached to PTs */
|
||||
if (!params->unlocked && vm->is_compute_context) {
|
||||
amdgpu_vm_tlb_fence_create(params->adev, vm, fence);
|
||||
|
||||
/* Makes sure no PD/PT is freed before the flush */
|
||||
dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
|
||||
DMA_RESV_USAGE_BOOKKEEP);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_vm_update_range - update a range in the vm page table
|
||||
*
|
||||
@ -916,8 +954,8 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
|
||||
struct ttm_resource *res, dma_addr_t *pages_addr,
|
||||
struct dma_fence **fence)
|
||||
{
|
||||
struct amdgpu_vm_update_params params;
|
||||
struct amdgpu_vm_tlb_seq_struct *tlb_cb;
|
||||
struct amdgpu_vm_update_params params;
|
||||
struct amdgpu_res_cursor cursor;
|
||||
enum amdgpu_sync_mode sync_mode;
|
||||
int r, idx;
|
||||
@ -927,8 +965,8 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
|
||||
|
||||
tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);
|
||||
if (!tlb_cb) {
|
||||
r = -ENOMEM;
|
||||
goto error_unlock;
|
||||
drm_dev_exit(idx);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/* Vega20+XGMI where PTEs get inadvertently cached in L2 texture cache,
|
||||
@ -948,7 +986,9 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
|
||||
params.immediate = immediate;
|
||||
params.pages_addr = pages_addr;
|
||||
params.unlocked = unlocked;
|
||||
params.needs_flush = flush_tlb;
|
||||
params.allow_override = allow_override;
|
||||
INIT_LIST_HEAD(¶ms.tlb_flush_waitlist);
|
||||
|
||||
/* Implicitly sync to command submissions in the same VM before
|
||||
* unmapping. Sync to moving fences before mapping.
|
||||
@ -1031,24 +1071,18 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
|
||||
}
|
||||
|
||||
r = vm->update_funcs->commit(¶ms, fence);
|
||||
if (r)
|
||||
goto error_free;
|
||||
|
||||
if (flush_tlb || params.table_freed) {
|
||||
tlb_cb->vm = vm;
|
||||
if (fence && *fence &&
|
||||
!dma_fence_add_callback(*fence, &tlb_cb->cb,
|
||||
amdgpu_vm_tlb_seq_cb)) {
|
||||
dma_fence_put(vm->last_tlb_flush);
|
||||
vm->last_tlb_flush = dma_fence_get(*fence);
|
||||
} else {
|
||||
amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
|
||||
}
|
||||
if (params.needs_flush) {
|
||||
amdgpu_vm_tlb_flush(¶ms, fence, tlb_cb);
|
||||
tlb_cb = NULL;
|
||||
}
|
||||
|
||||
amdgpu_vm_pt_free_list(adev, ¶ms);
|
||||
|
||||
error_free:
|
||||
kfree(tlb_cb);
|
||||
|
||||
error_unlock:
|
||||
amdgpu_vm_eviction_unlock(vm);
|
||||
drm_dev_exit(idx);
|
||||
return r;
|
||||
@ -2391,6 +2425,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
|
||||
|
||||
mutex_init(&vm->eviction_lock);
|
||||
vm->evicting = false;
|
||||
vm->tlb_fence_context = dma_fence_context_alloc(1);
|
||||
|
||||
r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,
|
||||
false, &root, xcp_id);
|
||||
@ -2924,6 +2959,14 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
|
||||
if (vm && status) {
|
||||
vm->fault_info.addr = addr;
|
||||
vm->fault_info.status = status;
|
||||
/*
|
||||
* Update the fault information globally for later usage
|
||||
* when vm could be stale or freed.
|
||||
*/
|
||||
adev->vm_manager.fault_info.addr = addr;
|
||||
adev->vm_manager.fault_info.vmhub = vmhub;
|
||||
adev->vm_manager.fault_info.status = status;
|
||||
|
||||
if (AMDGPU_IS_GFXHUB(vmhub)) {
|
||||
vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_GFX;
|
||||
vm->fault_info.vmhub |=
|
||||
|
@ -257,15 +257,20 @@ struct amdgpu_vm_update_params {
|
||||
unsigned int num_dw_left;
|
||||
|
||||
/**
|
||||
* @table_freed: return true if page table is freed when updating
|
||||
* @needs_flush: true whenever we need to invalidate the TLB
|
||||
*/
|
||||
bool table_freed;
|
||||
bool needs_flush;
|
||||
|
||||
/**
|
||||
* @allow_override: true for memory that is not uncached: allows MTYPE
|
||||
* to be overridden for NUMA local memory.
|
||||
*/
|
||||
bool allow_override;
|
||||
|
||||
/**
|
||||
* @tlb_flush_waitlist: temporary storage for BOs until tlb_flush
|
||||
*/
|
||||
struct list_head tlb_flush_waitlist;
|
||||
};
|
||||
|
||||
struct amdgpu_vm_update_funcs {
|
||||
@ -342,6 +347,7 @@ struct amdgpu_vm {
|
||||
atomic64_t tlb_seq;
|
||||
struct dma_fence *last_tlb_flush;
|
||||
atomic64_t kfd_last_flushed_seq;
|
||||
uint64_t tlb_fence_context;
|
||||
|
||||
/* How many times we had to re-generate the page tables */
|
||||
uint64_t generation;
|
||||
@ -422,6 +428,8 @@ struct amdgpu_vm_manager {
|
||||
* look up VM of a page fault
|
||||
*/
|
||||
struct xarray pasids;
|
||||
/* Global registration of recent page fault information */
|
||||
struct amdgpu_vm_fault_info fault_info;
|
||||
};
|
||||
|
||||
struct amdgpu_bo_va_mapping;
|
||||
@ -544,6 +552,8 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
|
||||
uint64_t start, uint64_t end,
|
||||
uint64_t dst, uint64_t flags);
|
||||
void amdgpu_vm_pt_free_work(struct work_struct *work);
|
||||
void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
|
||||
struct amdgpu_vm_update_params *params);
|
||||
|
||||
#if defined(CONFIG_DEBUG_FS)
|
||||
void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m);
|
||||
@ -609,5 +619,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
|
||||
uint64_t addr,
|
||||
uint32_t status,
|
||||
unsigned int vmhub);
|
||||
void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev,
|
||||
struct amdgpu_vm *vm,
|
||||
struct dma_fence **fence);
|
||||
|
||||
#endif
|
||||
|
@@ -108,7 +108,9 @@ static int amdgpu_vm_cpu_update(struct amdgpu_vm_update_params *p,
static int amdgpu_vm_cpu_commit(struct amdgpu_vm_update_params *p,
struct dma_fence **fence)
{
/* Flush HDP */
if (p->needs_flush)
atomic64_inc(&p->vm->tlb_seq);

mb();
amdgpu_device_flush_hdp(p->adev, NULL);
return 0;
@ -622,40 +622,58 @@ void amdgpu_vm_pt_free_work(struct work_struct *work)
}

/**
* amdgpu_vm_pt_free_dfs - free PD/PT levels
* amdgpu_vm_pt_free_list - free PD/PT levels
*
* @adev: amdgpu device structure
* @vm: amdgpu vm structure
* @start: optional cursor where to start freeing PDs/PTs
* @unlocked: vm resv unlock status
* @params: see amdgpu_vm_update_params definition
*
* Free the page directory or page table level and all sub levels.
* Free the page directory objects saved in the flush list
*/
static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
struct amdgpu_vm *vm,
struct amdgpu_vm_pt_cursor *start,
bool unlocked)
void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
struct amdgpu_vm_update_params *params)
{
struct amdgpu_vm_pt_cursor cursor;
struct amdgpu_vm_bo_base *entry;
struct amdgpu_vm_bo_base *entry, *next;
struct amdgpu_vm *vm = params->vm;
bool unlocked = params->unlocked;

if (list_empty(&params->tlb_flush_waitlist))
return;

if (unlocked) {
spin_lock(&vm->status_lock);
for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
list_move(&entry->vm_status, &vm->pt_freed);

if (start)
list_move(&start->entry->vm_status, &vm->pt_freed);
list_splice_init(&params->tlb_flush_waitlist, &vm->pt_freed);
spin_unlock(&vm->status_lock);
schedule_work(&vm->pt_free_work);
return;
}

for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
list_for_each_entry_safe(entry, next, &params->tlb_flush_waitlist, vm_status)
amdgpu_vm_pt_free(entry);
}

if (start)
amdgpu_vm_pt_free(start->entry);
/**
* amdgpu_vm_pt_add_list - add PD/PT level to the flush list
*
* @params: parameters for the update
* @cursor: first PT entry to start DF search from, non NULL
*
* This list will be freed after TLB flush.
*/
static void amdgpu_vm_pt_add_list(struct amdgpu_vm_update_params *params,
struct amdgpu_vm_pt_cursor *cursor)
{
struct amdgpu_vm_pt_cursor seek;
struct amdgpu_vm_bo_base *entry;

spin_lock(&params->vm->status_lock);
for_each_amdgpu_vm_pt_dfs_safe(params->adev, params->vm, cursor, seek, entry) {
if (entry && entry->bo)
list_move(&entry->vm_status, &params->tlb_flush_waitlist);
}

/* enter start node now */
list_move(&cursor->entry->vm_status, &params->tlb_flush_waitlist);
spin_unlock(&params->vm->status_lock);
}

/**
@ -667,7 +685,13 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
*/
void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct amdgpu_vm *vm)
{
amdgpu_vm_pt_free_dfs(adev, vm, NULL, false);
struct amdgpu_vm_pt_cursor cursor;
struct amdgpu_vm_bo_base *entry;

for_each_amdgpu_vm_pt_dfs_safe(adev, vm, NULL, cursor, entry) {
if (entry)
amdgpu_vm_pt_free(entry);
}
}

/**
@ -972,10 +996,8 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
while (cursor.pfn < frag_start) {
/* Make sure previous mapping is freed */
if (cursor.entry->bo) {
params->table_freed = true;
amdgpu_vm_pt_free_dfs(adev, params->vm,
&cursor,
params->unlocked);
params->needs_flush = true;
amdgpu_vm_pt_add_list(params, &cursor);
}
amdgpu_vm_pt_next(adev, &cursor);
}

@ -126,6 +126,10 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,

WARN_ON(ib->length_dw == 0);
amdgpu_ring_pad_ib(ring, ib);

if (p->needs_flush)
atomic64_inc(&p->vm->tlb_seq);

WARN_ON(ib->length_dw > p->num_dw_left);
f = amdgpu_job_submit(p->job);
112
drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
Normal file
@ -0,0 +1,112 @@
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
* Copyright 2023 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/

#include <linux/dma-fence.h>
#include <linux/workqueue.h>

#include "amdgpu.h"
#include "amdgpu_vm.h"
#include "amdgpu_gmc.h"

struct amdgpu_tlb_fence {
struct dma_fence base;
struct amdgpu_device *adev;
struct dma_fence *dependency;
struct work_struct work;
spinlock_t lock;
uint16_t pasid;

};

static const char *amdgpu_tlb_fence_get_driver_name(struct dma_fence *fence)
{
return "amdgpu tlb fence";
}

static const char *amdgpu_tlb_fence_get_timeline_name(struct dma_fence *f)
{
return "amdgpu tlb timeline";
}

static void amdgpu_tlb_fence_work(struct work_struct *work)
{
struct amdgpu_tlb_fence *f = container_of(work, typeof(*f), work);
int r;

if (f->dependency) {
dma_fence_wait(f->dependency, false);
dma_fence_put(f->dependency);
f->dependency = NULL;
}

r = amdgpu_gmc_flush_gpu_tlb_pasid(f->adev, f->pasid, 2, true, 0);
if (r) {
dev_err(f->adev->dev, "TLB flush failed for PASID %d.\n",
f->pasid);
dma_fence_set_error(&f->base, r);
}

dma_fence_signal(&f->base);
dma_fence_put(&f->base);
}

static const struct dma_fence_ops amdgpu_tlb_fence_ops = {
.use_64bit_seqno = true,
.get_driver_name = amdgpu_tlb_fence_get_driver_name,
.get_timeline_name = amdgpu_tlb_fence_get_timeline_name
};

void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct amdgpu_vm *vm,
struct dma_fence **fence)
{
struct amdgpu_tlb_fence *f;

f = kmalloc(sizeof(*f), GFP_KERNEL);
if (!f) {
/*
* We can't fail since the PDEs and PTEs are already updated, so
* just block for the dependency and execute the TLB flush
*/
if (*fence)
dma_fence_wait(*fence, false);

amdgpu_gmc_flush_gpu_tlb_pasid(adev, vm->pasid, 2, true, 0);
*fence = dma_fence_get_stub();
return;
}

f->adev = adev;
f->dependency = *fence;
f->pasid = vm->pasid;
INIT_WORK(&f->work, amdgpu_tlb_fence_work);
spin_lock_init(&f->lock);

dma_fence_init(&f->base, &amdgpu_tlb_fence_ops, &f->lock,
vm->tlb_fence_context, atomic64_read(&vm->tlb_seq));

/* TODO: We probably need a separate wq here */
dma_fence_get(&f->base);
schedule_work(&f->work);

*fence = &f->base;
}
@ -1035,15 +1035,16 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
return 0;
}

static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
struct aca_bank_report *report, void *data)
static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
enum aca_smu_type type, void *data)
{
struct amdgpu_device *adev = handle->adev;
struct aca_bank_info info;
const char *error_str;
u64 status;
u64 status, count;
int ret, ext_error_code;

ret = aca_bank_info_decode(bank, &report->info);
ret = aca_bank_info_decode(bank, &info);
if (ret)
return ret;

@ -1055,15 +1056,28 @@ static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struc
if (error_str)
dev_info(adev->dev, "%s detected\n", error_str);

if ((type == ACA_ERROR_TYPE_UE && ext_error_code == 0) ||
(type == ACA_ERROR_TYPE_CE && ext_error_code == 6))
report->count[type] = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
count = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);

return 0;
switch (type) {
case ACA_SMU_TYPE_UE:
if (ext_error_code != 0 && ext_error_code != 9)
count = 0ULL;

ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, count);
break;
case ACA_SMU_TYPE_CE:
count = ext_error_code == 6 ? count : 0ULL;
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE, count);
break;
default:
return -EINVAL;
}

return ret;
}

static const struct aca_bank_ops xgmi_v6_4_0_aca_bank_ops = {
.aca_bank_generate_report = xgmi_v6_4_0_aca_bank_generate_report,
.aca_bank_parser = xgmi_v6_4_0_aca_bank_parser,
};

static const struct aca_info xgmi_v6_4_0_aca_info = {

@ -44,6 +44,7 @@ struct amdgpu_hive_info {

struct amdgpu_reset_domain *reset_domain;
atomic_t ras_recovery;
struct ras_event_manager event_mgr;
};

struct amdgpu_pcs_ras_field {
@ -94,7 +94,8 @@ union amd_sriov_msg_feature_flags {
|
||||
uint32_t reg_indirect_acc : 1;
|
||||
uint32_t av1_support : 1;
|
||||
uint32_t vcn_rb_decouple : 1;
|
||||
uint32_t reserved : 24;
|
||||
uint32_t mes_info_enable : 1;
|
||||
uint32_t reserved : 23;
|
||||
} flags;
|
||||
uint32_t all;
|
||||
};
|
||||
@ -157,7 +158,7 @@ struct amd_sriov_msg_pf2vf_info_header {
|
||||
uint32_t reserved[2];
|
||||
};
|
||||
|
||||
#define AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE (48)
|
||||
#define AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE (49)
|
||||
struct amd_sriov_msg_pf2vf_info {
|
||||
/* header contains size and version */
|
||||
struct amd_sriov_msg_pf2vf_info_header header;
|
||||
@ -208,6 +209,8 @@ struct amd_sriov_msg_pf2vf_info {
|
||||
struct amd_sriov_msg_uuid_info uuid_info;
|
||||
/* PCIE atomic ops support flag */
|
||||
uint32_t pcie_atomic_ops_support_flags;
|
||||
/* Portion of GPU memory occupied by VF. MAX value is 65535, but set to uint32_t to maintain alignment with reserved size */
|
||||
uint32_t gpu_capacity;
|
||||
/* reserved */
|
||||
uint32_t reserved[256 - AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE];
|
||||
};
|
||||
@ -221,7 +224,7 @@ struct amd_sriov_msg_vf2pf_info_header {
|
||||
uint32_t reserved[2];
|
||||
};
|
||||
|
||||
#define AMD_SRIOV_MSG_VF2PF_INFO_FILLED_SIZE (70)
|
||||
#define AMD_SRIOV_MSG_VF2PF_INFO_FILLED_SIZE (73)
|
||||
struct amd_sriov_msg_vf2pf_info {
|
||||
/* header contains size and version */
|
||||
struct amd_sriov_msg_vf2pf_info_header header;
|
||||
@ -265,7 +268,9 @@ struct amd_sriov_msg_vf2pf_info {
|
||||
uint32_t version;
|
||||
} ucode_info[AMD_SRIOV_MSG_RESERVE_UCODE];
|
||||
uint64_t dummy_page_addr;
|
||||
|
||||
/* FB allocated for guest MES to record UQ info */
|
||||
uint64_t mes_info_addr;
|
||||
uint32_t mes_info_size;
|
||||
/* reserved */
|
||||
uint32_t reserved[256 - AMD_SRIOV_MSG_VF2PF_INFO_FILLED_SIZE];
|
||||
};
|
||||
|
@ -62,6 +62,11 @@ void aqua_vanjaram_doorbell_index_init(struct amdgpu_device *adev)
|
||||
adev->doorbell_index.max_assignment = AMDGPU_DOORBELL_LAYOUT1_MAX_ASSIGNMENT << 1;
|
||||
}
|
||||
|
||||
static bool aqua_vanjaram_xcp_vcn_shared(struct amdgpu_device *adev)
|
||||
{
|
||||
return (adev->xcp_mgr->num_xcps > adev->vcn.num_vcn_inst);
|
||||
}
|
||||
|
||||
static void aqua_vanjaram_set_xcp_id(struct amdgpu_device *adev,
|
||||
uint32_t inst_idx, struct amdgpu_ring *ring)
|
||||
{
|
||||
@ -87,7 +92,7 @@ static void aqua_vanjaram_set_xcp_id(struct amdgpu_device *adev,
|
||||
case AMDGPU_RING_TYPE_VCN_ENC:
|
||||
case AMDGPU_RING_TYPE_VCN_JPEG:
|
||||
ip_blk = AMDGPU_XCP_VCN;
|
||||
if (adev->xcp_mgr->mode == AMDGPU_CPX_PARTITION_MODE)
|
||||
if (aqua_vanjaram_xcp_vcn_shared(adev))
|
||||
inst_mask = 1 << (inst_idx * 2);
|
||||
break;
|
||||
default:
|
||||
@ -140,10 +145,12 @@ static int aqua_vanjaram_xcp_sched_list_update(
|
||||
|
||||
aqua_vanjaram_xcp_gpu_sched_update(adev, ring, ring->xcp_id);
|
||||
|
||||
/* VCN is shared by two partitions under CPX MODE */
|
||||
/* VCN may be shared by two partitions under CPX MODE in certain
|
||||
* configs.
|
||||
*/
|
||||
if ((ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC ||
|
||||
ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG) &&
|
||||
adev->xcp_mgr->mode == AMDGPU_CPX_PARTITION_MODE)
|
||||
ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG) &&
|
||||
aqua_vanjaram_xcp_vcn_shared(adev))
|
||||
aqua_vanjaram_xcp_gpu_sched_update(adev, ring, ring->xcp_id + 1);
|
||||
}
|
||||
|
||||
|
@ -1375,14 +1375,14 @@ static int cik_asic_pci_config_reset(struct amdgpu_device *adev)
return r;
}

static bool cik_asic_supports_baco(struct amdgpu_device *adev)
static int cik_asic_supports_baco(struct amdgpu_device *adev)
{
switch (adev->asic_type) {
case CHIP_BONAIRE:
case CHIP_HAWAII:
return amdgpu_dpm_is_baco_supported(adev);
default:
return false;
return 0;
}
}

@ -3964,7 +3964,7 @@ static void gfx_v10_0_check_gfxoff_flag(struct amdgpu_device *adev)
|
||||
|
||||
static int gfx_v10_0_init_microcode(struct amdgpu_device *adev)
|
||||
{
|
||||
char fw_name[40];
|
||||
char fw_name[53];
|
||||
char ucode_prefix[30];
|
||||
const char *wks = "";
|
||||
int err;
|
||||
@ -4518,7 +4518,7 @@ static int gfx_v10_0_sw_init(void *handle)
|
||||
case IP_VERSION(10, 3, 3):
|
||||
case IP_VERSION(10, 3, 7):
|
||||
adev->gfx.me.num_me = 1;
|
||||
adev->gfx.me.num_pipe_per_me = 1;
|
||||
adev->gfx.me.num_pipe_per_me = 2;
|
||||
adev->gfx.me.num_queue_per_pipe = 1;
|
||||
adev->gfx.mec.num_mec = 2;
|
||||
adev->gfx.mec.num_pipe_per_mec = 4;
|
||||
@ -8317,7 +8317,7 @@ static void gfx_v10_0_ring_emit_hdp_flush(struct amdgpu_ring *ring)
|
||||
}
|
||||
reg_mem_engine = 0;
|
||||
} else {
|
||||
ref_and_mask = nbio_hf_reg->ref_and_mask_cp0;
|
||||
ref_and_mask = nbio_hf_reg->ref_and_mask_cp0 << ring->pipe;
|
||||
reg_mem_engine = 1; /* pfp */
|
||||
}
|
||||
|
||||
|
@ -510,7 +510,7 @@ static void gfx_v11_0_check_fw_cp_gfx_shadow(struct amdgpu_device *adev)
|
||||
static int gfx_v11_0_init_microcode(struct amdgpu_device *adev)
|
||||
{
|
||||
char fw_name[40];
|
||||
char ucode_prefix[30];
|
||||
char ucode_prefix[25];
|
||||
int err;
|
||||
const struct rlc_firmware_header_v2_0 *rlc_hdr;
|
||||
uint16_t version_major;
|
||||
@ -1635,7 +1635,7 @@ static void gfx_v11_0_setup_rb(struct amdgpu_device *adev)
|
||||
active_rb_bitmap |= (0x3 << (i * rb_bitmap_width_per_sa));
|
||||
}
|
||||
|
||||
active_rb_bitmap |= global_active_rb_bitmap;
|
||||
active_rb_bitmap &= global_active_rb_bitmap;
|
||||
adev->gfx.config.backend_enable_mask = active_rb_bitmap;
|
||||
adev->gfx.config.num_rbs = hweight32(active_rb_bitmap);
|
||||
}
|
||||
@ -5465,6 +5465,7 @@ static void gfx_v11_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
|
||||
/* Make sure that we can't skip the SET_Q_MODE packets when the VM
|
||||
* changed in any way.
|
||||
*/
|
||||
ring->set_q_mode_offs = 0;
|
||||
ring->set_q_mode_ptr = NULL;
|
||||
}
|
||||
|
||||
|
@ -1249,7 +1249,7 @@ static void gfx_v9_0_check_if_need_gfxoff(struct amdgpu_device *adev)
|
||||
static int gfx_v9_0_init_cp_gfx_microcode(struct amdgpu_device *adev,
|
||||
char *chip_name)
|
||||
{
|
||||
char fw_name[30];
|
||||
char fw_name[50];
|
||||
int err;
|
||||
|
||||
snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_pfp.bin", chip_name);
|
||||
@ -1282,7 +1282,7 @@ out:
|
||||
static int gfx_v9_0_init_rlc_microcode(struct amdgpu_device *adev,
|
||||
char *chip_name)
|
||||
{
|
||||
char fw_name[30];
|
||||
char fw_name[53];
|
||||
int err;
|
||||
const struct rlc_firmware_header_v2_0 *rlc_hdr;
|
||||
uint16_t version_major;
|
||||
@ -1337,7 +1337,7 @@ static bool gfx_v9_0_load_mec2_fw_bin_support(struct amdgpu_device *adev)
|
||||
static int gfx_v9_0_init_cp_compute_microcode(struct amdgpu_device *adev,
|
||||
char *chip_name)
|
||||
{
|
||||
char fw_name[30];
|
||||
char fw_name[50];
|
||||
int err;
|
||||
|
||||
if (amdgpu_sriov_vf(adev) && (adev->asic_type == CHIP_ALDEBARAN))
|
||||
|
@ -1909,18 +1909,7 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev)
|
||||
mutex_unlock(&adev->grbm_idx_mutex);
|
||||
}
|
||||
|
||||
static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device *adev)
|
||||
{
|
||||
u32 status = 0;
|
||||
struct amdgpu_vmhub *hub;
|
||||
|
||||
hub = &adev->vmhub[AMDGPU_GFXHUB(0)];
|
||||
status = RREG32(hub->vm_l2_pro_fault_status);
|
||||
/* reset page fault status */
|
||||
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
|
||||
|
||||
return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
|
||||
}
|
||||
|
||||
struct amdgpu_ras_block_hw_ops gfx_v9_4_2_ras_ops = {
|
||||
.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
|
||||
@ -1934,5 +1923,4 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
|
||||
.hw_ops = &gfx_v9_4_2_ras_ops,
|
||||
},
|
||||
.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
|
||||
.query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status,
|
||||
};
|
||||
|
@ -680,38 +680,44 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_3_gfx_funcs = {
|
||||
.ih_node_to_logical_xcc = &gfx_v9_4_3_ih_to_xcc_inst,
|
||||
};
|
||||
|
||||
static int gfx_v9_4_3_aca_bank_generate_report(struct aca_handle *handle,
|
||||
struct aca_bank *bank, enum aca_error_type type,
|
||||
struct aca_bank_report *report, void *data)
|
||||
static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle,
|
||||
struct aca_bank *bank, enum aca_smu_type type,
|
||||
void *data)
|
||||
{
|
||||
u64 status, misc0;
|
||||
struct aca_bank_info info;
|
||||
u64 misc0;
|
||||
u32 instlo;
|
||||
int ret;
|
||||
|
||||
status = bank->regs[ACA_REG_IDX_STATUS];
|
||||
if ((type == ACA_ERROR_TYPE_UE &&
|
||||
ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) ||
|
||||
(type == ACA_ERROR_TYPE_CE &&
|
||||
ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) {
|
||||
ret = aca_bank_info_decode(bank, &info);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = aca_bank_info_decode(bank, &report->info);
|
||||
if (ret)
|
||||
return ret;
|
||||
/* NOTE: overwrite info.die_id with xcd id for gfx */
|
||||
instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]);
|
||||
instlo &= GENMASK(31, 1);
|
||||
info.die_id = instlo == mmSMNAID_XCD0_MCA_SMU ? 0 : 1;
|
||||
|
||||
/* NOTE: overwrite info.die_id with xcd id for gfx */
|
||||
instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]);
|
||||
instlo &= GENMASK(31, 1);
|
||||
report->info.die_id = instlo == mmSMNAID_XCD0_MCA_SMU ? 0 : 1;
|
||||
misc0 = bank->regs[ACA_REG_IDX_MISC0];
|
||||
|
||||
misc0 = bank->regs[ACA_REG_IDX_MISC0];
|
||||
report->count[type] = ACA_REG__MISC0__ERRCNT(misc0);
|
||||
switch (type) {
|
||||
case ACA_SMU_TYPE_UE:
|
||||
ret = aca_error_cache_log_bank_error(handle, &info,
|
||||
ACA_ERROR_TYPE_UE, 1ULL);
|
||||
break;
|
||||
case ACA_SMU_TYPE_CE:
|
||||
ret = aca_error_cache_log_bank_error(handle, &info,
|
||||
ACA_ERROR_TYPE_CE, ACA_REG__MISC0__ERRCNT(misc0));
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool gfx_v9_4_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
|
||||
enum aca_error_type type, void *data)
|
||||
enum aca_smu_type type, void *data)
|
||||
{
|
||||
u32 instlo;
|
||||
|
||||
@ -730,7 +736,7 @@ static bool gfx_v9_4_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_b
|
||||
}
|
||||
|
||||
static const struct aca_bank_ops gfx_v9_4_3_aca_bank_ops = {
|
||||
.aca_bank_generate_report = gfx_v9_4_3_aca_bank_generate_report,
|
||||
.aca_bank_parser = gfx_v9_4_3_aca_bank_parser,
|
||||
.aca_bank_is_valid = gfx_v9_4_3_aca_bank_is_valid,
|
||||
};
|
||||
|
||||
|
@ -443,6 +443,22 @@ static void gfxhub_v1_0_init(struct amdgpu_device *adev)
|
||||
mmVM_INVALIDATE_ENG0_ADDR_RANGE_LO32;
|
||||
}
|
||||
|
||||
static bool gfxhub_v1_0_query_utcl2_poison_status(struct amdgpu_device *adev,
|
||||
int xcc_id)
|
||||
{
|
||||
u32 status = 0;
|
||||
struct amdgpu_vmhub *hub;
|
||||
|
||||
if (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(9, 4, 2))
|
||||
return false;
|
||||
|
||||
hub = &adev->vmhub[AMDGPU_GFXHUB(0)];
|
||||
status = RREG32(hub->vm_l2_pro_fault_status);
|
||||
/* reset page fault status */
|
||||
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
|
||||
|
||||
return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
|
||||
}
|
||||
|
||||
const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = {
|
||||
.get_mc_fb_offset = gfxhub_v1_0_get_mc_fb_offset,
|
||||
@ -452,4 +468,5 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = {
|
||||
.set_fault_enable_default = gfxhub_v1_0_set_fault_enable_default,
|
||||
.init = gfxhub_v1_0_init,
|
||||
.get_xgmi_info = gfxhub_v1_1_get_xgmi_info,
|
||||
.query_utcl2_poison_status = gfxhub_v1_0_query_utcl2_poison_status,
|
||||
};
|
||||
|
@ -620,6 +620,20 @@ static int gfxhub_v1_2_get_xgmi_info(struct amdgpu_device *adev)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool gfxhub_v1_2_query_utcl2_poison_status(struct amdgpu_device *adev,
|
||||
int xcc_id)
|
||||
{
|
||||
u32 fed, status;
|
||||
|
||||
status = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regVM_L2_PROTECTION_FAULT_STATUS);
|
||||
fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
|
||||
/* reset page fault status */
|
||||
WREG32_P(SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id),
|
||||
regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1);
|
||||
|
||||
return fed;
|
||||
}
|
||||
|
||||
const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = {
|
||||
.get_mc_fb_offset = gfxhub_v1_2_get_mc_fb_offset,
|
||||
.setup_vm_pt_regs = gfxhub_v1_2_setup_vm_pt_regs,
|
||||
@ -628,6 +642,7 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = {
|
||||
.set_fault_enable_default = gfxhub_v1_2_set_fault_enable_default,
|
||||
.init = gfxhub_v1_2_init,
|
||||
.get_xgmi_info = gfxhub_v1_2_get_xgmi_info,
|
||||
.query_utcl2_poison_status = gfxhub_v1_2_query_utcl2_poison_status,
|
||||
};
|
||||
|
||||
static int gfxhub_v1_2_xcp_resume(void *handle, uint32_t inst_mask)
|
||||
|
@ -548,7 +548,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
|
||||
{
|
||||
bool retry_fault = !!(entry->src_data[1] & 0x80);
|
||||
bool write_fault = !!(entry->src_data[1] & 0x20);
|
||||
uint32_t status = 0, cid = 0, rw = 0;
|
||||
uint32_t status = 0, cid = 0, rw = 0, fed = 0;
|
||||
struct amdgpu_task_info *task_info;
|
||||
struct amdgpu_vmhub *hub;
|
||||
const char *mmhub_cid;
|
||||
@ -664,6 +664,13 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
|
||||
status = RREG32(hub->vm_l2_pro_fault_status);
|
||||
cid = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, CID);
|
||||
rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW);
|
||||
fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
|
||||
|
||||
/* for fed error, kfd will handle it, return directly */
|
||||
if (fed && amdgpu_ras_is_poison_mode_supported(adev) &&
|
||||
(amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2)))
|
||||
return 0;
|
||||
|
||||
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
|
||||
|
||||
amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub);
|
||||
@ -1450,7 +1457,6 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev)
|
||||
adev->umc.channel_offs = UMC_V12_0_PER_CHANNEL_OFFSET;
|
||||
adev->umc.active_mask = adev->aid_mask;
|
||||
adev->umc.retire_unit = UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL;
|
||||
adev->umc.channel_idx_tbl = &umc_v12_0_channel_idx_tbl[0][0][0];
|
||||
if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
|
||||
adev->umc.ras = &umc_v12_0_ras;
|
||||
break;
|
||||
|
@ -411,14 +411,47 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes)
|
||||
mes_set_hw_res_pkt.enable_reg_active_poll = 1;
|
||||
mes_set_hw_res_pkt.enable_level_process_quantum_check = 1;
|
||||
mes_set_hw_res_pkt.oversubscription_timer = 50;
|
||||
mes_set_hw_res_pkt.enable_mes_event_int_logging = 1;
|
||||
mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = mes->event_log_gpu_addr;
|
||||
if (amdgpu_mes_log_enable) {
|
||||
mes_set_hw_res_pkt.enable_mes_event_int_logging = 1;
|
||||
mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr =
|
||||
mes->event_log_gpu_addr;
|
||||
}
|
||||
|
||||
return mes_v11_0_submit_pkt_and_poll_completion(mes,
|
||||
&mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt),
|
||||
offsetof(union MESAPI_SET_HW_RESOURCES, api_status));
|
||||
}
|
||||
|
||||
static int mes_v11_0_set_hw_resources_1(struct amdgpu_mes *mes)
|
||||
{
|
||||
int size = 128 * PAGE_SIZE;
|
||||
int ret = 0;
|
||||
struct amdgpu_device *adev = mes->adev;
|
||||
union MESAPI_SET_HW_RESOURCES_1 mes_set_hw_res_pkt;
|
||||
memset(&mes_set_hw_res_pkt, 0, sizeof(mes_set_hw_res_pkt));
|
||||
|
||||
mes_set_hw_res_pkt.header.type = MES_API_TYPE_SCHEDULER;
|
||||
mes_set_hw_res_pkt.header.opcode = MES_SCH_API_SET_HW_RSRC_1;
|
||||
mes_set_hw_res_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
|
||||
mes_set_hw_res_pkt.enable_mes_info_ctx = 1;
|
||||
|
||||
ret = amdgpu_bo_create_kernel(adev, size, PAGE_SIZE,
|
||||
AMDGPU_GEM_DOMAIN_VRAM,
|
||||
&mes->resource_1,
|
||||
&mes->resource_1_gpu_addr,
|
||||
&mes->resource_1_addr);
|
||||
if (ret) {
|
||||
dev_err(adev->dev, "(%d) failed to create mes resource_1 bo\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
mes_set_hw_res_pkt.mes_info_ctx_mc_addr = mes->resource_1_gpu_addr;
|
||||
mes_set_hw_res_pkt.mes_info_ctx_size = mes->resource_1->tbo.base.size;
|
||||
return mes_v11_0_submit_pkt_and_poll_completion(mes,
|
||||
&mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt),
|
||||
offsetof(union MESAPI_SET_HW_RESOURCES_1, api_status));
|
||||
}
|
||||
|
||||
static const struct amdgpu_mes_funcs mes_v11_0_funcs = {
|
||||
.add_hw_queue = mes_v11_0_add_hw_queue,
|
||||
.remove_hw_queue = mes_v11_0_remove_hw_queue,
|
||||
@ -1200,6 +1233,14 @@ static int mes_v11_0_hw_init(void *handle)
|
||||
if (r)
|
||||
goto failure;
|
||||
|
||||
if (amdgpu_sriov_is_mes_info_enable(adev)) {
|
||||
r = mes_v11_0_set_hw_resources_1(&adev->mes);
|
||||
if (r) {
|
||||
DRM_ERROR("failed mes_v11_0_set_hw_resources_1, r=%d\n", r);
|
||||
goto failure;
|
||||
}
|
||||
}
|
||||
|
||||
r = mes_v11_0_query_sched_status(&adev->mes);
|
||||
if (r) {
|
||||
DRM_ERROR("MES is busy\n");
|
||||
@ -1223,6 +1264,11 @@ failure:
|
||||
|
||||
static int mes_v11_0_hw_fini(void *handle)
|
||||
{
|
||||
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
|
||||
if (amdgpu_sriov_is_mes_info_enable(adev)) {
|
||||
amdgpu_bo_free_kernel(&adev->mes.resource_1, &adev->mes.resource_1_gpu_addr,
|
||||
&adev->mes.resource_1_addr);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -559,6 +559,20 @@ static void mmhub_v1_8_get_clockgating(struct amdgpu_device *adev, u64 *flags)
|
||||
|
||||
}
|
||||
|
||||
static bool mmhub_v1_8_query_utcl2_poison_status(struct amdgpu_device *adev,
|
||||
int hub_inst)
|
||||
{
|
||||
u32 fed, status;
|
||||
|
||||
status = RREG32_SOC15(MMHUB, hub_inst, regVM_L2_PROTECTION_FAULT_STATUS);
|
||||
fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
|
||||
/* reset page fault status */
|
||||
WREG32_P(SOC15_REG_OFFSET(MMHUB, hub_inst,
|
||||
regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1);
|
||||
|
||||
return fed;
|
||||
}
|
||||
|
||||
const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = {
|
||||
.get_fb_location = mmhub_v1_8_get_fb_location,
|
||||
.init = mmhub_v1_8_init,
|
||||
@ -568,6 +582,7 @@ const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = {
|
||||
.setup_vm_pt_regs = mmhub_v1_8_setup_vm_pt_regs,
|
||||
.set_clockgating = mmhub_v1_8_set_clockgating,
|
||||
.get_clockgating = mmhub_v1_8_get_clockgating,
|
||||
.query_utcl2_poison_status = mmhub_v1_8_query_utcl2_poison_status,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ras_err_status_reg_entry mmhub_v1_8_ce_reg_list[] = {
|
||||
@ -706,28 +721,32 @@ static const struct amdgpu_ras_block_hw_ops mmhub_v1_8_ras_hw_ops = {
|
||||
.reset_ras_error_count = mmhub_v1_8_reset_ras_error_count,
|
||||
};
|
||||
|
||||
static int mmhub_v1_8_aca_bank_generate_report(struct aca_handle *handle,
|
||||
struct aca_bank *bank, enum aca_error_type type,
|
||||
struct aca_bank_report *report, void *data)
|
||||
static int mmhub_v1_8_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
|
||||
enum aca_smu_type type, void *data)
|
||||
{
|
||||
u64 status, misc0;
|
||||
struct aca_bank_info info;
|
||||
u64 misc0;
|
||||
int ret;
|
||||
|
||||
status = bank->regs[ACA_REG_IDX_STATUS];
|
||||
if ((type == ACA_ERROR_TYPE_UE &&
|
||||
ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) ||
|
||||
(type == ACA_ERROR_TYPE_CE &&
|
||||
ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) {
|
||||
ret = aca_bank_info_decode(bank, &info);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = aca_bank_info_decode(bank, &report->info);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
misc0 = bank->regs[ACA_REG_IDX_MISC0];
|
||||
report->count[type] = ACA_REG__MISC0__ERRCNT(misc0);
|
||||
misc0 = bank->regs[ACA_REG_IDX_MISC0];
|
||||
switch (type) {
|
||||
case ACA_SMU_TYPE_UE:
|
||||
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE,
|
||||
1ULL);
|
||||
break;
|
||||
case ACA_SMU_TYPE_CE:
|
||||
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE,
|
||||
ACA_REG__MISC0__ERRCNT(misc0));
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* reference to smu driver if header file */
|
||||
@ -741,7 +760,7 @@ static int mmhub_v1_8_err_codes[] = {
|
||||
};
|
||||
|
||||
static bool mmhub_v1_8_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
|
||||
enum aca_error_type type, void *data)
|
||||
enum aca_smu_type type, void *data)
|
||||
{
|
||||
u32 instlo;
|
||||
|
||||
@ -760,7 +779,7 @@ static bool mmhub_v1_8_aca_bank_is_valid(struct aca_handle *handle, struct aca_b
|
||||
}
|
||||
|
||||
static const struct aca_bank_ops mmhub_v1_8_aca_bank_ops = {
|
||||
.aca_bank_generate_report = mmhub_v1_8_aca_bank_generate_report,
|
||||
.aca_bank_parser = mmhub_v1_8_aca_bank_parser,
|
||||
.aca_bank_is_valid = mmhub_v1_8_aca_bank_is_valid,
|
||||
};
|
||||
|
||||
|
@ -276,6 +276,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
|
||||
timeout -= 10;
|
||||
} while (timeout > 1);
|
||||
|
||||
dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
|
||||
|
||||
flr_done:
|
||||
atomic_set(&adev->reset_domain->in_gpu_reset, 0);
|
||||
up_write(&adev->reset_domain->sem);
|
||||
|
@ -309,6 +309,8 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
|
||||
timeout -= 10;
|
||||
} while (timeout > 1);
|
||||
|
||||
dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
|
||||
|
||||
flr_done:
|
||||
atomic_set(&adev->reset_domain->in_gpu_reset, 0);
|
||||
up_write(&adev->reset_domain->sem);
|
||||
@ -444,7 +446,8 @@ static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev,
|
||||
amdgpu_virt_fini_data_exchange(adev);
|
||||
xgpu_nv_send_access_requests_with_param(adev,
|
||||
IDH_RAS_POISON, block, 0, 0);
|
||||
amdgpu_virt_init_data_exchange(adev);
|
||||
if (block != AMDGPU_RAS_BLOCK__SDMA)
|
||||
amdgpu_virt_init_data_exchange(adev);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1602,19 +1602,9 @@ static int sdma_v4_4_2_set_ecc_irq_state(struct amdgpu_device *adev,
|
||||
u32 sdma_cntl;
|
||||
|
||||
sdma_cntl = RREG32_SDMA(type, regSDMA_CNTL);
|
||||
switch (state) {
|
||||
case AMDGPU_IRQ_STATE_DISABLE:
|
||||
sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA_CNTL,
|
||||
DRAM_ECC_INT_ENABLE, 0);
|
||||
WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl);
|
||||
break;
|
||||
/* sdma ecc interrupt is enabled by default
|
||||
* driver doesn't need to do anything to
|
||||
* enable the interrupt */
|
||||
case AMDGPU_IRQ_STATE_ENABLE:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA_CNTL, DRAM_ECC_INT_ENABLE,
|
||||
state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0);
|
||||
WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -2189,35 +2179,39 @@ static const struct amdgpu_ras_block_hw_ops sdma_v4_4_2_ras_hw_ops = {
|
||||
.reset_ras_error_count = sdma_v4_4_2_reset_ras_error_count,
|
||||
};
|
||||
|
||||
static int sdma_v4_4_2_aca_bank_generate_report(struct aca_handle *handle,
|
||||
struct aca_bank *bank, enum aca_error_type type,
|
||||
struct aca_bank_report *report, void *data)
|
||||
static int sdma_v4_4_2_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
|
||||
enum aca_smu_type type, void *data)
|
||||
{
|
||||
u64 status, misc0;
|
||||
struct aca_bank_info info;
|
||||
u64 misc0;
|
||||
int ret;
|
||||
|
||||
status = bank->regs[ACA_REG_IDX_STATUS];
|
||||
if ((type == ACA_ERROR_TYPE_UE &&
|
||||
ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) ||
|
||||
(type == ACA_ERROR_TYPE_CE &&
|
||||
ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) {
|
||||
ret = aca_bank_info_decode(bank, &info);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = aca_bank_info_decode(bank, &report->info);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
misc0 = bank->regs[ACA_REG_IDX_MISC0];
|
||||
report->count[type] = ACA_REG__MISC0__ERRCNT(misc0);
|
||||
misc0 = bank->regs[ACA_REG_IDX_MISC0];
|
||||
switch (type) {
|
||||
case ACA_SMU_TYPE_UE:
|
||||
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE,
|
||||
1ULL);
|
||||
break;
|
||||
case ACA_SMU_TYPE_CE:
|
||||
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE,
|
||||
ACA_REG__MISC0__ERRCNT(misc0));
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* CODE_SDMA0 - CODE_SDMA4, reference to smu driver if header file */
|
||||
static int sdma_v4_4_2_err_codes[] = { 33, 34, 35, 36 };
|
||||
|
||||
static bool sdma_v4_4_2_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
|
||||
enum aca_error_type type, void *data)
|
||||
enum aca_smu_type type, void *data)
|
||||
{
|
||||
u32 instlo;
|
||||
|
||||
@ -2236,7 +2230,7 @@ static bool sdma_v4_4_2_aca_bank_is_valid(struct aca_handle *handle, struct aca_
|
||||
}
|
||||
|
||||
static const struct aca_bank_ops sdma_v4_4_2_aca_bank_ops = {
|
||||
.aca_bank_generate_report = sdma_v4_4_2_aca_bank_generate_report,
|
||||
.aca_bank_parser = sdma_v4_4_2_aca_bank_parser,
|
||||
.aca_bank_is_valid = sdma_v4_4_2_aca_bank_is_valid,
|
||||
};
|
||||
|
||||
|
@ -507,6 +507,13 @@ static int sdma_v6_0_gfx_resume(struct amdgpu_device *adev)
|
||||
/* set minor_ptr_update to 0 after wptr is programmed */
|
||||
WREG32_SOC15_IP(GC, sdma_v6_0_get_reg_offset(adev, i, regSDMA0_QUEUE0_MINOR_PTR_UPDATE), 0);
|
||||
|
||||
/* Set up sdma hang watchdog */
|
||||
temp = RREG32_SOC15_IP(GC, sdma_v6_0_get_reg_offset(adev, i, regSDMA0_WATCHDOG_CNTL));
|
||||
/* 100ms per unit */
|
||||
temp = REG_SET_FIELD(temp, SDMA0_WATCHDOG_CNTL, QUEUE_HANG_COUNT,
|
||||
max(adev->usec_timeout/100000, 1));
|
||||
WREG32_SOC15_IP(GC, sdma_v6_0_get_reg_offset(adev, i, regSDMA0_WATCHDOG_CNTL), temp);
|
||||
|
||||
/* Set up RESP_MODE to non-copy addresses */
|
||||
temp = RREG32_SOC15_IP(GC, sdma_v6_0_get_reg_offset(adev, i, regSDMA0_UTCL1_CNTL));
|
||||
temp = REG_SET_FIELD(temp, SDMA0_UTCL1_CNTL, RESP_MODE, 3);
|
||||
|
@ -1409,9 +1409,9 @@ static int si_gpu_pci_config_reset(struct amdgpu_device *adev)
return r;
}

static bool si_asic_supports_baco(struct amdgpu_device *adev)
static int si_asic_supports_baco(struct amdgpu_device *adev)
{
return false;
return 0;
}

static enum amd_reset_method

62
drivers/gpu/drm/amd/amdgpu/smuio_v14_0_2.c
Normal file
@ -0,0 +1,62 @@
/*
* Copyright 2023 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "amdgpu.h"
#include "smuio_v14_0_2.h"
#include "smuio/smuio_14_0_2_offset.h"
#include "smuio/smuio_14_0_2_sh_mask.h"
#include <linux/preempt.h>

static u32 smuio_v14_0_2_get_rom_index_offset(struct amdgpu_device *adev)
{
return SOC15_REG_OFFSET(SMUIO, 0, regROM_INDEX);
}

static u32 smuio_v14_0_2_get_rom_data_offset(struct amdgpu_device *adev)
{
return SOC15_REG_OFFSET(SMUIO, 0, regROM_DATA);
}

static u64 smuio_v14_0_2_get_gpu_clock_counter(struct amdgpu_device *adev)
{
u64 clock;
u64 clock_counter_lo, clock_counter_hi_pre, clock_counter_hi_after;

preempt_disable();
clock_counter_hi_pre = (u64)RREG32_SOC15(SMUIO, 0, regGOLDEN_TSC_COUNT_UPPER);
clock_counter_lo = (u64)RREG32_SOC15(SMUIO, 0, regGOLDEN_TSC_COUNT_LOWER);
/* the clock counter may be updated while polling the counters */
clock_counter_hi_after = (u64)RREG32_SOC15(SMUIO, 0, regGOLDEN_TSC_COUNT_UPPER);
if (clock_counter_hi_pre != clock_counter_hi_after)
clock_counter_lo = (u64)RREG32_SOC15(SMUIO, 0, regGOLDEN_TSC_COUNT_LOWER);
preempt_enable();

clock = clock_counter_lo | (clock_counter_hi_after << 32ULL);

return clock;
}

const struct amdgpu_smuio_funcs smuio_v14_0_2_funcs = {
.get_rom_index_offset = smuio_v14_0_2_get_rom_index_offset,
.get_rom_data_offset = smuio_v14_0_2_get_rom_data_offset,
.get_gpu_clock_counter = smuio_v14_0_2_get_gpu_clock_counter,
};
30
drivers/gpu/drm/amd/amdgpu/smuio_v14_0_2.h
Normal file
@ -0,0 +1,30 @@
/*
* Copyright 2023 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef __SMUIO_V14_0_2_H__
#define __SMUIO_V14_0_2_H__

#include "soc15_common.h"

extern const struct amdgpu_smuio_funcs smuio_v14_0_2_funcs;

#endif /* __SMUIO_V14_0_2_H__ */
@ -502,7 +502,7 @@ static int soc15_asic_baco_reset(struct amdgpu_device *adev)
|
||||
static enum amd_reset_method
|
||||
soc15_asic_reset_method(struct amdgpu_device *adev)
|
||||
{
|
||||
bool baco_reset = false;
|
||||
int baco_reset = 0;
|
||||
bool connected_to_cpu = false;
|
||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||
|
||||
@ -540,7 +540,7 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
|
||||
*/
|
||||
if (ras && adev->ras_enabled &&
|
||||
adev->pm.fw_version <= 0x283400)
|
||||
baco_reset = false;
|
||||
baco_reset = 0;
|
||||
} else {
|
||||
baco_reset = amdgpu_dpm_is_baco_supported(adev);
|
||||
}
|
||||
@ -620,7 +620,7 @@ static int soc15_asic_reset(struct amdgpu_device *adev)
|
||||
}
|
||||
}
|
||||
|
||||
static bool soc15_supports_baco(struct amdgpu_device *adev)
|
||||
static int soc15_supports_baco(struct amdgpu_device *adev)
|
||||
{
|
||||
switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
|
||||
case IP_VERSION(9, 0, 0):
|
||||
@ -628,13 +628,13 @@ static bool soc15_supports_baco(struct amdgpu_device *adev)
|
||||
if (adev->asic_type == CHIP_VEGA20) {
|
||||
if (adev->psp.sos.fw_version >= 0x80067)
|
||||
return amdgpu_dpm_is_baco_supported(adev);
|
||||
return false;
|
||||
return 0;
|
||||
} else {
|
||||
return amdgpu_dpm_is_baco_supported(adev);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -457,10 +457,8 @@ static bool soc21_need_full_reset(struct amdgpu_device *adev)
|
||||
{
|
||||
switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
|
||||
case IP_VERSION(11, 0, 0):
|
||||
return amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC);
|
||||
case IP_VERSION(11, 0, 2):
|
||||
case IP_VERSION(11, 0, 3):
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
@ -722,7 +720,10 @@ static int soc21_common_early_init(void *handle)
|
||||
AMD_PG_SUPPORT_VCN |
|
||||
AMD_PG_SUPPORT_JPEG |
|
||||
AMD_PG_SUPPORT_GFX_PG;
|
||||
adev->external_rev_id = adev->rev_id + 0x1;
|
||||
if (adev->rev_id == 0)
|
||||
adev->external_rev_id = 0x1;
|
||||
else
|
||||
adev->external_rev_id = adev->rev_id + 0x10;
|
||||
break;
|
||||
case IP_VERSION(11, 5, 1):
|
||||
adev->cg_flags =
|
||||
@ -869,10 +870,35 @@ static int soc21_common_suspend(void *handle)
|
||||
return soc21_common_hw_fini(adev);
|
||||
}
|
||||
|
||||
static bool soc21_need_reset_on_resume(struct amdgpu_device *adev)
|
||||
{
|
||||
u32 sol_reg1, sol_reg2;
|
||||
|
||||
/* Will reset for the following suspend abort cases.
|
||||
* 1) Only reset dGPU side.
|
||||
* 2) S3 suspend got aborted and TOS is active.
|
||||
*/
|
||||
if (!(adev->flags & AMD_IS_APU) && adev->in_s3 &&
|
||||
!adev->suspend_complete) {
|
||||
sol_reg1 = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_81);
|
||||
msleep(100);
|
||||
sol_reg2 = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_81);
|
||||
|
||||
return (sol_reg1 != sol_reg2);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static int soc21_common_resume(void *handle)
|
||||
{
|
||||
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
|
||||
|
||||
if (soc21_need_reset_on_resume(adev)) {
|
||||
dev_info(adev->dev, "S3 suspend aborted, resetting...");
|
||||
soc21_asic_reset(adev);
|
||||
}
|
||||
|
||||
return soc21_common_hw_init(adev);
|
||||
}
|
||||
|
||||
|
@ -146,6 +146,7 @@ struct ta_ras_mca_addr {
uint32_t ch_inst;
uint32_t umc_inst;
uint32_t node_inst;
uint32_t socket_id;
};

struct ta_ras_phy_addr {

@ -28,28 +28,6 @@
|
||||
#include "umc/umc_12_0_0_sh_mask.h"
|
||||
#include "mp/mp_13_0_6_sh_mask.h"
|
||||
|
||||
const uint32_t
|
||||
umc_v12_0_channel_idx_tbl[]
|
||||
[UMC_V12_0_UMC_INSTANCE_NUM]
|
||||
[UMC_V12_0_CHANNEL_INSTANCE_NUM] = {
|
||||
{{3, 7, 11, 15, 2, 6, 10, 14}, {1, 5, 9, 13, 0, 4, 8, 12},
|
||||
{19, 23, 27, 31, 18, 22, 26, 30}, {17, 21, 25, 29, 16, 20, 24, 28}},
|
||||
{{47, 43, 39, 35, 46, 42, 38, 34}, {45, 41, 37, 33, 44, 40, 36, 32},
|
||||
{63, 59, 55, 51, 62, 58, 54, 50}, {61, 57, 53, 49, 60, 56, 52, 48}},
|
||||
{{79, 75, 71, 67, 78, 74, 70, 66}, {77, 73, 69, 65, 76, 72, 68, 64},
|
||||
{95, 91, 87, 83, 94, 90, 86, 82}, {93, 89, 85, 81, 92, 88, 84, 80}},
|
||||
{{99, 103, 107, 111, 98, 102, 106, 110}, {97, 101, 105, 109, 96, 100, 104, 108},
|
||||
{115, 119, 123, 127, 114, 118, 122, 126}, {113, 117, 121, 125, 112, 116, 120, 124}}
|
||||
};
|
||||
|
||||
/* mapping of MCA error address to normalized address */
|
||||
static const uint32_t umc_v12_0_ma2na_mapping[] = {
|
||||
0, 5, 6, 8, 9, 14, 12, 13,
|
||||
10, 11, 15, 16, 17, 18, 19, 20,
|
||||
21, 22, 23, 24, 25, 26, 27, 28,
|
||||
24, 7, 29, 30,
|
||||
};
|
||||
|
||||
static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
|
||||
uint32_t node_inst,
|
||||
uint32_t umc_inst,
|
||||
@ -192,99 +170,22 @@ static void umc_v12_0_query_ras_error_count(struct amdgpu_device *adev,
|
||||
umc_v12_0_reset_error_count(adev);
|
||||
}
|
||||
|
||||
static bool umc_v12_0_bit_wise_xor(uint32_t val)
|
||||
{
|
||||
bool result = 0;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 32; i++)
|
||||
result = result ^ ((val >> i) & 0x1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static void umc_v12_0_mca_addr_to_pa(struct amdgpu_device *adev,
|
||||
uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst,
|
||||
uint32_t node_inst,
|
||||
struct ta_ras_query_address_output *addr_out)
|
||||
{
|
||||
uint32_t channel_index, i;
|
||||
uint64_t na, soc_pa;
|
||||
uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row;
|
||||
uint32_t bank0, bank1, bank2, bank3, bank;
|
||||
|
||||
bank_hash0 = (err_addr >> UMC_V12_0_MCA_B0_BIT) & 0x1ULL;
|
||||
bank_hash1 = (err_addr >> UMC_V12_0_MCA_B1_BIT) & 0x1ULL;
|
||||
bank_hash2 = (err_addr >> UMC_V12_0_MCA_B2_BIT) & 0x1ULL;
|
||||
bank_hash3 = (err_addr >> UMC_V12_0_MCA_B3_BIT) & 0x1ULL;
|
||||
col = (err_addr >> 1) & 0x1fULL;
|
||||
row = (err_addr >> 10) & 0x3fffULL;
|
||||
|
||||
/* apply bank hash algorithm */
|
||||
bank0 =
|
||||
bank_hash0 ^ (UMC_V12_0_XOR_EN0 &
|
||||
(umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR0) ^
|
||||
(umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR0))));
|
||||
bank1 =
|
||||
bank_hash1 ^ (UMC_V12_0_XOR_EN1 &
|
||||
(umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR1) ^
|
||||
(umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR1))));
|
||||
bank2 =
|
||||
bank_hash2 ^ (UMC_V12_0_XOR_EN2 &
|
||||
(umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR2) ^
|
||||
(umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR2))));
|
||||
bank3 =
|
||||
bank_hash3 ^ (UMC_V12_0_XOR_EN3 &
|
||||
(umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR3) ^
|
||||
(umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR3))));
|
||||
|
||||
bank = bank0 | (bank1 << 1) | (bank2 << 2) | (bank3 << 3);
|
||||
err_addr &= ~0x3c0ULL;
|
||||
err_addr |= (bank << UMC_V12_0_MCA_B0_BIT);
|
||||
|
||||
na = 0x0;
|
||||
/* convert mca error address to normalized address */
|
||||
for (i = 1; i < ARRAY_SIZE(umc_v12_0_ma2na_mapping); i++)
|
||||
na |= ((err_addr >> i) & 0x1ULL) << umc_v12_0_ma2na_mapping[i];
|
||||
|
||||
channel_index =
|
||||
adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num *
|
||||
adev->umc.channel_inst_num +
|
||||
umc_inst * adev->umc.channel_inst_num +
|
||||
ch_inst];
|
||||
/* translate umc channel address to soc pa, 3 parts are included */
|
||||
soc_pa = ADDR_OF_32KB_BLOCK(na) |
|
||||
ADDR_OF_256B_BLOCK(channel_index) |
|
||||
OFFSET_IN_256B_BLOCK(na);
|
||||
|
||||
/* the umc channel bits are not original values, they are hashed */
|
||||
UMC_V12_0_SET_CHANNEL_HASH(channel_index, soc_pa);
|
||||
|
||||
addr_out->pa.pa = soc_pa;
|
||||
addr_out->pa.bank = bank;
|
||||
addr_out->pa.channel_idx = channel_index;
|
||||
}
|
||||
|
||||
static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
|
||||
struct ras_err_data *err_data, uint64_t err_addr,
|
||||
uint32_t ch_inst, uint32_t umc_inst,
|
||||
uint32_t node_inst)
|
||||
struct ras_err_data *err_data,
|
||||
struct ta_ras_query_address_input *addr_in)
|
||||
{
|
||||
uint32_t col, row, row_xor, bank, channel_index;
|
||||
uint64_t soc_pa, retired_page, column;
|
||||
struct ta_ras_query_address_input addr_in;
|
||||
uint64_t soc_pa, retired_page, column, err_addr;
|
||||
struct ta_ras_query_address_output addr_out;
|
||||
|
||||
addr_in.addr_type = TA_RAS_MCA_TO_PA;
|
||||
addr_in.ma.err_addr = err_addr;
|
||||
addr_in.ma.ch_inst = ch_inst;
|
||||
addr_in.ma.umc_inst = umc_inst;
|
||||
addr_in.ma.node_inst = node_inst;
|
||||
err_addr = addr_in->ma.err_addr;
|
||||
addr_in->addr_type = TA_RAS_MCA_TO_PA;
|
||||
if (psp_ras_query_address(&adev->psp, addr_in, &addr_out)) {
|
||||
dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx",
|
||||
err_addr);
|
||||
|
||||
if (psp_ras_query_address(&adev->psp, &addr_in, &addr_out))
|
||||
/* fallback to old path if fail to get pa from psp */
|
||||
umc_v12_0_mca_addr_to_pa(adev, err_addr, ch_inst, umc_inst,
|
||||
node_inst, &addr_out);
|
||||
return;
|
||||
}
|
||||
|
||||
soc_pa = addr_out.pa.pa;
|
||||
bank = addr_out.pa.bank;
|
||||
@ -309,7 +210,7 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
|
||||
"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
|
||||
retired_page, row, col, bank, channel_index);
|
||||
amdgpu_umc_fill_error_record(err_data, err_addr,
|
||||
retired_page, channel_index, umc_inst);
|
||||
retired_page, channel_index, addr_in->ma.umc_inst);
|
||||
|
||||
/* shift R13 bit */
|
||||
retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
|
||||
@ -317,7 +218,7 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
|
||||
"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
|
||||
retired_page, row_xor, col, bank, channel_index);
|
||||
amdgpu_umc_fill_error_record(err_data, err_addr,
|
||||
retired_page, channel_index, umc_inst);
|
||||
retired_page, channel_index, addr_in->ma.umc_inst);
|
||||
}
|
||||
}
|
||||
|
||||
@ -325,10 +226,11 @@ static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
|
||||
uint32_t node_inst, uint32_t umc_inst,
|
||||
uint32_t ch_inst, void *data)
|
||||
{
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)data;
|
||||
struct ta_ras_query_address_input addr_in;
|
||||
uint64_t mc_umc_status_addr;
|
||||
uint64_t mc_umc_status, err_addr;
|
||||
uint64_t mc_umc_addrt0;
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)data;
|
||||
uint64_t umc_reg_offset =
|
||||
get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
|
||||
|
||||
@ -357,8 +259,19 @@ static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
|
||||
|
||||
err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
|
||||
|
||||
umc_v12_0_convert_error_address(adev, err_data, err_addr,
|
||||
ch_inst, umc_inst, node_inst);
|
||||
if (!adev->aid_mask &&
|
||||
adev->smuio.funcs &&
|
||||
adev->smuio.funcs->get_socket_id)
|
||||
addr_in.ma.socket_id = adev->smuio.funcs->get_socket_id(adev);
|
||||
else
|
||||
addr_in.ma.socket_id = 0;
|
||||
|
||||
addr_in.ma.err_addr = err_addr;
|
||||
addr_in.ma.ch_inst = ch_inst;
|
||||
addr_in.ma.umc_inst = umc_inst;
|
||||
addr_in.ma.node_inst = node_inst;
|
||||
|
||||
umc_v12_0_convert_error_address(adev, err_data, &addr_in);
|
||||
}
|
||||
|
||||
/* clear umc status */
|
||||
@ -404,10 +317,16 @@ static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,
|
||||
static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
|
||||
void *ras_error_status)
|
||||
{
|
||||
struct ras_query_context qctx;
|
||||
|
||||
memset(&qctx, 0, sizeof(qctx));
|
||||
qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ?
|
||||
RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID);
|
||||
|
||||
amdgpu_mca_smu_log_ras_error(adev,
|
||||
AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status);
|
||||
AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status, &qctx);
|
||||
amdgpu_mca_smu_log_ras_error(adev,
|
||||
AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status);
|
||||
AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status, &qctx);
|
||||
}
|
||||
|
||||
static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
|
||||
@ -418,12 +337,16 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
|
||||
struct ras_err_info *err_info;
|
||||
struct ras_err_addr *mca_err_addr, *tmp;
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
|
||||
struct ta_ras_query_address_input addr_in;
|
||||
|
||||
for_each_ras_error(err_node, err_data) {
|
||||
err_info = &err_node->err_info;
|
||||
if (list_empty(&err_info->err_addr_list))
|
||||
continue;
|
||||
|
||||
addr_in.ma.node_inst = err_info->mcm_info.die_id;
|
||||
addr_in.ma.socket_id = err_info->mcm_info.socket_id;
|
||||
|
||||
list_for_each_entry_safe(mca_err_addr, tmp, &err_info->err_addr_list, node) {
|
||||
mc_umc_status = mca_err_addr->err_status;
|
||||
if (mc_umc_status &&
|
||||
@ -439,6 +362,10 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
|
||||
MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
|
||||
InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo);
|
||||
|
||||
addr_in.ma.err_addr = err_addr;
|
||||
addr_in.ma.ch_inst = MCA_IPID_LO_2_UMC_CH(InstanceIdLo);
|
||||
addr_in.ma.umc_inst = MCA_IPID_LO_2_UMC_INST(InstanceIdLo);
|
||||
|
||||
dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n",
|
||||
mca_ipid,
|
||||
err_info->mcm_info.die_id,
|
||||
@ -447,10 +374,7 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
|
||||
err_addr);
|
||||
|
||||
umc_v12_0_convert_error_address(adev,
|
||||
err_data, err_addr,
|
||||
MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
|
||||
MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
|
||||
err_info->mcm_info.die_id);
|
||||
err_data, &addr_in);
|
||||
}
|
||||
|
||||
/* Delete error address node from list and free memory */
|
||||
@ -498,43 +422,44 @@ const struct amdgpu_ras_block_hw_ops umc_v12_0_ras_hw_ops = {
|
||||
.query_ras_error_address = umc_v12_0_query_ras_error_address,
|
||||
};
|
||||
|
||||
static int umc_v12_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
|
||||
struct aca_bank_report *report, void *data)
|
||||
static int umc_v12_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
|
||||
enum aca_smu_type type, void *data)
|
||||
{
|
||||
struct amdgpu_device *adev = handle->adev;
|
||||
u64 status;
|
||||
struct aca_bank_info info;
|
||||
enum aca_error_type err_type;
|
||||
u64 status, count;
|
||||
u32 ext_error_code;
|
||||
int ret;
|
||||
|
||||
ret = aca_bank_info_decode(bank, &report->info);
|
||||
status = bank->regs[ACA_REG_IDX_STATUS];
|
||||
if (umc_v12_0_is_deferred_error(adev, status))
|
||||
err_type = ACA_ERROR_TYPE_DEFERRED;
|
||||
else if (umc_v12_0_is_uncorrectable_error(adev, status))
|
||||
err_type = ACA_ERROR_TYPE_UE;
|
||||
else if (umc_v12_0_is_correctable_error(adev, status))
|
||||
err_type = ACA_ERROR_TYPE_CE;
|
||||
else
|
||||
return 0;
|
||||
|
||||
ret = aca_bank_info_decode(bank, &info);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
status = bank->regs[ACA_REG_IDX_STATUS];
|
||||
switch (type) {
|
||||
case ACA_ERROR_TYPE_UE:
|
||||
if (umc_v12_0_is_uncorrectable_error(adev, status)) {
|
||||
report->count[type] = 1;
|
||||
}
|
||||
break;
|
||||
case ACA_ERROR_TYPE_CE:
|
||||
if (umc_v12_0_is_correctable_error(adev, status)) {
|
||||
report->count[type] = 1;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status);
|
||||
count = ext_error_code == 0 ?
|
||||
ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]) : 1ULL;
|
||||
|
||||
return 0;
|
||||
return aca_error_cache_log_bank_error(handle, &info, err_type, count);
|
||||
}
|
||||
|
||||
static const struct aca_bank_ops umc_v12_0_aca_bank_ops = {
|
||||
.aca_bank_generate_report = umc_v12_0_aca_bank_generate_report,
|
||||
.aca_bank_parser = umc_v12_0_aca_bank_parser,
|
||||
};
|
||||
|
||||
const struct aca_info umc_v12_0_aca_info = {
|
||||
.hwip = ACA_HWIP_TYPE_UMC,
|
||||
.mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK,
|
||||
.mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK | ACA_ERROR_DEFERRED_MASK,
|
||||
.bank_ops = &umc_v12_0_aca_bank_ops,
|
||||
};
|
||||
|
||||
|
@ -55,67 +55,12 @@
|
||||
#define UMC_V12_0_NA_MAP_PA_NUM 8
|
||||
/* R13 bit shift should be considered, double the number */
|
||||
#define UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL (UMC_V12_0_NA_MAP_PA_NUM * 2)
|
||||
/* bank bits in MCA error address */
|
||||
#define UMC_V12_0_MCA_B0_BIT 6
|
||||
#define UMC_V12_0_MCA_B1_BIT 7
|
||||
#define UMC_V12_0_MCA_B2_BIT 8
|
||||
#define UMC_V12_0_MCA_B3_BIT 9
|
||||
|
||||
/* column bits in SOC physical address */
|
||||
#define UMC_V12_0_PA_C2_BIT 15
|
||||
#define UMC_V12_0_PA_C4_BIT 21
|
||||
/* row bits in SOC physical address */
|
||||
#define UMC_V12_0_PA_R13_BIT 35
|
||||
/* channel index bits in SOC physical address */
|
||||
#define UMC_V12_0_PA_CH4_BIT 12
|
||||
#define UMC_V12_0_PA_CH5_BIT 13
|
||||
#define UMC_V12_0_PA_CH6_BIT 14
|
||||
|
||||
/* bank hash settings */
|
||||
#define UMC_V12_0_XOR_EN0 1
|
||||
#define UMC_V12_0_XOR_EN1 1
|
||||
#define UMC_V12_0_XOR_EN2 1
|
||||
#define UMC_V12_0_XOR_EN3 1
|
||||
#define UMC_V12_0_COL_XOR0 0x0
|
||||
#define UMC_V12_0_COL_XOR1 0x0
|
||||
#define UMC_V12_0_COL_XOR2 0x800
|
||||
#define UMC_V12_0_COL_XOR3 0x1000
|
||||
#define UMC_V12_0_ROW_XOR0 0x11111
|
||||
#define UMC_V12_0_ROW_XOR1 0x22222
|
||||
#define UMC_V12_0_ROW_XOR2 0x4444
|
||||
#define UMC_V12_0_ROW_XOR3 0x8888
|
||||
|
||||
/* channel hash settings */
|
||||
#define UMC_V12_0_HASH_4K 0
|
||||
#define UMC_V12_0_HASH_64K 1
|
||||
#define UMC_V12_0_HASH_2M 1
|
||||
#define UMC_V12_0_HASH_1G 1
|
||||
#define UMC_V12_0_HASH_1T 1
|
||||
|
||||
/* XOR some bits of PA into CH4~CH6 bits (bits 12~14 of PA),
|
||||
* hash bit is only effective when related setting is enabled
|
||||
*/
|
||||
#define UMC_V12_0_CHANNEL_HASH_CH4(channel_idx, pa) ((((channel_idx) >> 5) & 0x1) ^ \
|
||||
(((pa) >> 20) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \
|
||||
(((pa) >> 27) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \
|
||||
(((pa) >> 34) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \
|
||||
(((pa) >> 41) & 0x1ULL & UMC_V12_0_HASH_1T))
|
||||
#define UMC_V12_0_CHANNEL_HASH_CH5(channel_idx, pa) ((((channel_idx) >> 6) & 0x1) ^ \
|
||||
(((pa) >> 21) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \
|
||||
(((pa) >> 28) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \
|
||||
(((pa) >> 35) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \
|
||||
(((pa) >> 42) & 0x1ULL & UMC_V12_0_HASH_1T))
|
||||
#define UMC_V12_0_CHANNEL_HASH_CH6(channel_idx, pa) ((((channel_idx) >> 4) & 0x1) ^ \
|
||||
(((pa) >> 19) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \
|
||||
(((pa) >> 26) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \
|
||||
(((pa) >> 33) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \
|
||||
(((pa) >> 40) & 0x1ULL & UMC_V12_0_HASH_1T) ^ \
|
||||
(((pa) >> 47) & 0x1ULL & UMC_V12_0_HASH_4K))
|
||||
#define UMC_V12_0_SET_CHANNEL_HASH(channel_idx, pa) do { \
|
||||
(pa) &= ~(0x7ULL << UMC_V12_0_PA_CH4_BIT); \
|
||||
(pa) |= (UMC_V12_0_CHANNEL_HASH_CH4(channel_idx, pa) << UMC_V12_0_PA_CH4_BIT); \
|
||||
(pa) |= (UMC_V12_0_CHANNEL_HASH_CH5(channel_idx, pa) << UMC_V12_0_PA_CH5_BIT); \
|
||||
(pa) |= (UMC_V12_0_CHANNEL_HASH_CH6(channel_idx, pa) << UMC_V12_0_PA_CH6_BIT); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_IPID_LO_2_UMC_CH(_ipid_lo) (((((_ipid_lo) >> 20) & 0x1) * 4) + \
|
||||
(((_ipid_lo) >> 12) & 0xF))
|
||||
@ -127,11 +72,6 @@ bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_
|
||||
|
||||
typedef bool (*check_error_type_func)(struct amdgpu_device *adev, uint64_t mc_umc_status);
|
||||
|
||||
extern const uint32_t
|
||||
umc_v12_0_channel_idx_tbl[]
|
||||
[UMC_V12_0_UMC_INSTANCE_NUM]
|
||||
[UMC_V12_0_CHANNEL_INSTANCE_NUM];
|
||||
|
||||
extern struct amdgpu_umc_ras umc_v12_0_ras;
|
||||
|
||||
#endif
|
||||
|
@ -442,11 +442,6 @@ static void umc_v8_10_ecc_info_query_ras_error_address(struct amdgpu_device *ade
|
||||
umc_v8_10_ecc_info_query_error_address, ras_error_status);
|
||||
}
|
||||
|
||||
static void umc_v8_10_set_eeprom_table_version(struct amdgpu_ras_eeprom_table_header *hdr)
|
||||
{
|
||||
hdr->version = RAS_TABLE_VER_V2_1;
|
||||
}
|
||||
|
||||
const struct amdgpu_ras_block_hw_ops umc_v8_10_ras_hw_ops = {
|
||||
.query_ras_error_count = umc_v8_10_query_ras_error_count,
|
||||
.query_ras_error_address = umc_v8_10_query_ras_error_address,
|
||||
@ -460,5 +455,4 @@ struct amdgpu_umc_ras umc_v8_10_ras = {
|
||||
.query_ras_poison_mode = umc_v8_10_query_ras_poison_mode,
|
||||
.ecc_info_query_ras_error_count = umc_v8_10_ecc_info_query_ras_error_count,
|
||||
.ecc_info_query_ras_error_address = umc_v8_10_ecc_info_query_ras_error_address,
|
||||
.set_eeprom_table_version = umc_v8_10_set_eeprom_table_version,
|
||||
};
|
||||
|
@ -225,6 +225,8 @@ static int umsch_mm_v4_0_ring_start(struct amdgpu_umsch_mm *umsch)
|
||||
|
||||
WREG32_SOC15(VCN, 0, regVCN_UMSCH_RB_SIZE, ring->ring_size);
|
||||
|
||||
ring->wptr = 0;
|
||||
|
||||
data = RREG32_SOC15(VCN, 0, regVCN_RB_ENABLE);
|
||||
data &= ~(VCN_RB_ENABLE__AUDIO_RB_EN_MASK);
|
||||
WREG32_SOC15(VCN, 0, regVCN_RB_ENABLE, data);
|
||||
|
@ -897,7 +897,7 @@ static int vi_asic_pci_config_reset(struct amdgpu_device *adev)
|
||||
return r;
|
||||
}
|
||||
|
||||
static bool vi_asic_supports_baco(struct amdgpu_device *adev)
|
||||
static int vi_asic_supports_baco(struct amdgpu_device *adev)
|
||||
{
|
||||
switch (adev->asic_type) {
|
||||
case CHIP_FIJI:
|
||||
@ -908,14 +908,14 @@ static bool vi_asic_supports_baco(struct amdgpu_device *adev)
|
||||
case CHIP_TOPAZ:
|
||||
return amdgpu_dpm_is_baco_supported(adev);
|
||||
default:
|
||||
return false;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static enum amd_reset_method
|
||||
vi_asic_reset_method(struct amdgpu_device *adev)
|
||||
{
|
||||
bool baco_reset;
|
||||
int baco_reset;
|
||||
|
||||
if (amdgpu_reset_method == AMD_RESET_METHOD_LEGACY ||
|
||||
amdgpu_reset_method == AMD_RESET_METHOD_BACO)
|
||||
@ -935,7 +935,7 @@ vi_asic_reset_method(struct amdgpu_device *adev)
|
||||
baco_reset = amdgpu_dpm_is_baco_supported(adev);
|
||||
break;
|
||||
default:
|
||||
baco_reset = false;
|
||||
baco_reset = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -960,7 +960,6 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
|
||||
{
|
||||
struct kfd_node *node;
|
||||
int i;
|
||||
int count;
|
||||
|
||||
if (!kfd->init_complete)
|
||||
return;
|
||||
@ -968,12 +967,10 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
|
||||
/* for runtime suspend, skip locking kfd */
|
||||
if (!run_pm) {
|
||||
mutex_lock(&kfd_processes_mutex);
|
||||
count = ++kfd_locked;
|
||||
mutex_unlock(&kfd_processes_mutex);
|
||||
|
||||
/* For first KFD device suspend all the KFD processes */
|
||||
if (count == 1)
|
||||
if (++kfd_locked == 1)
|
||||
kfd_suspend_all_processes();
|
||||
mutex_unlock(&kfd_processes_mutex);
|
||||
}
|
||||
|
||||
for (i = 0; i < kfd->num_nodes; i++) {
|
||||
@ -984,7 +981,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
|
||||
|
||||
int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
|
||||
{
|
||||
int ret, count, i;
|
||||
int ret, i;
|
||||
|
||||
if (!kfd->init_complete)
|
||||
return 0;
|
||||
@ -998,12 +995,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
|
||||
/* for runtime resume, skip unlocking kfd */
|
||||
if (!run_pm) {
|
||||
mutex_lock(&kfd_processes_mutex);
|
||||
count = --kfd_locked;
|
||||
mutex_unlock(&kfd_processes_mutex);
|
||||
|
||||
WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
|
||||
if (count == 0)
|
||||
if (--kfd_locked == 0)
|
||||
ret = kfd_resume_all_processes();
|
||||
WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error");
|
||||
mutex_unlock(&kfd_processes_mutex);
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
@ -1997,10 +1997,10 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
|
||||
* check those fields
|
||||
*/
|
||||
mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
|
||||
if (mqd_mgr->read_doorbell_id(dqm->packet_mgr.priv_queue->queue->mqd)) {
|
||||
dev_err(dev, "HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out\n");
|
||||
if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) {
|
||||
while (halt_if_hws_hang)
|
||||
schedule();
|
||||
kfd_hws_hang(dqm);
|
||||
return -ETIME;
|
||||
}
|
||||
|
||||
|
@ -134,6 +134,7 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
|
||||
{
|
||||
enum amdgpu_ras_block block = 0;
|
||||
int old_poison, ret = -EINVAL;
|
||||
uint32_t reset = 0;
|
||||
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
|
||||
|
||||
if (!p)
|
||||
@ -153,6 +154,8 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
|
||||
case SOC15_IH_CLIENTID_UTCL2:
|
||||
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
|
||||
block = AMDGPU_RAS_BLOCK__GFX;
|
||||
if (ret)
|
||||
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
break;
|
||||
case SOC15_IH_CLIENTID_SDMA0:
|
||||
case SOC15_IH_CLIENTID_SDMA1:
|
||||
@ -160,6 +163,7 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
|
||||
case SOC15_IH_CLIENTID_SDMA3:
|
||||
case SOC15_IH_CLIENTID_SDMA4:
|
||||
block = AMDGPU_RAS_BLOCK__SDMA;
|
||||
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@ -170,17 +174,16 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
|
||||
/* resetting queue passes, do page retirement without gpu reset
|
||||
* resetting queue fails, fallback to gpu reset solution
|
||||
*/
|
||||
if (!ret) {
|
||||
if (!ret)
|
||||
dev_warn(dev->adev->dev,
|
||||
"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
|
||||
client_id);
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
|
||||
} else {
|
||||
else
|
||||
dev_warn(dev->adev->dev,
|
||||
"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
|
||||
client_id);
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
|
||||
}
|
||||
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
|
||||
}
|
||||
|
||||
static bool event_interrupt_isr_v10(struct kfd_node *dev,
|
||||
@ -368,10 +371,25 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
|
||||
client_id == SOC15_IH_CLIENTID_UTCL2) {
|
||||
struct kfd_vm_fault_info info = {0};
|
||||
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
|
||||
uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
|
||||
uint32_t vmid_type = SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry);
|
||||
int hub_inst = 0;
|
||||
struct kfd_hsa_memory_exception_data exception_data;
|
||||
|
||||
if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
|
||||
amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
|
||||
/* gfxhub */
|
||||
if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) {
|
||||
hub_inst = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
|
||||
node_id);
|
||||
if (hub_inst < 0)
|
||||
hub_inst = 0;
|
||||
}
|
||||
|
||||
/* mmhub */
|
||||
if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC)
|
||||
hub_inst = node_id / 4;
|
||||
|
||||
if (amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev,
|
||||
hub_inst, vmid_type)) {
|
||||
event_interrupt_poison_consumption(dev, pasid, client_id);
|
||||
return;
|
||||
}
|
||||
|
@ -193,6 +193,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
|
||||
{
|
||||
enum amdgpu_ras_block block = 0;
|
||||
int ret = -EINVAL;
|
||||
uint32_t reset = 0;
|
||||
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
|
||||
|
||||
if (!p)
|
||||
@ -212,10 +213,13 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
|
||||
if (dev->dqm->ops.reset_queues)
|
||||
ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
|
||||
block = AMDGPU_RAS_BLOCK__GFX;
|
||||
if (ret)
|
||||
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
break;
|
||||
case SOC21_INTSRC_SDMA_ECC:
|
||||
default:
|
||||
block = AMDGPU_RAS_BLOCK__GFX;
|
||||
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
break;
|
||||
}
|
||||
|
||||
@ -223,10 +227,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
|
||||
|
||||
/* resetting queue passes, do page retirement without gpu reset
|
||||
resetting queue fails, fallback to gpu reset solution */
|
||||
if (!ret)
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
|
||||
else
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
|
||||
}
|
||||
|
||||
static bool event_interrupt_isr_v11(struct kfd_node *dev,
|
||||
|
@ -145,6 +145,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
|
||||
{
|
||||
enum amdgpu_ras_block block = 0;
|
||||
int old_poison, ret = -EINVAL;
|
||||
uint32_t reset = 0;
|
||||
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
|
||||
|
||||
if (!p)
|
||||
@ -164,6 +165,15 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
|
||||
case SOC15_IH_CLIENTID_UTCL2:
|
||||
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
|
||||
block = AMDGPU_RAS_BLOCK__GFX;
|
||||
if (ret)
|
||||
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
break;
|
||||
case SOC15_IH_CLIENTID_VMC:
|
||||
case SOC15_IH_CLIENTID_VMC1:
|
||||
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
|
||||
block = AMDGPU_RAS_BLOCK__MMHUB;
|
||||
if (ret)
|
||||
reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
|
||||
break;
|
||||
case SOC15_IH_CLIENTID_SDMA0:
|
||||
case SOC15_IH_CLIENTID_SDMA1:
|
||||
@ -171,6 +181,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
|
||||
case SOC15_IH_CLIENTID_SDMA3:
|
||||
case SOC15_IH_CLIENTID_SDMA4:
|
||||
block = AMDGPU_RAS_BLOCK__SDMA;
|
||||
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@ -181,17 +192,16 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
|
||||
/* resetting queue passes, do page retirement without gpu reset
|
||||
* resetting queue fails, fallback to gpu reset solution
|
||||
*/
|
||||
if (!ret) {
|
||||
if (!ret)
|
||||
dev_warn(dev->adev->dev,
|
||||
"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
|
||||
client_id);
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
|
||||
} else {
|
||||
else
|
||||
dev_warn(dev->adev->dev,
|
||||
"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
|
||||
client_id);
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
|
||||
}
|
||||
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
|
||||
}
|
||||
|
||||
static bool context_id_expected(struct kfd_dev *dev)
|
||||
@ -414,10 +424,25 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
|
||||
client_id == SOC15_IH_CLIENTID_UTCL2) {
|
||||
struct kfd_vm_fault_info info = {0};
|
||||
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
|
||||
uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
|
||||
uint32_t vmid_type = SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry);
|
||||
int hub_inst = 0;
|
||||
struct kfd_hsa_memory_exception_data exception_data;
|
||||
|
||||
if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
|
||||
amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
|
||||
/* gfxhub */
|
||||
if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) {
|
||||
hub_inst = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
|
||||
node_id);
|
||||
if (hub_inst < 0)
|
||||
hub_inst = 0;
|
||||
}
|
||||
|
||||
/* mmhub */
|
||||
if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC)
|
||||
hub_inst = node_id / 4;
|
||||
|
||||
if (amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev,
|
||||
hub_inst, vmid_type)) {
|
||||
event_interrupt_poison_consumption_v9(dev, pasid, client_id);
|
||||
return;
|
||||
}
|
||||
|
@ -104,6 +104,8 @@ void kfd_interrupt_exit(struct kfd_node *node)
|
||||
*/
|
||||
flush_workqueue(node->ih_wq);
|
||||
|
||||
destroy_workqueue(node->ih_wq);
|
||||
|
||||
kfifo_free(&node->ih_fifo);
|
||||
}
|
||||
|
||||
|
@ -290,3 +290,21 @@ uint64_t kfd_mqd_stride(struct mqd_manager *mm,
|
||||
{
|
||||
return mm->mqd_size;
|
||||
}
|
||||
|
||||
bool kfd_check_hiq_mqd_doorbell_id(struct kfd_node *node, uint32_t doorbell_id,
|
||||
uint32_t inst)
|
||||
{
|
||||
if (doorbell_id) {
|
||||
struct device *dev = node->adev->dev;
|
||||
|
||||
if (node->adev->xcp_mgr && node->adev->xcp_mgr->num_xcps > 0)
|
||||
dev_err(dev, "XCC %d: Queue preemption failed for queue with doorbell_id: %x\n",
|
||||
inst, doorbell_id);
|
||||
else
|
||||
dev_err(dev, "Queue preemption failed for queue with doorbell_id: %x\n",
|
||||
doorbell_id);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
@ -119,7 +119,7 @@ struct mqd_manager {
|
||||
#if defined(CONFIG_DEBUG_FS)
|
||||
int (*debugfs_show_mqd)(struct seq_file *m, void *data);
|
||||
#endif
|
||||
uint32_t (*read_doorbell_id)(void *mqd);
|
||||
bool (*check_preemption_failed)(struct mqd_manager *mm, void *mqd);
|
||||
uint64_t (*mqd_stride)(struct mqd_manager *mm,
|
||||
struct queue_properties *p);
|
||||
|
||||
@ -198,4 +198,6 @@ void kfd_get_hiq_xcc_mqd(struct kfd_node *dev,
|
||||
uint64_t kfd_hiq_mqd_stride(struct kfd_node *dev);
|
||||
uint64_t kfd_mqd_stride(struct mqd_manager *mm,
|
||||
struct queue_properties *q);
|
||||
bool kfd_check_hiq_mqd_doorbell_id(struct kfd_node *node, uint32_t doorbell_id,
|
||||
uint32_t inst);
|
||||
#endif /* KFD_MQD_MANAGER_H_ */
|
||||
|
@ -206,11 +206,11 @@ static void __update_mqd(struct mqd_manager *mm, void *mqd,
|
||||
q->is_active = QUEUE_IS_ACTIVE(*q);
|
||||
}
|
||||
|
||||
static uint32_t read_doorbell_id(void *mqd)
|
||||
static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
|
||||
{
|
||||
struct cik_mqd *m = (struct cik_mqd *)mqd;
|
||||
|
||||
return m->queue_doorbell_id0;
|
||||
return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);
|
||||
}
|
||||
|
||||
static void update_mqd(struct mqd_manager *mm, void *mqd,
|
||||
@ -423,7 +423,7 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
|
||||
#if defined(CONFIG_DEBUG_FS)
|
||||
mqd->debugfs_show_mqd = debugfs_show_mqd;
|
||||
#endif
|
||||
mqd->read_doorbell_id = read_doorbell_id;
|
||||
mqd->check_preemption_failed = check_preemption_failed;
|
||||
break;
|
||||
case KFD_MQD_TYPE_DIQ:
|
||||
mqd->allocate_mqd = allocate_mqd;
|
||||
|
@ -224,11 +224,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,
|
||||
q->is_active = QUEUE_IS_ACTIVE(*q);
|
||||
}
|
||||
|
||||
static uint32_t read_doorbell_id(void *mqd)
|
||||
static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
|
||||
{
|
||||
struct v10_compute_mqd *m = (struct v10_compute_mqd *)mqd;
|
||||
|
||||
return m->queue_doorbell_id0;
|
||||
return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);
|
||||
}
|
||||
|
||||
static int get_wave_state(struct mqd_manager *mm, void *mqd,
|
||||
@ -488,7 +488,7 @@ struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type,
|
||||
#if defined(CONFIG_DEBUG_FS)
|
||||
mqd->debugfs_show_mqd = debugfs_show_mqd;
|
||||
#endif
|
||||
mqd->read_doorbell_id = read_doorbell_id;
|
||||
mqd->check_preemption_failed = check_preemption_failed;
|
||||
pr_debug("%s@%i\n", __func__, __LINE__);
|
||||
break;
|
||||
case KFD_MQD_TYPE_DIQ:
|
||||
|
@ -278,11 +278,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,
|
||||
q->is_active = QUEUE_IS_ACTIVE(*q);
|
||||
}
|
||||
|
||||
static uint32_t read_doorbell_id(void *mqd)
|
||||
static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
|
||||
{
|
||||
struct v11_compute_mqd *m = (struct v11_compute_mqd *)mqd;
|
||||
|
||||
return m->queue_doorbell_id0;
|
||||
return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);
|
||||
}
|
||||
|
||||
static int get_wave_state(struct mqd_manager *mm, void *mqd,
|
||||
@ -517,7 +517,7 @@ struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type,
|
||||
#if defined(CONFIG_DEBUG_FS)
|
||||
mqd->debugfs_show_mqd = debugfs_show_mqd;
|
||||
#endif
|
||||
mqd->read_doorbell_id = read_doorbell_id;
|
||||
mqd->check_preemption_failed = check_preemption_failed;
|
||||
pr_debug("%s@%i\n", __func__, __LINE__);
|
||||
break;
|
||||
case KFD_MQD_TYPE_DIQ:
|
||||
|
@ -316,11 +316,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,
|
||||
}
|
||||
|
||||
|
||||
static uint32_t read_doorbell_id(void *mqd)
|
||||
static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
|
||||
{
|
||||
struct v9_mqd *m = (struct v9_mqd *)mqd;
|
||||
|
||||
return m->queue_doorbell_id0;
|
||||
return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);
|
||||
}
|
||||
|
||||
static int get_wave_state(struct mqd_manager *mm, void *mqd,
|
||||
@ -607,6 +607,24 @@ static int destroy_hiq_mqd_v9_4_3(struct mqd_manager *mm, void *mqd,
|
||||
return err;
|
||||
}
|
||||
|
||||
static bool check_preemption_failed_v9_4_3(struct mqd_manager *mm, void *mqd)
|
||||
{
|
||||
uint64_t hiq_mqd_size = kfd_hiq_mqd_stride(mm->dev);
|
||||
uint32_t xcc_mask = mm->dev->xcc_mask;
|
||||
int inst = 0, xcc_id;
|
||||
struct v9_mqd *m;
|
||||
bool ret = false;
|
||||
|
||||
for_each_inst(xcc_id, xcc_mask) {
|
||||
m = get_mqd(mqd + hiq_mqd_size * inst);
|
||||
ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev,
|
||||
m->queue_doorbell_id0, inst);
|
||||
++inst;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void get_xcc_mqd(struct kfd_mem_obj *mqd_mem_obj,
|
||||
struct kfd_mem_obj *xcc_mqd_mem_obj,
|
||||
uint64_t offset)
|
||||
@ -881,15 +899,16 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
|
||||
#if defined(CONFIG_DEBUG_FS)
|
||||
mqd->debugfs_show_mqd = debugfs_show_mqd;
|
||||
#endif
|
||||
mqd->read_doorbell_id = read_doorbell_id;
|
||||
if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) {
|
||||
mqd->init_mqd = init_mqd_hiq_v9_4_3;
|
||||
mqd->load_mqd = hiq_load_mqd_kiq_v9_4_3;
|
||||
mqd->destroy_mqd = destroy_hiq_mqd_v9_4_3;
|
||||
mqd->check_preemption_failed = check_preemption_failed_v9_4_3;
|
||||
} else {
|
||||
mqd->init_mqd = init_mqd_hiq;
|
||||
mqd->load_mqd = kfd_hiq_load_mqd_kiq;
|
||||
mqd->destroy_mqd = destroy_hiq_mqd;
|
||||
mqd->check_preemption_failed = check_preemption_failed;
|
||||
}
|
||||
break;
|
||||
case KFD_MQD_TYPE_DIQ:
|
||||
|
@ -237,11 +237,11 @@ static void __update_mqd(struct mqd_manager *mm, void *mqd,
|
||||
q->is_active = QUEUE_IS_ACTIVE(*q);
|
||||
}
|
||||
|
||||
static uint32_t read_doorbell_id(void *mqd)
|
||||
static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
|
||||
{
|
||||
struct vi_mqd *m = (struct vi_mqd *)mqd;
|
||||
|
||||
return m->queue_doorbell_id0;
|
||||
return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);
|
||||
}
|
||||
|
||||
static void update_mqd(struct mqd_manager *mm, void *mqd,
|
||||
@ -482,7 +482,7 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
|
||||
#if defined(CONFIG_DEBUG_FS)
|
||||
mqd->debugfs_show_mqd = debugfs_show_mqd;
|
||||
#endif
|
||||
mqd->read_doorbell_id = read_doorbell_id;
|
||||
mqd->check_preemption_failed = check_preemption_failed;
|
||||
break;
|
||||
case KFD_MQD_TYPE_DIQ:
|
||||
mqd->allocate_mqd = allocate_mqd;
|
||||
|
@ -819,9 +819,9 @@ struct kfd_process *kfd_create_process(struct task_struct *thread)
|
||||
mutex_lock(&kfd_processes_mutex);
|
||||
|
||||
if (kfd_is_locked()) {
|
||||
mutex_unlock(&kfd_processes_mutex);
|
||||
pr_debug("KFD is locked! Cannot create process");
|
||||
return ERR_PTR(-EINVAL);
|
||||
process = ERR_PTR(-EINVAL);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* A prior open of /dev/kfd could have already created the process. */
|
||||
|
@ -33,6 +33,7 @@ subdir-ccflags-y += -I$(FULL_AMD_DISPLAY_PATH)/dc/hwss
|
||||
subdir-ccflags-y += -I$(FULL_AMD_DISPLAY_PATH)/dc/resource
|
||||
subdir-ccflags-y += -I$(FULL_AMD_DISPLAY_PATH)/dc/dsc
|
||||
subdir-ccflags-y += -I$(FULL_AMD_DISPLAY_PATH)/dc/optc
|
||||
subdir-ccflags-y += -I$(FULL_AMD_DISPLAY_PATH)/dc/dpp
|
||||
subdir-ccflags-y += -I$(FULL_AMD_DISPLAY_PATH)/modules/inc
|
||||
subdir-ccflags-y += -I$(FULL_AMD_DISPLAY_PATH)/modules/freesync
|
||||
subdir-ccflags-y += -I$(FULL_AMD_DISPLAY_PATH)/modules/color
|
||||
|
@ -148,6 +148,9 @@ MODULE_FIRMWARE(FIRMWARE_NAVI12_DMCU);
|
||||
#define FIRMWARE_DCN_35_DMUB "amdgpu/dcn_3_5_dmcub.bin"
|
||||
MODULE_FIRMWARE(FIRMWARE_DCN_35_DMUB);
|
||||
|
||||
#define FIRMWARE_DCN_351_DMUB "amdgpu/dcn_3_5_1_dmcub.bin"
|
||||
MODULE_FIRMWARE(FIRMWARE_DCN_351_DMUB);
|
||||
|
||||
/* Number of bytes in PSP header for firmware. */
|
||||
#define PSP_HEADER_BYTES 0x100
|
||||
|
||||
@ -1723,8 +1726,10 @@ static int amdgpu_dm_init(struct amdgpu_device *adev)
|
||||
|
||||
if (amdgpu_dc_debug_mask & DC_DISABLE_IPS)
|
||||
init_data.flags.disable_ips = DMUB_IPS_DISABLE_ALL;
|
||||
else
|
||||
init_data.flags.disable_ips = DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF;
|
||||
|
||||
init_data.flags.disable_ips_in_vpb = 1;
|
||||
init_data.flags.disable_ips_in_vpb = 0;
|
||||
|
||||
/* Enable DWB for tested platforms only */
|
||||
if (amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0))
|
||||
@ -2626,6 +2631,7 @@ static enum dc_status amdgpu_dm_commit_zero_streams(struct dc *dc)
|
||||
int i;
|
||||
struct dc_stream_state *del_streams[MAX_PIPES];
|
||||
int del_streams_count = 0;
|
||||
struct dc_commit_streams_params params = {};
|
||||
|
||||
memset(del_streams, 0, sizeof(del_streams));
|
||||
|
||||
@ -2652,7 +2658,9 @@ static enum dc_status amdgpu_dm_commit_zero_streams(struct dc *dc)
|
||||
goto fail;
|
||||
}
|
||||
|
||||
res = dc_commit_streams(dc, context->streams, context->stream_count);
|
||||
params.streams = context->streams;
|
||||
params.stream_count = context->stream_count;
|
||||
res = dc_commit_streams(dc, ¶ms);
|
||||
|
||||
fail:
|
||||
dc_state_release(context);
|
||||
@ -2874,6 +2882,7 @@ static int dm_resume(void *handle)
|
||||
struct dc_state *dc_state;
|
||||
int i, r, j, ret;
|
||||
bool need_hotplug = false;
|
||||
struct dc_commit_streams_params commit_params = {};
|
||||
|
||||
if (dm->dc->caps.ips_support) {
|
||||
dc_dmub_srv_apply_idle_power_optimizations(dm->dc, false);
|
||||
@ -2923,7 +2932,9 @@ static int dm_resume(void *handle)
|
||||
dc_enable_dmub_outbox(adev->dm.dc);
|
||||
}
|
||||
|
||||
WARN_ON(!dc_commit_streams(dm->dc, dc_state->streams, dc_state->stream_count));
|
||||
commit_params.streams = dc_state->streams;
|
||||
commit_params.stream_count = dc_state->stream_count;
|
||||
WARN_ON(!dc_commit_streams(dm->dc, &commit_params));
|
||||
|
||||
dm_gpureset_commit_state(dm->cached_dc_state, dm);
|
||||
|
||||
@ -2940,7 +2951,7 @@ static int dm_resume(void *handle)
|
||||
}
|
||||
/* Recreate dc_state - DC invalidates it when setting power state to S3. */
|
||||
dc_state_release(dm_state->context);
|
||||
dm_state->context = dc_state_create(dm->dc);
|
||||
dm_state->context = dc_state_create(dm->dc, NULL);
|
||||
/* TODO: Remove dc_state->dccg, use dc->dccg directly. */
|
||||
|
||||
/* Before powering on DC we need to re-initialize DMUB. */
|
||||
@ -3044,6 +3055,10 @@ static int dm_resume(void *handle)
|
||||
/* Do mst topology probing after resuming cached state*/
|
||||
drm_connector_list_iter_begin(ddev, &iter);
|
||||
drm_for_each_connector_iter(connector, &iter) {
|
||||
|
||||
if (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK)
|
||||
continue;
|
||||
|
||||
aconnector = to_amdgpu_dm_connector(connector);
|
||||
if (aconnector->dc_link->type != dc_connection_mst_branch ||
|
||||
aconnector->mst_root)
|
||||
@ -4820,9 +4835,11 @@ static int dm_init_microcode(struct amdgpu_device *adev)
|
||||
fw_name_dmub = FIRMWARE_DCN_V3_2_1_DMCUB;
|
||||
break;
|
||||
case IP_VERSION(3, 5, 0):
|
||||
case IP_VERSION(3, 5, 1):
|
||||
fw_name_dmub = FIRMWARE_DCN_35_DMUB;
|
||||
break;
|
||||
case IP_VERSION(3, 5, 1):
|
||||
fw_name_dmub = FIRMWARE_DCN_351_DMUB;
|
||||
break;
|
||||
default:
|
||||
/* ASIC doesn't support DMUB. */
|
||||
return 0;
|
||||
@ -5700,8 +5717,8 @@ static void fill_stream_properties_from_drm_display_mode(
|
||||
|
||||
timing_out->aspect_ratio = get_aspect_ratio(mode_in);
|
||||
|
||||
stream->out_transfer_func->type = TF_TYPE_PREDEFINED;
|
||||
stream->out_transfer_func->tf = TRANSFER_FUNCTION_SRGB;
|
||||
stream->out_transfer_func.type = TF_TYPE_PREDEFINED;
|
||||
stream->out_transfer_func.tf = TRANSFER_FUNCTION_SRGB;
|
||||
if (stream->signal == SIGNAL_TYPE_HDMI_TYPE_A) {
|
||||
if (!adjust_colour_depth_from_display_info(timing_out, info) &&
|
||||
drm_mode_is_420_also(info, mode_in) &&
|
||||
@ -5921,6 +5938,9 @@ get_highest_refresh_rate_mode(struct amdgpu_dm_connector *aconnector,
|
||||
&aconnector->base.probed_modes :
|
||||
&aconnector->base.modes;
|
||||
|
||||
if (aconnector->base.connector_type == DRM_MODE_CONNECTOR_WRITEBACK)
|
||||
return NULL;
|
||||
|
||||
if (aconnector->freesync_vid_base.clock != 0)
|
||||
return &aconnector->freesync_vid_base;
|
||||
|
||||
@ -6306,20 +6326,17 @@ create_stream_for_sink(struct drm_connector *connector,
|
||||
if (stream->signal == SIGNAL_TYPE_HDMI_TYPE_A)
|
||||
mod_build_hf_vsif_infopacket(stream, &stream->vsp_infopacket);
|
||||
|
||||
if (stream->link->psr_settings.psr_feature_enabled || stream->link->replay_settings.replay_feature_enabled) {
|
||||
if (stream->signal == SIGNAL_TYPE_DISPLAY_PORT ||
|
||||
stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST ||
|
||||
stream->signal == SIGNAL_TYPE_EDP) {
|
||||
//
|
||||
// should decide stream support vsc sdp colorimetry capability
|
||||
// before building vsc info packet
|
||||
//
|
||||
stream->use_vsc_sdp_for_colorimetry = false;
|
||||
if (aconnector->dc_sink->sink_signal == SIGNAL_TYPE_DISPLAY_PORT_MST) {
|
||||
stream->use_vsc_sdp_for_colorimetry =
|
||||
aconnector->dc_sink->is_vsc_sdp_colorimetry_supported;
|
||||
} else {
|
||||
if (stream->link->dpcd_caps.dprx_feature.bits.VSC_SDP_COLORIMETRY_SUPPORTED)
|
||||
stream->use_vsc_sdp_for_colorimetry = true;
|
||||
}
|
||||
if (stream->out_transfer_func->tf == TRANSFER_FUNCTION_GAMMA22)
|
||||
stream->use_vsc_sdp_for_colorimetry = stream->link->dpcd_caps.dpcd_rev.raw >= 0x14 &&
|
||||
stream->link->dpcd_caps.dprx_feature.bits.VSC_SDP_COLORIMETRY_SUPPORTED;
|
||||
|
||||
if (stream->out_transfer_func.tf == TRANSFER_FUNCTION_GAMMA22)
|
||||
tf = TRANSFER_FUNC_GAMMA_22;
|
||||
mod_build_vsc_infopacket(stream, &stream->vsc_infopacket, stream->output_color_space, tf);
|
||||
aconnector->psr_skip_count = AMDGPU_DM_PSR_ENTRY_DELAY;
|
||||
@ -6790,7 +6807,7 @@ static enum dc_status dm_validate_stream_and_context(struct dc *dc,
|
||||
if (!dc_plane_state)
|
||||
goto cleanup;
|
||||
|
||||
dc_state = dc_state_create(dc);
|
||||
dc_state = dc_state_create(dc, NULL);
|
||||
if (!dc_state)
|
||||
goto cleanup;
|
||||
|
||||
@ -8392,13 +8409,13 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state,
|
||||
|
||||
bundle->surface_updates[planes_count].surface = dc_plane;
|
||||
if (new_pcrtc_state->color_mgmt_changed) {
|
||||
bundle->surface_updates[planes_count].gamma = dc_plane->gamma_correction;
|
||||
bundle->surface_updates[planes_count].in_transfer_func = dc_plane->in_transfer_func;
|
||||
bundle->surface_updates[planes_count].gamma = &dc_plane->gamma_correction;
|
||||
bundle->surface_updates[planes_count].in_transfer_func = &dc_plane->in_transfer_func;
|
||||
bundle->surface_updates[planes_count].gamut_remap_matrix = &dc_plane->gamut_remap_matrix;
|
||||
bundle->surface_updates[planes_count].hdr_mult = dc_plane->hdr_mult;
|
||||
bundle->surface_updates[planes_count].func_shaper = dc_plane->in_shaper_func;
|
||||
bundle->surface_updates[planes_count].lut3d_func = dc_plane->lut3d_func;
|
||||
bundle->surface_updates[planes_count].blend_tf = dc_plane->blend_tf;
|
||||
bundle->surface_updates[planes_count].func_shaper = &dc_plane->in_shaper_func;
|
||||
bundle->surface_updates[planes_count].lut3d_func = &dc_plane->lut3d_func;
|
||||
bundle->surface_updates[planes_count].blend_tf = &dc_plane->blend_tf;
|
||||
}
|
||||
|
||||
amdgpu_dm_plane_fill_dc_scaling_info(dm->adev, new_plane_state,
|
||||
@ -8611,7 +8628,7 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state,
|
||||
bundle->stream_update.output_csc_transform =
|
||||
&acrtc_state->stream->csc_color_matrix;
|
||||
bundle->stream_update.out_transfer_func =
|
||||
acrtc_state->stream->out_transfer_func;
|
||||
&acrtc_state->stream->out_transfer_func;
|
||||
bundle->stream_update.lut3d_func =
|
||||
(struct dc_3dlut *) acrtc_state->stream->lut3d_func;
|
||||
bundle->stream_update.func_shaper =
|
||||
@ -8762,10 +8779,10 @@ static void amdgpu_dm_commit_audio(struct drm_device *dev,
|
||||
if (!drm_atomic_crtc_needs_modeset(new_crtc_state))
|
||||
continue;
|
||||
|
||||
notify:
|
||||
if (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK)
|
||||
continue;
|
||||
|
||||
notify:
|
||||
aconnector = to_amdgpu_dm_connector(connector);
|
||||
|
||||
mutex_lock(&adev->dm.audio_lock);
|
||||
@ -8845,6 +8862,7 @@ static void amdgpu_dm_commit_streams(struct drm_atomic_state *state,
|
||||
struct drm_connector *connector;
|
||||
bool mode_set_reset_required = false;
|
||||
u32 i;
|
||||
struct dc_commit_streams_params params = {dc_state->streams, dc_state->stream_count};
|
||||
|
||||
/* Disable writeback */
|
||||
for_each_old_connector_in_state(state, connector, old_con_state, i) {
|
||||
@ -8981,7 +8999,7 @@ static void amdgpu_dm_commit_streams(struct drm_atomic_state *state,
|
||||
|
||||
dm_enable_per_frame_crtc_master_sync(dc_state);
|
||||
mutex_lock(&dm->dc_lock);
|
||||
WARN_ON(!dc_commit_streams(dm->dc, dc_state->streams, dc_state->stream_count));
|
||||
WARN_ON(!dc_commit_streams(dm->dc, ¶ms));
|
||||
|
||||
/* Allow idle optimization when vblank count is 0 for display off */
|
||||
if (dm->active_vblank_irq_count == 0)
|
||||
|
@ -571,7 +571,7 @@ static int amdgpu_dm_set_atomic_regamma(struct dc_stream_state *stream,
|
||||
uint32_t regamma_size, bool has_rom,
|
||||
enum dc_transfer_func_predefined tf)
|
||||
{
|
||||
struct dc_transfer_func *out_tf = stream->out_transfer_func;
|
||||
struct dc_transfer_func *out_tf = &stream->out_transfer_func;
|
||||
int ret = 0;
|
||||
|
||||
if (regamma_size || tf != TRANSFER_FUNCTION_LINEAR) {
|
||||
@ -954,8 +954,8 @@ int amdgpu_dm_update_crtc_color_mgmt(struct dm_crtc_state *crtc)
|
||||
* inverse color ramp in legacy userspace.
|
||||
*/
|
||||
crtc->cm_is_degamma_srgb = true;
|
||||
stream->out_transfer_func->type = TF_TYPE_DISTRIBUTED_POINTS;
|
||||
stream->out_transfer_func->tf = TRANSFER_FUNCTION_SRGB;
|
||||
stream->out_transfer_func.type = TF_TYPE_DISTRIBUTED_POINTS;
|
||||
stream->out_transfer_func.tf = TRANSFER_FUNCTION_SRGB;
|
||||
/*
|
||||
* Note: although we pass has_rom as parameter here, we never
|
||||
* actually use ROM because the color module only takes the ROM
|
||||
@ -963,7 +963,7 @@ int amdgpu_dm_update_crtc_color_mgmt(struct dm_crtc_state *crtc)
|
||||
*
|
||||
* See more in mod_color_calculate_regamma_params()
|
||||
*/
|
||||
r = __set_legacy_tf(stream->out_transfer_func, regamma_lut,
|
||||
r = __set_legacy_tf(&stream->out_transfer_func, regamma_lut,
|
||||
regamma_size, has_rom);
|
||||
if (r)
|
||||
return r;
|
||||
@ -1034,7 +1034,7 @@ map_crtc_degamma_to_dc_plane(struct dm_crtc_state *crtc,
|
||||
°amma_size);
|
||||
ASSERT(degamma_size == MAX_COLOR_LUT_ENTRIES);
|
||||
|
||||
dc_plane_state->in_transfer_func->type = TF_TYPE_DISTRIBUTED_POINTS;
|
||||
dc_plane_state->in_transfer_func.type = TF_TYPE_DISTRIBUTED_POINTS;
|
||||
|
||||
/*
|
||||
* This case isn't fully correct, but also fairly
|
||||
@ -1061,12 +1061,12 @@ map_crtc_degamma_to_dc_plane(struct dm_crtc_state *crtc,
|
||||
* map these to the atomic one instead.
|
||||
*/
|
||||
if (crtc->cm_is_degamma_srgb)
|
||||
dc_plane_state->in_transfer_func->tf = tf;
|
||||
dc_plane_state->in_transfer_func.tf = tf;
|
||||
else
|
||||
dc_plane_state->in_transfer_func->tf =
|
||||
dc_plane_state->in_transfer_func.tf =
|
||||
TRANSFER_FUNCTION_LINEAR;
|
||||
|
||||
r = __set_input_tf(caps, dc_plane_state->in_transfer_func,
|
||||
r = __set_input_tf(caps, &dc_plane_state->in_transfer_func,
|
||||
degamma_lut, degamma_size);
|
||||
if (r)
|
||||
return r;
|
||||
@ -1075,12 +1075,12 @@ map_crtc_degamma_to_dc_plane(struct dm_crtc_state *crtc,
|
||||
* For legacy gamma support we need the regamma input
|
||||
* in linear space. Assume that the input is sRGB.
|
||||
*/
|
||||
dc_plane_state->in_transfer_func->type = TF_TYPE_PREDEFINED;
|
||||
dc_plane_state->in_transfer_func->tf = tf;
|
||||
dc_plane_state->in_transfer_func.type = TF_TYPE_PREDEFINED;
|
||||
dc_plane_state->in_transfer_func.tf = tf;
|
||||
|
||||
if (tf != TRANSFER_FUNCTION_SRGB &&
|
||||
!mod_color_calculate_degamma_params(caps,
|
||||
dc_plane_state->in_transfer_func,
|
||||
&dc_plane_state->in_transfer_func,
|
||||
NULL, false))
|
||||
return -ENOMEM;
|
||||
}
|
||||
@ -1114,24 +1114,24 @@ __set_dm_plane_degamma(struct drm_plane_state *plane_state,
|
||||
if (!has_degamma_lut && tf == AMDGPU_TRANSFER_FUNCTION_DEFAULT)
|
||||
return -EINVAL;
|
||||
|
||||
dc_plane_state->in_transfer_func->tf = amdgpu_tf_to_dc_tf(tf);
|
||||
dc_plane_state->in_transfer_func.tf = amdgpu_tf_to_dc_tf(tf);
|
||||
|
||||
if (has_degamma_lut) {
|
||||
ASSERT(degamma_size == MAX_COLOR_LUT_ENTRIES);
|
||||
|
||||
dc_plane_state->in_transfer_func->type =
|
||||
dc_plane_state->in_transfer_func.type =
|
||||
TF_TYPE_DISTRIBUTED_POINTS;
|
||||
|
||||
ret = __set_input_tf(color_caps, dc_plane_state->in_transfer_func,
|
||||
ret = __set_input_tf(color_caps, &dc_plane_state->in_transfer_func,
|
||||
degamma_lut, degamma_size);
|
||||
if (ret)
|
||||
return ret;
|
||||
} else {
|
||||
dc_plane_state->in_transfer_func->type =
|
||||
dc_plane_state->in_transfer_func.type =
|
||||
TF_TYPE_PREDEFINED;
|
||||
|
||||
if (!mod_color_calculate_degamma_params(color_caps,
|
||||
dc_plane_state->in_transfer_func, NULL, false))
|
||||
&dc_plane_state->in_transfer_func, NULL, false))
|
||||
return -ENOMEM;
|
||||
}
|
||||
return 0;
|
||||
@ -1156,11 +1156,11 @@ amdgpu_dm_plane_set_color_properties(struct drm_plane_state *plane_state,
|
||||
lut3d = __extract_blob_lut(dm_plane_state->lut3d, &lut3d_size);
|
||||
lut3d_size = lut3d != NULL ? lut3d_size : 0;
|
||||
|
||||
amdgpu_dm_atomic_lut3d(lut3d, lut3d_size, dc_plane_state->lut3d_func);
|
||||
amdgpu_dm_atomic_lut3d(lut3d, lut3d_size, &dc_plane_state->lut3d_func);
|
||||
ret = amdgpu_dm_atomic_shaper_lut(shaper_lut, false,
|
||||
amdgpu_tf_to_dc_tf(shaper_tf),
|
||||
shaper_size,
|
||||
dc_plane_state->in_shaper_func);
|
||||
&dc_plane_state->in_shaper_func);
|
||||
if (ret) {
|
||||
drm_dbg_kms(plane_state->plane->dev,
|
||||
"setting plane %d shaper LUT failed.\n",
|
||||
@ -1175,7 +1175,7 @@ amdgpu_dm_plane_set_color_properties(struct drm_plane_state *plane_state,
|
||||
|
||||
ret = amdgpu_dm_atomic_blend_lut(blend_lut, false,
|
||||
amdgpu_tf_to_dc_tf(blend_tf),
|
||||
blend_size, dc_plane_state->blend_tf);
|
||||
blend_size, &dc_plane_state->blend_tf);
|
||||
if (ret) {
|
||||
drm_dbg_kms(plane_state->plane->dev,
|
||||
"setting plane %d gamma lut failed.\n",
|
||||
@ -1221,8 +1221,8 @@ int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc,
|
||||
color_caps = &dc_plane_state->ctx->dc->caps.color;
|
||||
|
||||
/* Initially, we can just bypass the DGM block. */
|
||||
dc_plane_state->in_transfer_func->type = TF_TYPE_BYPASS;
|
||||
dc_plane_state->in_transfer_func->tf = TRANSFER_FUNCTION_LINEAR;
|
||||
dc_plane_state->in_transfer_func.type = TF_TYPE_BYPASS;
|
||||
dc_plane_state->in_transfer_func.tf = TRANSFER_FUNCTION_LINEAR;
|
||||
|
||||
/* After, we start to update values according to color props */
|
||||
has_crtc_cm_degamma = (crtc->cm_has_degamma || crtc->cm_is_degamma_srgb);
|
||||
|
@ -791,25 +791,12 @@ struct dsc_mst_fairness_params {
|
||||
struct amdgpu_dm_connector *aconnector;
|
||||
};
|
||||
|
||||
static uint16_t get_fec_overhead_multiplier(struct dc_link *dc_link)
|
||||
{
|
||||
u8 link_coding_cap;
|
||||
uint16_t fec_overhead_multiplier_x1000 = PBN_FEC_OVERHEAD_MULTIPLIER_8B_10B;
|
||||
|
||||
link_coding_cap = dc_link_dp_mst_decide_link_encoding_format(dc_link);
|
||||
if (link_coding_cap == DP_128b_132b_ENCODING)
|
||||
fec_overhead_multiplier_x1000 = PBN_FEC_OVERHEAD_MULTIPLIER_128B_132B;
|
||||
|
||||
return fec_overhead_multiplier_x1000;
|
||||
}
|
||||
|
||||
static int kbps_to_peak_pbn(int kbps, uint16_t fec_overhead_multiplier_x1000)
|
||||
static int kbps_to_peak_pbn(int kbps)
|
||||
{
|
||||
u64 peak_kbps = kbps;
|
||||
|
||||
peak_kbps *= 1006;
|
||||
peak_kbps *= fec_overhead_multiplier_x1000;
|
||||
peak_kbps = div_u64(peak_kbps, 1000 * 1000);
|
||||
peak_kbps = div_u64(peak_kbps, 1000);
|
||||
return (int) DIV64_U64_ROUND_UP(peak_kbps * 64, (54 * 8 * 1000));
|
||||
}
|
||||
|
||||
@ -910,12 +897,11 @@ static int increase_dsc_bpp(struct drm_atomic_state *state,
|
||||
int link_timeslots_used;
|
||||
int fair_pbn_alloc;
|
||||
int ret = 0;
|
||||
uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link);
|
||||
|
||||
for (i = 0; i < count; i++) {
|
||||
if (vars[i + k].dsc_enabled) {
|
||||
initial_slack[i] =
|
||||
kbps_to_peak_pbn(params[i].bw_range.max_kbps, fec_overhead_multiplier_x1000) - vars[i + k].pbn;
|
||||
kbps_to_peak_pbn(params[i].bw_range.max_kbps) - vars[i + k].pbn;
|
||||
bpp_increased[i] = false;
|
||||
remaining_to_increase += 1;
|
||||
} else {
|
||||
@ -1011,7 +997,6 @@ static int try_disable_dsc(struct drm_atomic_state *state,
|
||||
int next_index;
|
||||
int remaining_to_try = 0;
|
||||
int ret;
|
||||
uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link);
|
||||
|
||||
for (i = 0; i < count; i++) {
|
||||
if (vars[i + k].dsc_enabled
|
||||
@ -1041,7 +1026,7 @@ static int try_disable_dsc(struct drm_atomic_state *state,
|
||||
if (next_index == -1)
|
||||
break;
|
||||
|
||||
vars[next_index].pbn = kbps_to_peak_pbn(params[next_index].bw_range.stream_kbps, fec_overhead_multiplier_x1000);
|
||||
vars[next_index].pbn = kbps_to_peak_pbn(params[next_index].bw_range.stream_kbps);
|
||||
ret = drm_dp_atomic_find_time_slots(state,
|
||||
params[next_index].port->mgr,
|
||||
params[next_index].port,
|
||||
@ -1054,7 +1039,8 @@ static int try_disable_dsc(struct drm_atomic_state *state,
|
||||
vars[next_index].dsc_enabled = false;
|
||||
vars[next_index].bpp_x16 = 0;
|
||||
} else {
|
||||
vars[next_index].pbn = kbps_to_peak_pbn(params[next_index].bw_range.max_kbps, fec_overhead_multiplier_x1000);
|
||||
vars[next_index].pbn = kbps_to_peak_pbn(
|
||||
params[next_index].bw_range.max_kbps);
|
||||
ret = drm_dp_atomic_find_time_slots(state,
|
||||
params[next_index].port->mgr,
|
||||
params[next_index].port,
|
||||
@ -1083,7 +1069,6 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state,
|
||||
int count = 0;
|
||||
int i, k, ret;
|
||||
bool debugfs_overwrite = false;
|
||||
uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link);
|
||||
|
||||
memset(params, 0, sizeof(params));
|
||||
|
||||
@ -1148,7 +1133,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state,
|
||||
/* Try no compression */
|
||||
for (i = 0; i < count; i++) {
|
||||
vars[i + k].aconnector = params[i].aconnector;
|
||||
vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps, fec_overhead_multiplier_x1000);
|
||||
vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps);
|
||||
vars[i + k].dsc_enabled = false;
|
||||
vars[i + k].bpp_x16 = 0;
|
||||
ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr, params[i].port,
|
||||
@ -1167,7 +1152,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state,
|
||||
/* Try max compression */
|
||||
for (i = 0; i < count; i++) {
|
||||
if (params[i].compression_possible && params[i].clock_force_enable != DSC_CLK_FORCE_DISABLE) {
|
||||
vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.min_kbps, fec_overhead_multiplier_x1000);
|
||||
vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.min_kbps);
|
||||
vars[i + k].dsc_enabled = true;
|
||||
vars[i + k].bpp_x16 = params[i].bw_range.min_target_bpp_x16;
|
||||
ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr,
|
||||
@ -1175,7 +1160,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state,
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
} else {
|
||||
vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps, fec_overhead_multiplier_x1000);
|
||||
vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps);
|
||||
vars[i + k].dsc_enabled = false;
|
||||
vars[i + k].bpp_x16 = 0;
|
||||
ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr,
|
||||
@ -1601,7 +1586,7 @@ enum dc_status dm_dp_mst_is_port_support_mode(
|
||||
struct amdgpu_dm_connector *aconnector,
|
||||
struct dc_stream_state *stream)
|
||||
{
|
||||
int bpp, pbn, branch_max_throughput_mps = 0;
|
||||
int pbn, branch_max_throughput_mps = 0;
|
||||
struct dc_link_settings cur_link_settings;
|
||||
unsigned int end_to_end_bw_in_kbps = 0;
|
||||
unsigned int upper_link_bw_in_kbps = 0, down_link_bw_in_kbps = 0;
|
||||
@ -1651,11 +1636,34 @@ enum dc_status dm_dp_mst_is_port_support_mode(
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* check if mode could be supported within full_pbn */
|
||||
bpp = convert_dc_color_depth_into_bpc(stream->timing.display_color_depth) * 3;
|
||||
pbn = drm_dp_calc_pbn_mode(stream->timing.pix_clk_100hz / 10, bpp << 4);
|
||||
if (pbn > aconnector->mst_output_port->full_pbn)
|
||||
/* Check if mode could be supported within max slot
|
||||
* number of current mst link and full_pbn of mst links.
|
||||
*/
|
||||
int pbn_div, slot_num, max_slot_num;
|
||||
enum dc_link_encoding_format link_encoding;
|
||||
uint32_t stream_kbps =
|
||||
dc_bandwidth_in_kbps_from_timing(&stream->timing,
|
||||
dc_link_get_highest_encoding_format(stream->link));
|
||||
|
||||
pbn = kbps_to_peak_pbn(stream_kbps);
|
||||
pbn_div = dm_mst_get_pbn_divider(stream->link);
|
||||
slot_num = DIV_ROUND_UP(pbn, pbn_div);
|
||||
|
||||
link_encoding = dc_link_get_highest_encoding_format(stream->link);
|
||||
if (link_encoding == DC_LINK_ENCODING_DP_8b_10b)
|
||||
max_slot_num = 63;
|
||||
else if (link_encoding == DC_LINK_ENCODING_DP_128b_132b)
|
||||
max_slot_num = 64;
|
||||
else {
|
||||
DRM_DEBUG_DRIVER("Invalid link encoding format\n");
|
||||
return DC_FAIL_BANDWIDTH_VALIDATE;
|
||||
}
|
||||
|
||||
if (slot_num > max_slot_num ||
|
||||
pbn > aconnector->mst_output_port->full_pbn) {
|
||||
DRM_DEBUG_DRIVER("Mode can not be supported within mst links!");
|
||||
return DC_FAIL_BANDWIDTH_VALIDATE;
|
||||
}
|
||||
}
|
||||
|
||||
/* check is mst dsc output bandwidth branch_overall_throughput_0_mps */
|
||||
|
@ -46,9 +46,6 @@
|
||||
#define SYNAPTICS_CASCADED_HUB_ID 0x5A
|
||||
#define IS_SYNAPTICS_CASCADED_PANAMERA(devName, data) ((IS_SYNAPTICS_PANAMERA(devName) && ((int)data[2] == SYNAPTICS_CASCADED_HUB_ID)) ? 1 : 0)
|
||||
|
||||
#define PBN_FEC_OVERHEAD_MULTIPLIER_8B_10B 1031
|
||||
#define PBN_FEC_OVERHEAD_MULTIPLIER_128B_132B 1000
|
||||
|
||||
enum mst_msg_ready_type {
|
||||
NONE_MSG_RDY_EVENT = 0,
|
||||
DOWN_REP_MSG_RDY_EVENT = 1,
|
||||
|
@ -212,7 +212,7 @@ bool amdgpu_dm_psr_disable(struct dc_stream_state *stream)
|
||||
}
|
||||
|
||||
/*
|
||||
* amdgpu_dm_psr_disable() - disable psr f/w
|
||||
* amdgpu_dm_psr_disable_all() - disable psr f/w for all streams
|
||||
* if psr is enabled on any stream
|
||||
*
|
||||
* Return: true if success
|
||||
|
@ -52,4 +52,12 @@ void dm_perf_trace_timestamp(const char *func_name, unsigned int line, struct dc
|
||||
func_name, line);
|
||||
}
|
||||
|
||||
void dm_trace_smu_msg(uint32_t msg_id, uint32_t param_in, struct dc_context *ctx)
|
||||
{
|
||||
}
|
||||
|
||||
void dm_trace_smu_delay(uint32_t delay, struct dc_context *ctx)
|
||||
{
|
||||
}
|
||||
|
||||
/**** power component interfaces ****/
|
||||
|
@ -76,10 +76,8 @@ static int amdgpu_dm_wb_encoder_atomic_check(struct drm_encoder *encoder,
|
||||
|
||||
static int amdgpu_dm_wb_connector_get_modes(struct drm_connector *connector)
|
||||
{
|
||||
struct drm_device *dev = connector->dev;
|
||||
|
||||
return drm_add_modes_noedid(connector, dev->mode_config.max_width,
|
||||
dev->mode_config.max_height);
|
||||
/* Maximum resolution supported by DWB */
|
||||
return drm_add_modes_noedid(connector, 3840, 2160);
|
||||
}
|
||||
|
||||
static int amdgpu_dm_wb_prepare_job(struct drm_writeback_connector *wb_connector,
|
||||
|
@ -22,7 +22,7 @@
|
||||
#
|
||||
# Makefile for Display Core (dc) component.
|
||||
|
||||
DC_LIBS = basics bios dml clk_mgr dce gpio hwss irq link virtual dsc resource optc
|
||||
DC_LIBS = basics bios dml clk_mgr dce gpio hwss irq link virtual dsc resource optc dpp
|
||||
|
||||
ifdef CONFIG_DRM_AMD_DC_FP
|
||||
|
||||
|
@ -44,8 +44,6 @@
|
||||
|
||||
#include "bios_parser_common.h"
|
||||
|
||||
#include "dc.h"
|
||||
|
||||
#define THREE_PERCENT_OF_10000 300
|
||||
|
||||
#define LAST_RECORD_TYPE 0xff
|
||||
@ -1731,6 +1729,7 @@ static uint32_t get_ss_entry_number_from_internal_ss_info_tbl_v2_1(
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* get_ss_entry_number_from_internal_ss_info_tbl_V3_1
|
||||
* Get Number of SpreadSpectrum Entry from the ASIC_InternalSS_Info table of
|
||||
|
@ -1594,8 +1594,6 @@ static bool bios_parser_is_device_id_supported(
|
||||
return (le16_to_cpu(bp->object_info_tbl.v1_5->supporteddevices) & mask) != 0;
|
||||
break;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static uint32_t bios_parser_get_ss_entry_number(
|
||||
|
@ -329,15 +329,14 @@ struct clk_mgr *dc_clk_mgr_create(struct dc_context *ctx, struct pp_smu_funcs *p
|
||||
}
|
||||
break;
|
||||
case AMDGPU_FAMILY_GC_11_0_0: {
|
||||
struct clk_mgr_internal *clk_mgr = kzalloc(sizeof(*clk_mgr), GFP_KERNEL);
|
||||
struct clk_mgr_internal *clk_mgr = kzalloc(sizeof(*clk_mgr), GFP_KERNEL);
|
||||
|
||||
if (clk_mgr == NULL) {
|
||||
BREAK_TO_DEBUGGER();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
dcn32_clk_mgr_construct(ctx, clk_mgr, pp_smu, dccg);
|
||||
return &clk_mgr->base;
|
||||
if (clk_mgr == NULL) {
|
||||
BREAK_TO_DEBUGGER();
|
||||
return NULL;
|
||||
}
|
||||
dcn32_clk_mgr_construct(ctx, clk_mgr, pp_smu, dccg);
|
||||
return &clk_mgr->base;
|
||||
}
|
||||
|
||||
case AMDGPU_FAMILY_GC_11_0_1: {
|
||||
|
@ -131,8 +131,8 @@ int dce_get_dp_ref_freq_khz(struct clk_mgr *clk_mgr_base)
|
||||
struct clk_mgr_internal *clk_mgr = TO_CLK_MGR_INTERNAL(clk_mgr_base);
|
||||
int dprefclk_wdivider;
|
||||
int dprefclk_src_sel;
|
||||
int dp_ref_clk_khz;
|
||||
int target_div = 600000;
|
||||
int dp_ref_clk_khz = 600000;
|
||||
int target_div;
|
||||
|
||||
/* ASSERT DP Reference Clock source is from DFS*/
|
||||
REG_GET(DPREFCLK_CNTL, DPREFCLK_SRC_SEL, &dprefclk_src_sel);
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user