mirror of
https://github.com/torvalds/linux.git
synced 2024-12-29 14:21:47 +00:00
drm/amdgpu: Support passing amdgpu critical error to host via GPU Mailbox.
This feature works for the SRIOV environment. For a non-SRIOV environment, the trans_error function does nothing. The error information includes error_code (16bit), error_flags (16bit) and error_data (64bit). Since there are not many errors, we keep the errors in an array and transfer all errors to the host before the amdgpu initialization function (amdgpu_device_init) exits. Signed-off-by: Gavin Wan <Gavin.Wan@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
8e1b90cc44
commit
890419409a
@ -25,7 +25,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
|
||||
amdgpu_prime.o amdgpu_vm.o amdgpu_ib.o amdgpu_pll.o \
|
||||
amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
|
||||
amdgpu_gtt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o amdgpu_atomfirmware.o \
|
||||
amdgpu_queue_mgr.o
|
||||
amdgpu_queue_mgr.o amdgpu_vf_error.o
|
||||
|
||||
# add asic specific block
|
||||
amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o cik_ih.o kv_smc.o kv_dpm.o \
|
||||
|
@ -53,6 +53,7 @@
|
||||
#include "bif/bif_4_1_d.h"
|
||||
#include <linux/pci.h>
|
||||
#include <linux/firmware.h>
|
||||
#include "amdgpu_vf_error.h"
|
||||
|
||||
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
|
||||
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
|
||||
@ -2134,6 +2135,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
r = amdgpu_atombios_init(adev);
|
||||
if (r) {
|
||||
dev_err(adev->dev, "amdgpu_atombios_init failed\n");
|
||||
amdgpu_vf_error_put(AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
|
||||
goto failed;
|
||||
}
|
||||
|
||||
@ -2144,6 +2146,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
if (amdgpu_vpost_needed(adev)) {
|
||||
if (!adev->bios) {
|
||||
dev_err(adev->dev, "no vBIOS found\n");
|
||||
amdgpu_vf_error_put(AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
|
||||
r = -EINVAL;
|
||||
goto failed;
|
||||
}
|
||||
@ -2151,6 +2154,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
|
||||
if (r) {
|
||||
dev_err(adev->dev, "gpu post error!\n");
|
||||
amdgpu_vf_error_put(AMDGIM_ERROR_VF_GPU_POST_ERROR, 0, 0);
|
||||
goto failed;
|
||||
}
|
||||
} else {
|
||||
@ -2162,7 +2166,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
r = amdgpu_atombios_get_clock_info(adev);
|
||||
if (r) {
|
||||
dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
|
||||
return r;
|
||||
amdgpu_vf_error_put(AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
|
||||
goto failed;
|
||||
}
|
||||
/* init i2c buses */
|
||||
amdgpu_atombios_i2c_init(adev);
|
||||
@ -2172,6 +2177,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
r = amdgpu_fence_driver_init(adev);
|
||||
if (r) {
|
||||
dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
|
||||
amdgpu_vf_error_put(AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
|
||||
goto failed;
|
||||
}
|
||||
|
||||
@ -2181,6 +2187,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
r = amdgpu_init(adev);
|
||||
if (r) {
|
||||
dev_err(adev->dev, "amdgpu_init failed\n");
|
||||
amdgpu_vf_error_put(AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
|
||||
amdgpu_fini(adev);
|
||||
goto failed;
|
||||
}
|
||||
@ -2200,6 +2207,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
r = amdgpu_ib_pool_init(adev);
|
||||
if (r) {
|
||||
dev_err(adev->dev, "IB initialization failed (%d).\n", r);
|
||||
amdgpu_vf_error_put(AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
|
||||
goto failed;
|
||||
}
|
||||
|
||||
@ -2244,12 +2252,14 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
r = amdgpu_late_init(adev);
|
||||
if (r) {
|
||||
dev_err(adev->dev, "amdgpu_late_init failed\n");
|
||||
amdgpu_vf_error_put(AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
|
||||
goto failed;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
failed:
|
||||
amdgpu_vf_error_trans_all(adev);
|
||||
if (runtime)
|
||||
vga_switcheroo_fini_domain_pm_ops(adev->dev);
|
||||
return r;
|
||||
@ -2937,6 +2947,7 @@ out:
|
||||
}
|
||||
} else {
|
||||
dev_err(adev->dev, "asic resume failed (%d).\n", r);
|
||||
amdgpu_vf_error_put(AMDGIM_ERROR_VF_ASIC_RESUME_FAIL, 0, r);
|
||||
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
|
||||
if (adev->rings[i] && adev->rings[i]->sched.thread) {
|
||||
kthread_unpark(adev->rings[i]->sched.thread);
|
||||
@ -2947,12 +2958,16 @@ out:
|
||||
drm_helper_resume_force_mode(adev->ddev);
|
||||
|
||||
ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
|
||||
if (r)
|
||||
if (r) {
|
||||
/* bad news, how to tell it to userspace ? */
|
||||
dev_info(adev->dev, "GPU reset failed\n");
|
||||
else
|
||||
amdgpu_vf_error_put(AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
|
||||
}
|
||||
else {
|
||||
dev_info(adev->dev, "GPU reset successed!\n");
|
||||
}
|
||||
|
||||
amdgpu_vf_error_trans_all(adev);
|
||||
return r;
|
||||
}
|
||||
|
||||
|
85
drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c
Normal file
85
drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c
Normal file
@ -0,0 +1,85 @@
|
||||
/*
|
||||
* Copyright 2017 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "amdgpu.h"
|
||||
#include "amdgpu_vf_error.h"
|
||||
#include "mxgpu_ai.h"
|
||||
|
||||
/* Capacity of the VF error ring buffer; older entries are overwritten. */
#define AMDGPU_VF_ERROR_ENTRY_SIZE 16

/*
 * struct amdgpu_vf_error_buffer - ring buffer of amdgpu VF error records.
 * @read_count:  total number of entries already transferred to the host
 * @write_count: total number of entries ever recorded; the slot used for a
 *               new entry is write_count % AMDGPU_VF_ERROR_ENTRY_SIZE
 * @code:        16-bit combined category/sub-error codes (AMDGIM_ERROR_CODE)
 * @flags:       16-bit per-entry error flags
 * @data:        64-bit per-entry error payload
 *
 * NOTE(review): no locking is visible here; concurrent amdgpu_vf_error_put()
 * callers could race on write_count — confirm callers are serialized.
 */
struct amdgpu_vf_error_buffer {
	int read_count;
	int write_count;
	uint16_t code[AMDGPU_VF_ERROR_ENTRY_SIZE];
	uint16_t flags[AMDGPU_VF_ERROR_ENTRY_SIZE];
	uint64_t data[AMDGPU_VF_ERROR_ENTRY_SIZE];
};

/* File-global error store shared by the put/trans functions below.
 * NOTE(review): "admgpu" looks like a typo for "amdgpu" — kept as-is since
 * both users in this file reference this exact spelling. */
struct amdgpu_vf_error_buffer admgpu_vf_errors;
|
||||
|
||||
|
||||
void amdgpu_vf_error_put(uint16_t sub_error_code, uint16_t error_flags, uint64_t error_data)
|
||||
{
|
||||
int index;
|
||||
uint16_t error_code = AMDGIM_ERROR_CODE(AMDGIM_ERROR_CATEGORY_VF, sub_error_code);
|
||||
|
||||
index = admgpu_vf_errors.write_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
|
||||
admgpu_vf_errors.code [index] = error_code;
|
||||
admgpu_vf_errors.flags [index] = error_flags;
|
||||
admgpu_vf_errors.data [index] = error_data;
|
||||
admgpu_vf_errors.write_count ++;
|
||||
}
|
||||
|
||||
|
||||
void amdgpu_vf_error_trans_all(struct amdgpu_device *adev)
|
||||
{
|
||||
/* u32 pf2vf_flags = 0; */
|
||||
u32 data1, data2, data3;
|
||||
int index;
|
||||
|
||||
if ((NULL == adev) || (!amdgpu_sriov_vf(adev)) || (!adev->virt.ops) || (!adev->virt.ops->trans_msg)) {
|
||||
return;
|
||||
}
|
||||
/*
|
||||
TODO: Enable these code when pv2vf_info is merged
|
||||
AMDGPU_FW_VRAM_PF2VF_READ (adev, feature_flags, &pf2vf_flags);
|
||||
if (!(pf2vf_flags & AMDGIM_FEATURE_ERROR_LOG_COLLECT)) {
|
||||
return;
|
||||
}
|
||||
*/
|
||||
/* The errors are overlay of array, correct read_count as full. */
|
||||
if (admgpu_vf_errors.write_count - admgpu_vf_errors.read_count > AMDGPU_VF_ERROR_ENTRY_SIZE) {
|
||||
admgpu_vf_errors.read_count = admgpu_vf_errors.write_count - AMDGPU_VF_ERROR_ENTRY_SIZE;
|
||||
}
|
||||
|
||||
while (admgpu_vf_errors.read_count < admgpu_vf_errors.write_count) {
|
||||
index =admgpu_vf_errors.read_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
|
||||
data1 = AMDGIM_ERROR_CODE_FLAGS_TO_MAILBOX (admgpu_vf_errors.code[index], admgpu_vf_errors.flags[index]);
|
||||
data2 = admgpu_vf_errors.data[index] & 0xFFFFFFFF;
|
||||
data3 = (admgpu_vf_errors.data[index] >> 32) & 0xFFFFFFFF;
|
||||
|
||||
adev->virt.ops->trans_msg(adev, IDH_LOG_VF_ERROR, data1, data2, data3);
|
||||
admgpu_vf_errors.read_count ++;
|
||||
}
|
||||
}
|
62
drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.h
Normal file
62
drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.h
Normal file
@ -0,0 +1,62 @@
|
||||
/*
|
||||
* Copyright 2017 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __VF_ERROR_H__
#define __VF_ERROR_H__

/*
 * Pack a 16-bit error code and 16-bit flags into one 32-bit mailbox word.
 * Arguments are fully parenthesized so expression arguments (e.g. "a | b")
 * expand correctly (CERT PRE01-C).
 */
#define AMDGIM_ERROR_CODE_FLAGS_TO_MAILBOX(c, f) ((((c) & 0xFFFF) << 16) | ((f) & 0xFFFF))
/* Combine a 4-bit error category (t) with a 12-bit sub-error code (c). */
#define AMDGIM_ERROR_CODE(t, c) ((((t) & 0xF) << 12) | ((c) & 0xFFF))

/* Please keep enum same as AMD GIM driver */
enum AMDGIM_ERROR_VF {
	AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL = 0,
	AMDGIM_ERROR_VF_NO_VBIOS,
	AMDGIM_ERROR_VF_GPU_POST_ERROR,
	AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL,
	AMDGIM_ERROR_VF_FENCE_INIT_FAIL,

	AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL,
	AMDGIM_ERROR_VF_IB_INIT_FAIL,
	AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL,
	AMDGIM_ERROR_VF_ASIC_RESUME_FAIL,
	AMDGIM_ERROR_VF_GPU_RESET_FAIL,

	AMDGIM_ERROR_VF_TEST,
	AMDGIM_ERROR_VF_MAX
};

/* Error-source categories; encoded into the top 4 bits of the error code. */
enum AMDGIM_ERROR_CATEGORY {
	AMDGIM_ERROR_CATEGORY_NON_USED = 0,
	AMDGIM_ERROR_CATEGORY_GIM,
	AMDGIM_ERROR_CATEGORY_PF,
	AMDGIM_ERROR_CATEGORY_VF,
	AMDGIM_ERROR_CATEGORY_VBIOS,
	AMDGIM_ERROR_CATEGORY_MONITOR,

	AMDGIM_ERROR_CATEGORY_MAX
};

/* Record one VF error; see amdgpu_vf_error.c for buffering semantics. */
void amdgpu_vf_error_put(uint16_t sub_error_code, uint16_t error_flags, uint64_t error_data);
/* Flush all buffered VF errors to the host via the GPU mailbox. */
void amdgpu_vf_error_trans_all(struct amdgpu_device *adev);

#endif /* __VF_ERROR_H__ */
|
@ -43,6 +43,7 @@ struct amdgpu_virt_ops {
|
||||
int (*req_full_gpu)(struct amdgpu_device *adev, bool init);
|
||||
int (*rel_full_gpu)(struct amdgpu_device *adev, bool init);
|
||||
int (*reset_gpu)(struct amdgpu_device *adev);
|
||||
void (*trans_msg)(struct amdgpu_device *adev, u32 req, u32 data1, u32 data2, u32 data3);
|
||||
};
|
||||
|
||||
/* GPU virtualization */
|
||||
|
@ -72,21 +72,6 @@ static void xgpu_ai_mailbox_set_valid(struct amdgpu_device *adev, bool val)
|
||||
reg);
|
||||
}
|
||||
|
||||
static void xgpu_ai_mailbox_trans_msg(struct amdgpu_device *adev,
|
||||
enum idh_request req)
|
||||
{
|
||||
u32 reg;
|
||||
|
||||
reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
|
||||
mmBIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW0));
|
||||
reg = REG_SET_FIELD(reg, BIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW0,
|
||||
MSGBUF_DATA, req);
|
||||
WREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW0),
|
||||
reg);
|
||||
|
||||
xgpu_ai_mailbox_set_valid(adev, true);
|
||||
}
|
||||
|
||||
static int xgpu_ai_mailbox_rcv_msg(struct amdgpu_device *adev,
|
||||
enum idh_event event)
|
||||
{
|
||||
@ -154,13 +139,25 @@ static int xgpu_ai_poll_msg(struct amdgpu_device *adev, enum idh_event event)
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
static int xgpu_ai_send_access_requests(struct amdgpu_device *adev,
|
||||
enum idh_request req)
|
||||
{
|
||||
static void xgpu_ai_mailbox_trans_msg (struct amdgpu_device *adev,
|
||||
enum idh_request req, u32 data1, u32 data2, u32 data3) {
|
||||
u32 reg;
|
||||
int r;
|
||||
|
||||
xgpu_ai_mailbox_trans_msg(adev, req);
|
||||
reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
|
||||
mmBIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW0));
|
||||
reg = REG_SET_FIELD(reg, BIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW0,
|
||||
MSGBUF_DATA, req);
|
||||
WREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW0),
|
||||
reg);
|
||||
WREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW1),
|
||||
data1);
|
||||
WREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW2),
|
||||
data2);
|
||||
WREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW3),
|
||||
data3);
|
||||
|
||||
xgpu_ai_mailbox_set_valid(adev, true);
|
||||
|
||||
/* start to poll ack */
|
||||
r = xgpu_ai_poll_ack(adev);
|
||||
@ -168,6 +165,14 @@ static int xgpu_ai_send_access_requests(struct amdgpu_device *adev,
|
||||
pr_err("Doesn't get ack from pf, continue\n");
|
||||
|
||||
xgpu_ai_mailbox_set_valid(adev, false);
|
||||
}
|
||||
|
||||
static int xgpu_ai_send_access_requests(struct amdgpu_device *adev,
|
||||
enum idh_request req)
|
||||
{
|
||||
int r;
|
||||
|
||||
xgpu_ai_mailbox_trans_msg(adev, req, 0, 0, 0);
|
||||
|
||||
/* start to check msg if request is idh_req_gpu_init_access */
|
||||
if (req == IDH_REQ_GPU_INIT_ACCESS ||
|
||||
@ -342,4 +347,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
|
||||
.req_full_gpu = xgpu_ai_request_full_gpu_access,
|
||||
.rel_full_gpu = xgpu_ai_release_full_gpu_access,
|
||||
.reset_gpu = xgpu_ai_request_reset,
|
||||
.trans_msg = xgpu_ai_mailbox_trans_msg,
|
||||
};
|
||||
|
@ -31,7 +31,9 @@ enum idh_request {
|
||||
IDH_REL_GPU_INIT_ACCESS,
|
||||
IDH_REQ_GPU_FINI_ACCESS,
|
||||
IDH_REL_GPU_FINI_ACCESS,
|
||||
IDH_REQ_GPU_RESET_ACCESS
|
||||
IDH_REQ_GPU_RESET_ACCESS,
|
||||
|
||||
IDH_LOG_VF_ERROR = 200,
|
||||
};
|
||||
|
||||
enum idh_event {
|
||||
|
@ -613,4 +613,5 @@ const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
|
||||
.req_full_gpu = xgpu_vi_request_full_gpu_access,
|
||||
.rel_full_gpu = xgpu_vi_release_full_gpu_access,
|
||||
.reset_gpu = xgpu_vi_request_reset,
|
||||
.trans_msg = NULL, /* Does not need to trans VF errors to host. */
|
||||
};
|
||||
|
@ -32,7 +32,9 @@ enum idh_request {
|
||||
IDH_REL_GPU_INIT_ACCESS,
|
||||
IDH_REQ_GPU_FINI_ACCESS,
|
||||
IDH_REL_GPU_FINI_ACCESS,
|
||||
IDH_REQ_GPU_RESET_ACCESS
|
||||
IDH_REQ_GPU_RESET_ACCESS,
|
||||
|
||||
IDH_LOG_VF_ERROR = 200,
|
||||
};
|
||||
|
||||
/* VI mailbox messages data */
|
||||
|
Loading…
Reference in New Issue
Block a user