drm/amdkfd: Add memory exception handling
This patch adds Peripheral Page Request (PPR) failure processing and reporting. A bad address, or a pointer to a system memory block with inappropriate read/write permissions, causes such a PPR failure during user queue processing. PPR handling is done by the IOMMU driver notifying the amdkfd module of the failure. The process that triggered the PPR failure is then notified through the appropriate event, or a SIGTERM signal is sent to it.

v3:
- Change all bool fields in struct kfd_memory_exception_failure to uint32_t

Signed-off-by: Alexey Skidanov <alexey.skidanov@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
commit 59d3e8be87 (parent f3a398183f)
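For reference, the v3 note above concerns the uapi exception payload. As added to include/uapi/linux/kfd_ioctl.h alongside this work, the structures look roughly like the sketch below; the layout is reconstructed from the code in this diff, so treat names and padding as illustrative:

	struct kfd_memory_exception_failure {
		uint32_t NotPresent;	/* failing address is not present or invalid */
		uint32_t ReadOnly;	/* write access to a read-only page */
		uint32_t NoExecute;	/* execute access to a non-executable page */
		uint32_t pad;
	};

	struct kfd_hsa_memory_exception_data {
		struct kfd_memory_exception_failure failure;
		uint64_t va;		/* faulting GPU virtual address */
		uint32_t gpu_id;	/* device that observed the fault */
		uint32_t pad;
	};

Using uint32_t rather than bool (the v3 change) gives the struct a fixed, architecture-independent layout, which matters because it crosses the user/kernel ABI boundary via copy_to_user().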
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -182,6 +182,32 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
 	kfd_unbind_process_from_device(dev, pasid);
 }
 
+/*
+ * This function is called by the IOMMU driver on a PPR failure.
+ */
+static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid,
+		unsigned long address, u16 flags)
+{
+	struct kfd_dev *dev;
+
+	dev_warn(kfd_device,
+			"Invalid PPR device %x:%x.%x pasid %d address 0x%lX flags 0x%X",
+			PCI_BUS_NUM(pdev->devfn),
+			PCI_SLOT(pdev->devfn),
+			PCI_FUNC(pdev->devfn),
+			pasid,
+			address,
+			flags);
+
+	dev = kfd_device_by_pci_dev(pdev);
+	BUG_ON(dev == NULL);
+
+	kfd_signal_iommu_event(dev, pasid, address,
+			flags & PPR_FAULT_WRITE, flags & PPR_FAULT_EXEC);
+
+	return AMD_IOMMU_INV_PRI_RSP_INVALID;
+}
+
 bool kgd2kfd_device_init(struct kfd_dev *kfd,
 		const struct kgd2kfd_shared_resources *gpu_resources)
 {
@@ -251,6 +277,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 	}
 	amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
 			iommu_pasid_shutdown_callback);
+	amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
 
 	kfd->dqm = device_queue_manager_init(kfd);
 	if (!kfd->dqm) {
@@ -316,6 +343,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
 	if (kfd->init_complete) {
 		kfd->dqm->ops.stop(kfd->dqm);
 		amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
+		amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
 		amd_iommu_free_device(kfd->pdev);
 	}
 }
@@ -335,6 +363,7 @@ int kgd2kfd_resume(struct kfd_dev *kfd)
 		return -ENXIO;
 	amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
 			iommu_pasid_shutdown_callback);
+	amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
 	kfd->dqm->ops.start(kfd->dqm);
 }
 
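The three hunks above register and unregister the PPR callback at device init, suspend, and resume. The pairing invariant: the callback must be set whenever the device's IOMMU context is live, and cleared before amd_iommu_free_device() tears that context down. A condensed sketch of that lifecycle, using hypothetical helper names rather than code from this patch:

	/* Sketch only: both callbacks ride on the context created by
	 * amd_iommu_init_device(), so they are bound after init/resume
	 * and unbound before the device is freed at suspend. */
	static void kfd_iommu_bind_callbacks(struct kfd_dev *kfd)
	{
		amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
				iommu_pasid_shutdown_callback);
		amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
	}

	static void kfd_iommu_unbind_callbacks(struct kfd_dev *kfd)
	{
		/* Clear before amd_iommu_free_device() so no callback can
		 * fire into a torn-down device. */
		amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
		amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
	}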
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -30,6 +30,7 @@
 #include <linux/memory.h>
 #include "kfd_priv.h"
 #include "kfd_events.h"
+#include <linux/device.h>
 
 /*
  * A task can only be on a single wait_queue at a time, but we need to support
@@ -45,6 +46,10 @@ struct kfd_event_waiter {
 
 	/* Transitions to true when the event this belongs to is signaled. */
 	bool activated;
+
+	/* Event */
+	struct kfd_event *event;
+	uint32_t input_index;
 };
 
 /*
@@ -609,14 +614,17 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events)
 }
 
 static int init_event_waiter(struct kfd_process *p,
-		struct kfd_event_waiter *waiter,
-		uint32_t event_id)
+		struct kfd_event_waiter *waiter,
+		uint32_t event_id,
+		uint32_t input_index)
 {
 	struct kfd_event *ev = lookup_event_by_id(p, event_id);
 
 	if (!ev)
 		return -EINVAL;
 
+	waiter->event = ev;
+	waiter->input_index = input_index;
 	waiter->activated = ev->signaled;
 	ev->signaled = ev->signaled && !ev->auto_reset;
 
@@ -643,6 +651,38 @@ static bool test_event_condition(bool all, uint32_t num_events,
 	return activated_count == num_events;
 }
 
+/*
+ * Copy event specific data, if defined.
+ * Currently only memory exception events have additional data to copy to user
+ */
+static bool copy_signaled_event_data(uint32_t num_events,
+		struct kfd_event_waiter *event_waiters,
+		struct kfd_event_data __user *data)
+{
+	struct kfd_hsa_memory_exception_data *src;
+	struct kfd_hsa_memory_exception_data __user *dst;
+	struct kfd_event_waiter *waiter;
+	struct kfd_event *event;
+	uint32_t i;
+
+	for (i = 0; i < num_events; i++) {
+		waiter = &event_waiters[i];
+		event = waiter->event;
+		if (waiter->activated && event->type == KFD_EVENT_TYPE_MEMORY) {
+			dst = &data[waiter->input_index].memory_exception_data;
+			src = &event->memory_exception_data;
+			if (copy_to_user(dst, src,
+				sizeof(struct kfd_hsa_memory_exception_data)))
+				return false;
+		}
+	}
+
+	return true;
+
+}
+
 static long user_timeout_to_jiffies(uint32_t user_timeout_ms)
 {
 	if (user_timeout_ms == KFD_EVENT_TIMEOUT_IMMEDIATE)
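copy_signaled_event_data() is why struct kfd_event_waiter gained input_index: waiters are initialized in user-array order, and input_index records which slot of the caller's kfd_event_data array the payload must be written back into. From userspace, the wait ioctl consequently takes an array of struct kfd_event_data rather than bare event IDs. A hypothetical userspace fragment, assuming the usual AMDKFD_IOC_WAIT_EVENTS ioctl and args layout from kfd_ioctl.h (a sketch, not a tested client):

	#include <stdint.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kfd_ioctl.h>

	static int wait_for_memory_event(int kfd_fd, uint32_t event_id)
	{
		struct kfd_event_data data = { .event_id = event_id };
		struct kfd_ioctl_wait_events_args args = {
			.events_ptr = (uint64_t)(uintptr_t)&data,
			.num_events = 1,
			.wait_for_all = 1,
			.timeout = 1000,	/* ms */
		};

		if (ioctl(kfd_fd, AMDKFD_IOC_WAIT_EVENTS, &args) < 0)
			return -1;
		/* args.wait_result distinguishes completion from timeout;
		 * omitted here for brevity. */

		/* For a KFD_EVENT_TYPE_MEMORY event, the kernel filled in
		 * the union member via copy_signaled_event_data(). */
		printf("fault va 0x%llx on gpu %u (NotPresent=%u ReadOnly=%u NoExecute=%u)\n",
				(unsigned long long)data.memory_exception_data.va,
				data.memory_exception_data.gpu_id,
				data.memory_exception_data.failure.NotPresent,
				data.memory_exception_data.failure.ReadOnly,
				data.memory_exception_data.failure.NoExecute);
		return 0;
	}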
@@ -672,10 +712,12 @@ static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters)
 }
 
 int kfd_wait_on_events(struct kfd_process *p,
-		uint32_t num_events, const uint32_t __user *event_ids,
+		uint32_t num_events, void __user *data,
 		bool all, uint32_t user_timeout_ms,
 		enum kfd_event_wait_result *wait_result)
 {
+	struct kfd_event_data __user *events =
+			(struct kfd_event_data __user *) data;
 	uint32_t i;
 	int ret = 0;
 	struct kfd_event_waiter *event_waiters = NULL;
@@ -690,13 +732,14 @@ int kfd_wait_on_events(struct kfd_process *p,
 	}
 
 	for (i = 0; i < num_events; i++) {
-		uint32_t event_id;
+		struct kfd_event_data event_data;
 
-		ret = get_user(event_id, &event_ids[i]);
-		if (ret)
+		if (copy_from_user(&event_data, &events[i],
+				sizeof(struct kfd_event_data)))
 			goto fail;
 
-		ret = init_event_waiter(p, &event_waiters[i], event_id);
+		ret = init_event_waiter(p, &event_waiters[i],
+				event_data.event_id, i);
 		if (ret)
 			goto fail;
 	}
@@ -723,7 +766,11 @@ int kfd_wait_on_events(struct kfd_process *p,
 		}
 
 		if (test_event_condition(all, num_events, event_waiters)) {
-			*wait_result = KFD_WAIT_COMPLETE;
+			if (copy_signaled_event_data(num_events,
+					event_waiters, events))
+				*wait_result = KFD_WAIT_COMPLETE;
+			else
+				*wait_result = KFD_WAIT_ERROR;
 			break;
 		}
 
@@ -797,3 +844,95 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma)
 	return remap_pfn_range(vma, vma->vm_start, pfn,
 			vma->vm_end - vma->vm_start, vma->vm_page_prot);
 }
+
+/*
+ * Assumes that p->event_mutex is held and of course
+ * that p is not going away (current or locked).
+ */
+static void lookup_events_by_type_and_signal(struct kfd_process *p,
+		int type, void *event_data)
+{
+	struct kfd_hsa_memory_exception_data *ev_data;
+	struct kfd_event *ev;
+	int bkt;
+	bool send_signal = true;
+
+	ev_data = (struct kfd_hsa_memory_exception_data *) event_data;
+
+	hash_for_each(p->events, bkt, ev, events)
+		if (ev->type == type) {
+			send_signal = false;
+			dev_dbg(kfd_device,
+					"Event found: id %X type %d",
+					ev->event_id, ev->type);
+			set_event(ev);
+			if (ev->type == KFD_EVENT_TYPE_MEMORY && ev_data)
+				ev->memory_exception_data = *ev_data;
+		}
+
+	/* Send SIGTERM if no event of type "type" has been found */
+	if (send_signal) {
+		dev_warn(kfd_device,
+			"Sending SIGTERM to HSA Process with PID %d ",
+				p->lead_thread->pid);
+		send_sig(SIGTERM, p->lead_thread, 0);
+	}
+}
+
+void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
+		unsigned long address, bool is_write_requested,
+		bool is_execute_requested)
+{
+	struct kfd_hsa_memory_exception_data memory_exception_data;
+	struct vm_area_struct *vma;
+
+	/*
+	 * Because we are called from arbitrary context (workqueue) as opposed
+	 * to process context, kfd_process could attempt to exit while we are
+	 * running so the lookup function returns a locked process.
+	 */
+	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+
+	if (!p)
+		return; /* Presumably process exited. */
+
+	memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+
+	down_read(&p->mm->mmap_sem);
+	vma = find_vma(p->mm, address);
+
+	memory_exception_data.gpu_id = dev->id;
+	memory_exception_data.va = address;
+	/* Set failure reason */
+	memory_exception_data.failure.NotPresent = 1;
+	memory_exception_data.failure.NoExecute = 0;
+	memory_exception_data.failure.ReadOnly = 0;
+	if (vma) {
+		if (vma->vm_start > address) {
+			memory_exception_data.failure.NotPresent = 1;
+			memory_exception_data.failure.NoExecute = 0;
+			memory_exception_data.failure.ReadOnly = 0;
+		} else {
+			memory_exception_data.failure.NotPresent = 0;
+			if (is_write_requested && !(vma->vm_flags & VM_WRITE))
+				memory_exception_data.failure.ReadOnly = 1;
+			else
+				memory_exception_data.failure.ReadOnly = 0;
+			if (is_execute_requested && !(vma->vm_flags & VM_EXEC))
+				memory_exception_data.failure.NoExecute = 1;
+			else
+				memory_exception_data.failure.NoExecute = 0;
+		}
+	}
+
+	up_read(&p->mm->mmap_sem);
+
+	mutex_lock(&p->event_mutex);
+
+	/* Lookup events by type and signal them */
+	lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_MEMORY,
+			&memory_exception_data);
+
+	mutex_unlock(&p->event_mutex);
+	mutex_unlock(&p->mutex);
+}
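Taken together, the vma checks in kfd_signal_iommu_event() form a small decision table: no covering VMA means NotPresent, while a covering VMA turns the fault into a permission mismatch split across ReadOnly and NoExecute. The same logic restated as a standalone helper for readability (a sketch with identical semantics, not code from the patch):

	/* Sketch: "vma covers address" means a VMA was found and
	 * vma->vm_start <= address. */
	static void classify_ppr_failure(struct vm_area_struct *vma,
			unsigned long address,
			bool is_write_requested, bool is_execute_requested,
			struct kfd_memory_exception_failure *f)
	{
		/* Default: the address is not mapped at all. */
		f->NotPresent = 1;
		f->ReadOnly = 0;
		f->NoExecute = 0;

		if (!vma || vma->vm_start > address)
			return;	/* no VMA covers the address: NotPresent */

		/* The address is mapped, so the fault is a permission
		 * mismatch between the request and the VMA flags. */
		f->NotPresent = 0;
		f->ReadOnly  = (is_write_requested && !(vma->vm_flags & VM_WRITE)) ? 1 : 0;
		f->NoExecute = (is_execute_requested && !(vma->vm_flags & VM_EXEC)) ? 1 : 0;
	}

This also shows why the patch initializes the failure flags before the if (vma) block: the NotPresent default covers the missing-VMA and vm_start > address cases uniformly.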
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h
@@ -28,6 +28,7 @@
 #include <linux/types.h>
 #include <linux/list.h>
 #include "kfd_priv.h"
+#include <uapi/linux/kfd_ioctl.h>
 
 #define KFD_EVENT_ID_NONSIGNAL_MASK 0x80000000U
 #define KFD_FIRST_NONSIGNAL_EVENT_ID KFD_EVENT_ID_NONSIGNAL_MASK
@@ -61,6 +62,11 @@ struct kfd_event {
 	struct signal_page *signal_page;
 	unsigned int signal_slot_index;
 	uint64_t __user *user_signal_address;
+
+	/* type specific data */
+	union {
+		struct kfd_hsa_memory_exception_data memory_exception_data;
+	};
 };
 
 #define KFD_EVENT_TIMEOUT_IMMEDIATE 0
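The anonymous union keeps struct kfd_event at a fixed size while giving each event type its own payload slot; today only memory events define one. A later payload-carrying type would extend the union in place, roughly like this (illustrative sketch only; no such member exists in this patch):

	/* type specific data */
	union {
		struct kfd_hsa_memory_exception_data memory_exception_data;
		struct kfd_hsa_debug_data debug_data;	/* hypothetical */
	};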
@@ -69,6 +75,7 @@ struct kfd_event {
 /* Matching HSA_EVENTTYPE */
 #define KFD_EVENT_TYPE_SIGNAL 0
 #define KFD_EVENT_TYPE_DEBUG 5
+#define KFD_EVENT_TYPE_MEMORY 8
 
 extern void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
 				uint32_t valid_id_bits);
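The KFD_EVENT_TYPE_* values track the HSA runtime's HSA_EVENTTYPE numbering, which is why they are sparse (0, 5, 8). For orientation, the corresponding userspace enum in the hsaKmt thunk looks roughly like this (an illustrative sketch following hsakmttypes.h conventions, not part of this patch):

	typedef enum _HSA_EVENTTYPE {
		HSA_EVENTTYPE_SIGNAL            = 0,	/* KFD_EVENT_TYPE_SIGNAL */
		HSA_EVENTTYPE_NODECHANGE        = 1,
		HSA_EVENTTYPE_DEVICESTATECHANGE = 2,
		HSA_EVENTTYPE_HW_EXCEPTION      = 3,
		HSA_EVENTTYPE_SYSTEM_EVENT      = 4,
		HSA_EVENTTYPE_DEBUG_EVENT       = 5,	/* KFD_EVENT_TYPE_DEBUG */
		HSA_EVENTTYPE_PROFILE_EVENT     = 6,
		HSA_EVENTTYPE_QUEUE_EVENT       = 7,
		HSA_EVENTTYPE_MEMORY            = 8	/* KFD_EVENT_TYPE_MEMORY */
	} HSA_EVENTTYPE;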
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -693,11 +693,14 @@ void kfd_event_init_process(struct kfd_process *p);
 void kfd_event_free_process(struct kfd_process *p);
 int kfd_event_mmap(struct kfd_process *process, struct vm_area_struct *vma);
 int kfd_wait_on_events(struct kfd_process *p,
-		uint32_t num_events, const uint32_t __user *event_ids,
+		uint32_t num_events, void __user *data,
 		bool all, uint32_t user_timeout_ms,
 		enum kfd_event_wait_result *wait_result);
 void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
 				uint32_t valid_id_bits);
+void kfd_signal_iommu_event(struct kfd_dev *dev,
+		unsigned int pasid, unsigned long address,
+		bool is_write_requested, bool is_execute_requested);
 int kfd_set_event(struct kfd_process *p, uint32_t event_id);
 int kfd_reset_event(struct kfd_process *p, uint32_t event_id);
 int kfd_event_create(struct file *devkfd, struct kfd_process *p,