drm/amdkfd: Add memory exception handling

This patch adds Peripheral Page Request (PPR) failure processing
and reporting.

A bad address, or a pointer to a system memory block with inappropriate
read/write permissions, causes such a PPR failure during user queue
processing. PPR handling is done by the IOMMU driver notifying the
amdkfd module of the failure.

The process that triggered the PPR failure is notified through the
appropriate event, or, if it has no such event, a SIGTERM signal is
sent to it.
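
For context, here is a minimal, hypothetical user-space sketch of the
notification path (not part of this patch). It assumes the uAPI that landed
with this event series in include/uapi/linux/kfd_ioctl.h
(AMDKFD_IOC_CREATE_EVENT, AMDKFD_IOC_WAIT_EVENTS, KFD_IOC_EVENT_MEMORY,
KFD_IOC_WAIT_RESULT_COMPLETE); verify names and fields against that header.

/*
 * Hypothetical sketch, assuming the uAPI from this patch series:
 * create a memory-exception event, then wait for amdkfd to copy the
 * exception data back on a PPR failure.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

int main(void)
{
        struct kfd_ioctl_create_event_args create = {0};
        struct kfd_event_data event_data = {0};
        struct kfd_ioctl_wait_events_args wait = {0};
        int kfd = open("/dev/kfd", O_RDWR);

        if (kfd < 0)
                return 1;

        create.event_type = KFD_IOC_EVENT_MEMORY;       /* HSA type 8 */
        if (ioctl(kfd, AMDKFD_IOC_CREATE_EVENT, &create))
                return 1;

        event_data.event_id = create.event_id;
        wait.events_ptr = (uint64_t)(uintptr_t)&event_data;
        wait.num_events = 1;
        wait.wait_for_all = 1;
        wait.timeout = 5000;                            /* milliseconds */
        if (ioctl(kfd, AMDKFD_IOC_WAIT_EVENTS, &wait) ||
            wait.wait_result != KFD_IOC_WAIT_RESULT_COMPLETE)
                return 1;

        /* On a PPR failure the kernel fills in memory_exception_data. */
        printf("va 0x%llx gpu 0x%x NotPresent=%u ReadOnly=%u NoExecute=%u\n",
               (unsigned long long)event_data.memory_exception_data.va,
               event_data.memory_exception_data.gpu_id,
               event_data.memory_exception_data.failure.NotPresent,
               event_data.memory_exception_data.failure.ReadOnly,
               event_data.memory_exception_data.failure.NoExecute);
        return 0;
}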

v3:
- Change all bool fields in struct kfd_memory_exception_failure to
  uint32_t
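
For reference, a sketch of the uAPI layout this note refers to, as it should
look after the change (per include/uapi/linux/kfd_ioctl.h from this series;
the field comments are mine, so double-check against the header):

/* Sketch of the uAPI structs after v3: all failure flags are 32-bit
 * integers instead of bool, giving a fixed, compiler-independent layout. */
struct kfd_memory_exception_failure {
        __u32 NotPresent;       /* failed to translate the va */
        __u32 ReadOnly;         /* write access to a read-only page */
        __u32 NoExecute;        /* execute access to a non-executable page */
        __u32 pad;
};

struct kfd_hsa_memory_exception_data {
        struct kfd_memory_exception_failure failure;
        __u64 va;
        __u32 gpu_id;
        __u32 pad;
};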

Signed-off-by: Alexey Skidanov <alexey.skidanov@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
commit 59d3e8be87 (parent f3a398183f)
Authored by Alexey Skidanov, 2015-04-14 18:05:49 +03:00; committed by Oded Gabbay
4 changed files with 187 additions and 9 deletions

--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c

@@ -182,6 +182,32 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
 	kfd_unbind_process_from_device(dev, pasid);
 }
 
+/*
+ * This function is called by the IOMMU driver on PPR failure
+ */
+static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid,
+		unsigned long address, u16 flags)
+{
+	struct kfd_dev *dev;
+
+	dev_warn(kfd_device,
+			"Invalid PPR device %x:%x.%x pasid %d address 0x%lX flags 0x%X",
+			PCI_BUS_NUM(pdev->devfn),
+			PCI_SLOT(pdev->devfn),
+			PCI_FUNC(pdev->devfn),
+			pasid,
+			address,
+			flags);
+
+	dev = kfd_device_by_pci_dev(pdev);
+	BUG_ON(dev == NULL);
+
+	kfd_signal_iommu_event(dev, pasid, address,
+			flags & PPR_FAULT_WRITE, flags & PPR_FAULT_EXEC);
+
+	return AMD_IOMMU_INV_PRI_RSP_INVALID;
+}
+
 bool kgd2kfd_device_init(struct kfd_dev *kfd,
 	const struct kgd2kfd_shared_resources *gpu_resources)
 {
@@ -251,6 +277,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 	}
 
 	amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
 			iommu_pasid_shutdown_callback);
+	amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
 
 	kfd->dqm = device_queue_manager_init(kfd);
 	if (!kfd->dqm) {
@@ -316,6 +343,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
 	if (kfd->init_complete) {
 		kfd->dqm->ops.stop(kfd->dqm);
 		amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
+		amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
 		amd_iommu_free_device(kfd->pdev);
 	}
 }
@@ -335,6 +363,7 @@ int kgd2kfd_resume(struct kfd_dev *kfd)
 			return -ENXIO;
 		amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
 				iommu_pasid_shutdown_callback);
+		amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
 		kfd->dqm->ops.start(kfd->dqm);
 	}

--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c

@@ -30,6 +30,7 @@
 #include <linux/memory.h>
 #include "kfd_priv.h"
 #include "kfd_events.h"
+#include <linux/device.h>
 
 /*
  * A task can only be on a single wait_queue at a time, but we need to support
@@ -45,6 +46,10 @@ struct kfd_event_waiter {
 	/* Transitions to true when the event this belongs to is signaled. */
 	bool activated;
 
+	/* Event */
+	struct kfd_event *event;
+	uint32_t input_index;
 };
 
 /*
@@ -610,13 +615,16 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events)
 static int init_event_waiter(struct kfd_process *p,
 		struct kfd_event_waiter *waiter,
-		uint32_t event_id)
+		uint32_t event_id,
+		uint32_t input_index)
 {
 	struct kfd_event *ev = lookup_event_by_id(p, event_id);
 
 	if (!ev)
 		return -EINVAL;
 
+	waiter->event = ev;
+	waiter->input_index = input_index;
 	waiter->activated = ev->signaled;
 	ev->signaled = ev->signaled && !ev->auto_reset;
@@ -643,6 +651,38 @@ static bool test_event_condition(bool all, uint32_t num_events,
 	return activated_count == num_events;
 }
 
+/*
+ * Copy event specific data, if defined.
+ * Currently only memory exception events have additional data to copy to user
+ */
+static bool copy_signaled_event_data(uint32_t num_events,
+		struct kfd_event_waiter *event_waiters,
+		struct kfd_event_data __user *data)
+{
+	struct kfd_hsa_memory_exception_data *src;
+	struct kfd_hsa_memory_exception_data __user *dst;
+	struct kfd_event_waiter *waiter;
+	struct kfd_event *event;
+	uint32_t i;
+
+	for (i = 0; i < num_events; i++) {
+		waiter = &event_waiters[i];
+		event = waiter->event;
+		if (waiter->activated && event->type == KFD_EVENT_TYPE_MEMORY) {
+			dst = &data[waiter->input_index].memory_exception_data;
+			src = &event->memory_exception_data;
+			if (copy_to_user(dst, src,
+				sizeof(struct kfd_hsa_memory_exception_data)))
+				return false;
+		}
+	}
+
+	return true;
+}
+
 static long user_timeout_to_jiffies(uint32_t user_timeout_ms)
 {
 	if (user_timeout_ms == KFD_EVENT_TIMEOUT_IMMEDIATE)
@@ -672,10 +712,12 @@ static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters)
 }
 
 int kfd_wait_on_events(struct kfd_process *p,
-		uint32_t num_events, const uint32_t __user *event_ids,
+		uint32_t num_events, void __user *data,
 		bool all, uint32_t user_timeout_ms,
 		enum kfd_event_wait_result *wait_result)
 {
+	struct kfd_event_data __user *events =
+			(struct kfd_event_data __user *) data;
 	uint32_t i;
 	int ret = 0;
 	struct kfd_event_waiter *event_waiters = NULL;
@@ -690,13 +732,14 @@ int kfd_wait_on_events(struct kfd_process *p,
 	}
 
 	for (i = 0; i < num_events; i++) {
-		uint32_t event_id;
+		struct kfd_event_data event_data;
 
-		ret = get_user(event_id, &event_ids[i]);
-		if (ret)
+		if (copy_from_user(&event_data, &events[i],
+				sizeof(struct kfd_event_data)))
 			goto fail;
 
-		ret = init_event_waiter(p, &event_waiters[i], event_id);
+		ret = init_event_waiter(p, &event_waiters[i],
+				event_data.event_id, i);
 		if (ret)
 			goto fail;
 	}
@@ -723,7 +766,11 @@ int kfd_wait_on_events(struct kfd_process *p,
 		}
 
 		if (test_event_condition(all, num_events, event_waiters)) {
-			*wait_result = KFD_WAIT_COMPLETE;
+			if (copy_signaled_event_data(num_events,
+					event_waiters, events))
+				*wait_result = KFD_WAIT_COMPLETE;
+			else
+				*wait_result = KFD_WAIT_ERROR;
 			break;
 		}
@@ -797,3 +844,95 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma)
 	return remap_pfn_range(vma, vma->vm_start, pfn,
 			vma->vm_end - vma->vm_start, vma->vm_page_prot);
 }
+
+/*
+ * Assumes that p->event_mutex is held and of course
+ * that p is not going away (current or locked).
+ */
+static void lookup_events_by_type_and_signal(struct kfd_process *p,
+		int type, void *event_data)
+{
+	struct kfd_hsa_memory_exception_data *ev_data;
+	struct kfd_event *ev;
+	int bkt;
+	bool send_signal = true;
+
+	ev_data = (struct kfd_hsa_memory_exception_data *) event_data;
+
+	hash_for_each(p->events, bkt, ev, events)
+		if (ev->type == type) {
+			send_signal = false;
+			dev_dbg(kfd_device,
+					"Event found: id %X type %d",
+					ev->event_id, ev->type);
+			set_event(ev);
+			if (ev->type == KFD_EVENT_TYPE_MEMORY && ev_data)
+				ev->memory_exception_data = *ev_data;
+		}
+
+	/* Send SIGTERM if no event of type "type" has been found */
+	if (send_signal) {
+		dev_warn(kfd_device,
+			"Sending SIGTERM to HSA Process with PID %d ",
+				p->lead_thread->pid);
+		send_sig(SIGTERM, p->lead_thread, 0);
+	}
+}
+
+void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
+		unsigned long address, bool is_write_requested,
+		bool is_execute_requested)
+{
+	struct kfd_hsa_memory_exception_data memory_exception_data;
+	struct vm_area_struct *vma;
+
+	/*
+	 * Because we are called from arbitrary context (workqueue) as opposed
+	 * to process context, kfd_process could attempt to exit while we are
+	 * running so the lookup function returns a locked process.
+	 */
+	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+
+	if (!p)
+		return; /* Presumably process exited. */
+
+	memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+
+	down_read(&p->mm->mmap_sem);
+	vma = find_vma(p->mm, address);
+
+	memory_exception_data.gpu_id = dev->id;
+	memory_exception_data.va = address;
+	/* Set failure reason */
+	memory_exception_data.failure.NotPresent = 1;
+	memory_exception_data.failure.NoExecute = 0;
+	memory_exception_data.failure.ReadOnly = 0;
+	if (vma) {
+		if (vma->vm_start > address) {
+			memory_exception_data.failure.NotPresent = 1;
+			memory_exception_data.failure.NoExecute = 0;
+			memory_exception_data.failure.ReadOnly = 0;
+		} else {
+			memory_exception_data.failure.NotPresent = 0;
+			if (is_write_requested && !(vma->vm_flags & VM_WRITE))
+				memory_exception_data.failure.ReadOnly = 1;
+			else
+				memory_exception_data.failure.ReadOnly = 0;
+			if (is_execute_requested && !(vma->vm_flags & VM_EXEC))
+				memory_exception_data.failure.NoExecute = 1;
+			else
+				memory_exception_data.failure.NoExecute = 0;
+		}
+	}
+
+	up_read(&p->mm->mmap_sem);
+	mutex_lock(&p->event_mutex);
+
+	/* Lookup events by type and signal them */
+	lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_MEMORY,
+			&memory_exception_data);
+
+	mutex_unlock(&p->event_mutex);
+	mutex_unlock(&p->mutex);
+}
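
As a concrete reading of kfd_signal_iommu_event() above: a faulting GPU write
to a valid mapping that lacks VM_WRITE is reported with NotPresent=0,
ReadOnly=1, NoExecute=0; an instruction fetch from a mapping without VM_EXEC
sets NoExecute=1; and an address that hits no VMA (or lies below
vma->vm_start) is reported as NotPresent=1 with the other flags clear.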

--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h

@@ -28,6 +28,7 @@
 #include <linux/types.h>
 #include <linux/list.h>
 #include "kfd_priv.h"
+#include <uapi/linux/kfd_ioctl.h>
 
 #define KFD_EVENT_ID_NONSIGNAL_MASK 0x80000000U
 #define KFD_FIRST_NONSIGNAL_EVENT_ID KFD_EVENT_ID_NONSIGNAL_MASK
@@ -61,6 +62,11 @@ struct kfd_event {
 	struct signal_page *signal_page;
 	unsigned int signal_slot_index;
 	uint64_t __user *user_signal_address;
+
+	/* type specific data */
+	union {
+		struct kfd_hsa_memory_exception_data memory_exception_data;
+	};
 };
 
 #define KFD_EVENT_TIMEOUT_IMMEDIATE 0
@@ -69,6 +75,7 @@ struct kfd_event {
 /* Matching HSA_EVENTTYPE */
 #define KFD_EVENT_TYPE_SIGNAL 0
 #define KFD_EVENT_TYPE_DEBUG 5
+#define KFD_EVENT_TYPE_MEMORY 8
 
 extern void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
 				uint32_t valid_id_bits);

--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

@@ -693,11 +693,14 @@ void kfd_event_init_process(struct kfd_process *p);
 void kfd_event_free_process(struct kfd_process *p);
 int kfd_event_mmap(struct kfd_process *process, struct vm_area_struct *vma);
 int kfd_wait_on_events(struct kfd_process *p,
-		uint32_t num_events, const uint32_t __user *event_ids,
+		uint32_t num_events, void __user *data,
 		bool all, uint32_t user_timeout_ms,
 		enum kfd_event_wait_result *wait_result);
 void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
 				uint32_t valid_id_bits);
+void kfd_signal_iommu_event(struct kfd_dev *dev,
+		unsigned int pasid, unsigned long address,
+		bool is_write_requested, bool is_execute_requested);
 int kfd_set_event(struct kfd_process *p, uint32_t event_id);
 int kfd_reset_event(struct kfd_process *p, uint32_t event_id);
 int kfd_event_create(struct file *devkfd, struct kfd_process *p,