linux/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
Emily Deng 8ee3a52e3f drm/gpu-sched: fix force APP kill hang(v4)
issue:
there are VMC page fault occurred if force APP kill during
3dmark test, the cause is in entity_fini we manually signal
all those jobs in entity's queue which confuse the sync/dep
mechanism:

1)page fault occurred in sdma's clear job which operate on
shadow buffer, and shadow buffer's Gart table is cleaned by
ttm_bo_release since the fence in its reservation was fake signaled
by entity_fini() under the case of SIGKILL received.

2)page fault occurred in gfx' job because during the lifetime
of gfx job we manually fake signal all jobs from its entity
in entity_fini(), thus the unmapping/clear PTE job depend on those
result fence is satisfied and sdma start clearing the PTE and lead
to GFX page fault.

fix:
1)should at least wait all jobs already scheduled complete in entity_fini()
if SIGKILL is the case.

2)if a fence signaled and try to clear some entity's dependency, should
set this entity guilty to prevent its job really run since the dependency
is fake signaled.

v2:
splitting drm_sched_entity_fini() into two functions:
1)The first one is does the waiting, removes the entity from the
runqueue and returns an error when the process was killed.
2)The second one then goes over the entity, install it as
completion signal for the remaining jobs and signals all jobs
with an error code.

v3:
1)Replace the fini1 and fini2 with better name
2)Call the first part before the VM teardown in
amdgpu_driver_postclose_kms() and the second part
after the VM teardown
3)Keep the original function drm_sched_entity_fini to
refine the code.

v4:
1)Rename entity->finished to entity->last_scheduled;
2)Rename drm_sched_entity_fini_job_cb() to
drm_sched_entity_kill_jobs_cb();
3)Pass NULL to drm_sched_entity_fini_job_cb() if -ENOENT;
4)Replace the type of entity->fini_status with "int";
5)Remove the check about entity->finished.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Signed-off-by: Emily Deng <Emily.Deng@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
2018-05-15 13:43:17 -05:00

502 lines
12 KiB
C

/*
* Copyright 2015 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
* Authors: monk liu <monk.liu@amd.com>
*/
#include <drm/drmP.h>
#include <drm/drm_auth.h>
#include "amdgpu.h"
#include "amdgpu_sched.h"
static int amdgpu_ctx_priority_permit(struct drm_file *filp,
enum drm_sched_priority priority)
{
/* NORMAL and below are accessible by everyone */
if (priority <= DRM_SCHED_PRIORITY_NORMAL)
return 0;
if (capable(CAP_SYS_NICE))
return 0;
if (drm_is_current_master(filp))
return 0;
return -EACCES;
}
static int amdgpu_ctx_init(struct amdgpu_device *adev,
enum drm_sched_priority priority,
struct drm_file *filp,
struct amdgpu_ctx *ctx)
{
unsigned i, j;
int r;
if (priority < 0 || priority >= DRM_SCHED_PRIORITY_MAX)
return -EINVAL;
r = amdgpu_ctx_priority_permit(filp, priority);
if (r)
return r;
memset(ctx, 0, sizeof(*ctx));
ctx->adev = adev;
kref_init(&ctx->refcount);
spin_lock_init(&ctx->ring_lock);
ctx->fences = kcalloc(amdgpu_sched_jobs * AMDGPU_MAX_RINGS,
sizeof(struct dma_fence*), GFP_KERNEL);
if (!ctx->fences)
return -ENOMEM;
mutex_init(&ctx->lock);
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
ctx->rings[i].sequence = 1;
ctx->rings[i].fences = &ctx->fences[amdgpu_sched_jobs * i];
}
ctx->reset_counter = atomic_read(&adev->gpu_reset_counter);
ctx->reset_counter_query = ctx->reset_counter;
ctx->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
ctx->init_priority = priority;
ctx->override_priority = DRM_SCHED_PRIORITY_UNSET;
/* create context entity for each ring */
for (i = 0; i < adev->num_rings; i++) {
struct amdgpu_ring *ring = adev->rings[i];
struct drm_sched_rq *rq;
rq = &ring->sched.sched_rq[priority];
if (ring == &adev->gfx.kiq.ring)
continue;
r = drm_sched_entity_init(&ring->sched, &ctx->rings[i].entity,
rq, amdgpu_sched_jobs, &ctx->guilty);
if (r)
goto failed;
}
r = amdgpu_queue_mgr_init(adev, &ctx->queue_mgr);
if (r)
goto failed;
return 0;
failed:
for (j = 0; j < i; j++)
drm_sched_entity_fini(&adev->rings[j]->sched,
&ctx->rings[j].entity);
kfree(ctx->fences);
ctx->fences = NULL;
return r;
}
static void amdgpu_ctx_fini(struct kref *ref)
{
struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx, refcount);
struct amdgpu_device *adev = ctx->adev;
unsigned i, j;
if (!adev)
return;
for (i = 0; i < AMDGPU_MAX_RINGS; ++i)
for (j = 0; j < amdgpu_sched_jobs; ++j)
dma_fence_put(ctx->rings[i].fences[j]);
kfree(ctx->fences);
ctx->fences = NULL;
amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
mutex_destroy(&ctx->lock);
kfree(ctx);
}
static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
struct amdgpu_fpriv *fpriv,
struct drm_file *filp,
enum drm_sched_priority priority,
uint32_t *id)
{
struct amdgpu_ctx_mgr *mgr = &fpriv->ctx_mgr;
struct amdgpu_ctx *ctx;
int r;
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
mutex_lock(&mgr->lock);
r = idr_alloc(&mgr->ctx_handles, ctx, 1, 0, GFP_KERNEL);
if (r < 0) {
mutex_unlock(&mgr->lock);
kfree(ctx);
return r;
}
*id = (uint32_t)r;
r = amdgpu_ctx_init(adev, priority, filp, ctx);
if (r) {
idr_remove(&mgr->ctx_handles, *id);
*id = 0;
kfree(ctx);
}
mutex_unlock(&mgr->lock);
return r;
}
static void amdgpu_ctx_do_release(struct kref *ref)
{
struct amdgpu_ctx *ctx;
u32 i;
ctx = container_of(ref, struct amdgpu_ctx, refcount);
for (i = 0; i < ctx->adev->num_rings; i++)
drm_sched_entity_fini(&ctx->adev->rings[i]->sched,
&ctx->rings[i].entity);
amdgpu_ctx_fini(ref);
}
static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id)
{
struct amdgpu_ctx_mgr *mgr = &fpriv->ctx_mgr;
struct amdgpu_ctx *ctx;
mutex_lock(&mgr->lock);
ctx = idr_remove(&mgr->ctx_handles, id);
if (ctx)
kref_put(&ctx->refcount, amdgpu_ctx_do_release);
mutex_unlock(&mgr->lock);
return ctx ? 0 : -EINVAL;
}
static int amdgpu_ctx_query(struct amdgpu_device *adev,
struct amdgpu_fpriv *fpriv, uint32_t id,
union drm_amdgpu_ctx_out *out)
{
struct amdgpu_ctx *ctx;
struct amdgpu_ctx_mgr *mgr;
unsigned reset_counter;
if (!fpriv)
return -EINVAL;
mgr = &fpriv->ctx_mgr;
mutex_lock(&mgr->lock);
ctx = idr_find(&mgr->ctx_handles, id);
if (!ctx) {
mutex_unlock(&mgr->lock);
return -EINVAL;
}
/* TODO: these two are always zero */
out->state.flags = 0x0;
out->state.hangs = 0x0;
/* determine if a GPU reset has occured since the last call */
reset_counter = atomic_read(&adev->gpu_reset_counter);
/* TODO: this should ideally return NO, GUILTY, or INNOCENT. */
if (ctx->reset_counter_query == reset_counter)
out->state.reset_status = AMDGPU_CTX_NO_RESET;
else
out->state.reset_status = AMDGPU_CTX_UNKNOWN_RESET;
ctx->reset_counter_query = reset_counter;
mutex_unlock(&mgr->lock);
return 0;
}
static int amdgpu_ctx_query2(struct amdgpu_device *adev,
struct amdgpu_fpriv *fpriv, uint32_t id,
union drm_amdgpu_ctx_out *out)
{
struct amdgpu_ctx *ctx;
struct amdgpu_ctx_mgr *mgr;
if (!fpriv)
return -EINVAL;
mgr = &fpriv->ctx_mgr;
mutex_lock(&mgr->lock);
ctx = idr_find(&mgr->ctx_handles, id);
if (!ctx) {
mutex_unlock(&mgr->lock);
return -EINVAL;
}
out->state.flags = 0x0;
out->state.hangs = 0x0;
if (ctx->reset_counter != atomic_read(&adev->gpu_reset_counter))
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RESET;
if (ctx->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST;
if (atomic_read(&ctx->guilty))
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
mutex_unlock(&mgr->lock);
return 0;
}
int amdgpu_ctx_ioctl(struct drm_device *dev, void *data,
struct drm_file *filp)
{
int r;
uint32_t id;
enum drm_sched_priority priority;
union drm_amdgpu_ctx *args = data;
struct amdgpu_device *adev = dev->dev_private;
struct amdgpu_fpriv *fpriv = filp->driver_priv;
r = 0;
id = args->in.ctx_id;
priority = amdgpu_to_sched_priority(args->in.priority);
/* For backwards compatibility reasons, we need to accept
* ioctls with garbage in the priority field */
if (priority == DRM_SCHED_PRIORITY_INVALID)
priority = DRM_SCHED_PRIORITY_NORMAL;
switch (args->in.op) {
case AMDGPU_CTX_OP_ALLOC_CTX:
r = amdgpu_ctx_alloc(adev, fpriv, filp, priority, &id);
args->out.alloc.ctx_id = id;
break;
case AMDGPU_CTX_OP_FREE_CTX:
r = amdgpu_ctx_free(fpriv, id);
break;
case AMDGPU_CTX_OP_QUERY_STATE:
r = amdgpu_ctx_query(adev, fpriv, id, &args->out);
break;
case AMDGPU_CTX_OP_QUERY_STATE2:
r = amdgpu_ctx_query2(adev, fpriv, id, &args->out);
break;
default:
return -EINVAL;
}
return r;
}
struct amdgpu_ctx *amdgpu_ctx_get(struct amdgpu_fpriv *fpriv, uint32_t id)
{
struct amdgpu_ctx *ctx;
struct amdgpu_ctx_mgr *mgr;
if (!fpriv)
return NULL;
mgr = &fpriv->ctx_mgr;
mutex_lock(&mgr->lock);
ctx = idr_find(&mgr->ctx_handles, id);
if (ctx)
kref_get(&ctx->refcount);
mutex_unlock(&mgr->lock);
return ctx;
}
int amdgpu_ctx_put(struct amdgpu_ctx *ctx)
{
if (ctx == NULL)
return -EINVAL;
kref_put(&ctx->refcount, amdgpu_ctx_do_release);
return 0;
}
int amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx, struct amdgpu_ring *ring,
struct dma_fence *fence, uint64_t* handler)
{
struct amdgpu_ctx_ring *cring = & ctx->rings[ring->idx];
uint64_t seq = cring->sequence;
unsigned idx = 0;
struct dma_fence *other = NULL;
idx = seq & (amdgpu_sched_jobs - 1);
other = cring->fences[idx];
if (other)
BUG_ON(!dma_fence_is_signaled(other));
dma_fence_get(fence);
spin_lock(&ctx->ring_lock);
cring->fences[idx] = fence;
cring->sequence++;
spin_unlock(&ctx->ring_lock);
dma_fence_put(other);
if (handler)
*handler = seq;
return 0;
}
struct dma_fence *amdgpu_ctx_get_fence(struct amdgpu_ctx *ctx,
struct amdgpu_ring *ring, uint64_t seq)
{
struct amdgpu_ctx_ring *cring = & ctx->rings[ring->idx];
struct dma_fence *fence;
spin_lock(&ctx->ring_lock);
if (seq == ~0ull)
seq = ctx->rings[ring->idx].sequence - 1;
if (seq >= cring->sequence) {
spin_unlock(&ctx->ring_lock);
return ERR_PTR(-EINVAL);
}
if (seq + amdgpu_sched_jobs < cring->sequence) {
spin_unlock(&ctx->ring_lock);
return NULL;
}
fence = dma_fence_get(cring->fences[seq & (amdgpu_sched_jobs - 1)]);
spin_unlock(&ctx->ring_lock);
return fence;
}
void amdgpu_ctx_priority_override(struct amdgpu_ctx *ctx,
enum drm_sched_priority priority)
{
int i;
struct amdgpu_device *adev = ctx->adev;
struct drm_sched_rq *rq;
struct drm_sched_entity *entity;
struct amdgpu_ring *ring;
enum drm_sched_priority ctx_prio;
ctx->override_priority = priority;
ctx_prio = (ctx->override_priority == DRM_SCHED_PRIORITY_UNSET) ?
ctx->init_priority : ctx->override_priority;
for (i = 0; i < adev->num_rings; i++) {
ring = adev->rings[i];
entity = &ctx->rings[i].entity;
rq = &ring->sched.sched_rq[ctx_prio];
if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ)
continue;
drm_sched_entity_set_rq(entity, rq);
}
}
int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx, unsigned ring_id)
{
struct amdgpu_ctx_ring *cring = &ctx->rings[ring_id];
unsigned idx = cring->sequence & (amdgpu_sched_jobs - 1);
struct dma_fence *other = cring->fences[idx];
if (other) {
signed long r;
r = dma_fence_wait_timeout(other, false, MAX_SCHEDULE_TIMEOUT);
if (r < 0) {
DRM_ERROR("Error (%ld) waiting for fence!\n", r);
return r;
}
}
return 0;
}
void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr)
{
mutex_init(&mgr->lock);
idr_init(&mgr->ctx_handles);
}
void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
{
struct amdgpu_ctx *ctx;
struct idr *idp;
uint32_t id, i;
idp = &mgr->ctx_handles;
idr_for_each_entry(idp, ctx, id) {
if (!ctx->adev)
return;
for (i = 0; i < ctx->adev->num_rings; i++)
if (kref_read(&ctx->refcount) == 1)
drm_sched_entity_do_release(&ctx->adev->rings[i]->sched,
&ctx->rings[i].entity);
else
DRM_ERROR("ctx %p is still alive\n", ctx);
}
}
void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr)
{
struct amdgpu_ctx *ctx;
struct idr *idp;
uint32_t id, i;
idp = &mgr->ctx_handles;
idr_for_each_entry(idp, ctx, id) {
if (!ctx->adev)
return;
for (i = 0; i < ctx->adev->num_rings; i++)
if (kref_read(&ctx->refcount) == 1)
drm_sched_entity_cleanup(&ctx->adev->rings[i]->sched,
&ctx->rings[i].entity);
else
DRM_ERROR("ctx %p is still alive\n", ctx);
}
}
void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)
{
struct amdgpu_ctx *ctx;
struct idr *idp;
uint32_t id;
amdgpu_ctx_mgr_entity_cleanup(mgr);
idp = &mgr->ctx_handles;
idr_for_each_entry(idp, ctx, id) {
if (kref_put(&ctx->refcount, amdgpu_ctx_fini) != 1)
DRM_ERROR("ctx %p is still alive\n", ctx);
}
idr_destroy(&mgr->ctx_handles);
mutex_destroy(&mgr->lock);
}