perf: Collapse and fix event_function_call() users

There is one common bug left in all the event_function_call() users,
between loading ctx->task and getting to the remote_function(),
ctx->task can already have been changed.

Therefore we need to double check and retry if ctx->task != current.

Insert another trampoline specific to event_function_call() that
checks for this and further validates state. This also allows getting
rid of the active/inactive functions.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Peter Zijlstra 2016-01-11 15:00:50 +01:00 committed by Ingo Molnar
parent 32132a3d0d
commit fae3fde651
3 changed files with 168 additions and 201 deletions

View File

@ -1044,7 +1044,7 @@ extern void perf_swevent_put_recursion_context(int rctx);
extern u64 perf_swevent_set_period(struct perf_event *event); extern u64 perf_swevent_set_period(struct perf_event *event);
extern void perf_event_enable(struct perf_event *event); extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event);
extern int __perf_event_disable(void *info); extern void perf_event_disable_local(struct perf_event *event);
extern void perf_event_task_tick(void); extern void perf_event_task_tick(void);
#else /* !CONFIG_PERF_EVENTS: */ #else /* !CONFIG_PERF_EVENTS: */
static inline void * static inline void *

View File

@ -126,6 +126,28 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
return data.ret; return data.ret;
} }
static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
raw_spin_lock(&cpuctx->ctx.lock);
if (ctx)
raw_spin_lock(&ctx->lock);
}
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
if (ctx)
raw_spin_unlock(&ctx->lock);
raw_spin_unlock(&cpuctx->ctx.lock);
}
/* /*
* On task ctx scheduling... * On task ctx scheduling...
* *
@ -158,21 +180,96 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
* If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
*/ */
static void event_function_call(struct perf_event *event, typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
int (*active)(void *), struct perf_event_context *, void *);
void (*inactive)(void *),
void *data) struct event_function_struct {
struct perf_event *event;
event_f func;
void *data;
};
static int event_function(void *info)
{
struct event_function_struct *efs = info;
struct perf_event *event = efs->event;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
WARN_ON_ONCE(!irqs_disabled());
/*
* Since we do the IPI call without holding ctx->lock things can have
* changed, double check we hit the task we set out to hit.
*
* If ctx->task == current, we know things must remain valid because
* we have IRQs disabled so we cannot schedule.
*/
if (ctx->task) {
if (ctx->task != current)
return -EAGAIN;
WARN_ON_ONCE(task_ctx != ctx);
} else {
WARN_ON_ONCE(&cpuctx->ctx != ctx);
}
perf_ctx_lock(cpuctx, task_ctx);
/*
* Now that we hold locks, double check state. Paranoia pays.
*/
if (task_ctx) {
WARN_ON_ONCE(task_ctx->task != current);
/*
* We only use event_function_call() on established contexts,
* and event_function() is only ever called when active (or
* rather, we'll have bailed in task_function_call() or the
* above ctx->task != current test), therefore we must have
* ctx->is_active here.
*/
WARN_ON_ONCE(!ctx->is_active);
/*
* And since we have ctx->is_active, cpuctx->task_ctx must
* match.
*/
WARN_ON_ONCE(cpuctx->task_ctx != task_ctx);
}
efs->func(event, cpuctx, ctx, efs->data);
perf_ctx_unlock(cpuctx, task_ctx);
return 0;
}
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
struct event_function_struct efs = {
.event = event,
.func = func,
.data = data,
};
int ret = event_function(&efs);
WARN_ON_ONCE(ret);
}
static void event_function_call(struct perf_event *event, event_f func, void *data)
{ {
struct perf_event_context *ctx = event->ctx; struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task; struct task_struct *task = ctx->task;
struct event_function_struct efs = {
.event = event,
.func = func,
.data = data,
};
if (!task) { if (!task) {
cpu_function_call(event->cpu, active, data); cpu_function_call(event->cpu, event_function, &efs);
return; return;
} }
again: again:
if (!task_function_call(task, active, data)) if (!task_function_call(task, event_function, &efs))
return; return;
raw_spin_lock_irq(&ctx->lock); raw_spin_lock_irq(&ctx->lock);
@ -185,7 +282,7 @@ again:
raw_spin_unlock_irq(&ctx->lock); raw_spin_unlock_irq(&ctx->lock);
goto again; goto again;
} }
inactive(data); func(event, NULL, ctx, data);
raw_spin_unlock_irq(&ctx->lock); raw_spin_unlock_irq(&ctx->lock);
} }
@ -400,28 +497,6 @@ static inline u64 perf_event_clock(struct perf_event *event)
return event->clock(); return event->clock();
} }
static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
raw_spin_lock(&cpuctx->ctx.lock);
if (ctx)
raw_spin_lock(&ctx->lock);
}
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
if (ctx)
raw_spin_unlock(&ctx->lock);
raw_spin_unlock(&cpuctx->ctx.lock);
}
#ifdef CONFIG_CGROUP_PERF #ifdef CONFIG_CGROUP_PERF
static inline bool static inline bool
@ -1684,38 +1759,22 @@ group_sched_out(struct perf_event *group_event,
cpuctx->exclusive = 0; cpuctx->exclusive = 0;
} }
struct remove_event {
struct perf_event *event;
bool detach_group;
};
static void ___perf_remove_from_context(void *info)
{
struct remove_event *re = info;
struct perf_event *event = re->event;
struct perf_event_context *ctx = event->ctx;
if (re->detach_group)
perf_group_detach(event);
list_del_event(event, ctx);
}
/* /*
* Cross CPU call to remove a performance event * Cross CPU call to remove a performance event
* *
* We disable the event on the hardware level first. After that we * We disable the event on the hardware level first. After that we
* remove it from the context list. * remove it from the context list.
*/ */
static int __perf_remove_from_context(void *info) static void
__perf_remove_from_context(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
void *info)
{ {
struct remove_event *re = info; bool detach_group = (unsigned long)info;
struct perf_event *event = re->event;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
raw_spin_lock(&ctx->lock);
event_sched_out(event, cpuctx, ctx); event_sched_out(event, cpuctx, ctx);
if (re->detach_group) if (detach_group)
perf_group_detach(event); perf_group_detach(event);
list_del_event(event, ctx); list_del_event(event, ctx);
@ -1726,17 +1785,11 @@ static int __perf_remove_from_context(void *info)
cpuctx->task_ctx = NULL; cpuctx->task_ctx = NULL;
} }
} }
raw_spin_unlock(&ctx->lock);
return 0;
} }
/* /*
* Remove the event from a task's (or a CPU's) list of events. * Remove the event from a task's (or a CPU's) list of events.
* *
* CPU events are removed with a smp call. For task events we only
* call when the task is on a CPU.
*
* If event->ctx is a cloned context, callers must make sure that * If event->ctx is a cloned context, callers must make sure that
* every task struct that event->ctx->task could possibly point to * every task struct that event->ctx->task could possibly point to
* remains valid. This is OK when called from perf_release since * remains valid. This is OK when called from perf_release since
@ -1746,71 +1799,31 @@ static int __perf_remove_from_context(void *info)
*/ */
static void perf_remove_from_context(struct perf_event *event, bool detach_group) static void perf_remove_from_context(struct perf_event *event, bool detach_group)
{ {
struct perf_event_context *ctx = event->ctx; lockdep_assert_held(&event->ctx->mutex);
struct remove_event re = {
.event = event,
.detach_group = detach_group,
};
lockdep_assert_held(&ctx->mutex);
event_function_call(event, __perf_remove_from_context, event_function_call(event, __perf_remove_from_context,
___perf_remove_from_context, &re); (void *)(unsigned long)detach_group);
} }
/* /*
* Cross CPU call to disable a performance event * Cross CPU call to disable a performance event
*/ */
int __perf_event_disable(void *info) static void __perf_event_disable(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
void *info)
{ {
struct perf_event *event = info; if (event->state < PERF_EVENT_STATE_INACTIVE)
struct perf_event_context *ctx = event->ctx; return;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
/* update_context_time(ctx);
* If this is a per-task event, need to check whether this update_cgrp_time_from_event(event);
* event's task is the current task on this cpu. update_group_times(event);
* if (event == event->group_leader)
* Can trigger due to concurrent perf_event_context_sched_out() group_sched_out(event, cpuctx, ctx);
* flipping contexts around. else
*/ event_sched_out(event, cpuctx, ctx);
if (ctx->task && cpuctx->task_ctx != ctx) event->state = PERF_EVENT_STATE_OFF;
return -EINVAL;
raw_spin_lock(&ctx->lock);
/*
* If the event is on, turn it off.
* If it is in error state, leave it in error state.
*/
if (event->state >= PERF_EVENT_STATE_INACTIVE) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
update_group_times(event);
if (event == event->group_leader)
group_sched_out(event, cpuctx, ctx);
else
event_sched_out(event, cpuctx, ctx);
event->state = PERF_EVENT_STATE_OFF;
}
raw_spin_unlock(&ctx->lock);
return 0;
}
void ___perf_event_disable(void *info)
{
struct perf_event *event = info;
/*
* Since we have the lock this context can't be scheduled
* in, so we can change the state safely.
*/
if (event->state == PERF_EVENT_STATE_INACTIVE) {
update_group_times(event);
event->state = PERF_EVENT_STATE_OFF;
}
} }
/* /*
@ -1837,8 +1850,12 @@ static void _perf_event_disable(struct perf_event *event)
} }
raw_spin_unlock_irq(&ctx->lock); raw_spin_unlock_irq(&ctx->lock);
event_function_call(event, __perf_event_disable, event_function_call(event, __perf_event_disable, NULL);
___perf_event_disable, event); }
void perf_event_disable_local(struct perf_event *event)
{
event_function_local(event, __perf_event_disable, NULL);
} }
/* /*
@ -2202,44 +2219,29 @@ static void __perf_event_mark_enabled(struct perf_event *event)
/* /*
* Cross CPU call to enable a performance event * Cross CPU call to enable a performance event
*/ */
static int __perf_event_enable(void *info) static void __perf_event_enable(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
void *info)
{ {
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
struct perf_event *leader = event->group_leader; struct perf_event *leader = event->group_leader;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx;
struct perf_event_context *task_ctx = cpuctx->task_ctx;
/*
* There's a time window between 'ctx->is_active' check
* in perf_event_enable function and this place having:
* - IRQs on
* - ctx->lock unlocked
*
* where the task could be killed and 'ctx' deactivated
* by perf_event_exit_task.
*/
if (!ctx->is_active)
return -EINVAL;
perf_ctx_lock(cpuctx, task_ctx);
WARN_ON_ONCE(&cpuctx->ctx != ctx && task_ctx != ctx);
update_context_time(ctx);
if (event->state >= PERF_EVENT_STATE_INACTIVE) if (event->state >= PERF_EVENT_STATE_INACTIVE)
goto unlock; return;
/*
* set current task's cgroup time reference point
*/
perf_cgroup_set_timestamp(current, ctx);
update_context_time(ctx);
__perf_event_mark_enabled(event); __perf_event_mark_enabled(event);
if (!ctx->is_active)
return;
if (!event_filter_match(event)) { if (!event_filter_match(event)) {
if (is_cgroup_event(event)) if (is_cgroup_event(event)) {
perf_cgroup_set_timestamp(current, ctx); // XXX ?
perf_cgroup_defer_enabled(event); perf_cgroup_defer_enabled(event);
goto unlock; }
return;
} }
/* /*
@ -2247,19 +2249,13 @@ static int __perf_event_enable(void *info)
* then don't put it on unless the group is on. * then don't put it on unless the group is on.
*/ */
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
goto unlock; return;
task_ctx = cpuctx->task_ctx;
if (ctx->task)
WARN_ON_ONCE(task_ctx != ctx);
ctx_resched(cpuctx, task_ctx); ctx_resched(cpuctx, task_ctx);
unlock:
perf_ctx_unlock(cpuctx, task_ctx);
return 0;
}
void ___perf_event_enable(void *info)
{
__perf_event_mark_enabled((struct perf_event *)info);
} }
/* /*
@ -2292,8 +2288,7 @@ static void _perf_event_enable(struct perf_event *event)
event->state = PERF_EVENT_STATE_OFF; event->state = PERF_EVENT_STATE_OFF;
raw_spin_unlock_irq(&ctx->lock); raw_spin_unlock_irq(&ctx->lock);
event_function_call(event, __perf_event_enable, event_function_call(event, __perf_event_enable, NULL);
___perf_event_enable, event);
} }
/* /*
@ -4095,36 +4090,14 @@ static void perf_event_for_each(struct perf_event *event,
perf_event_for_each_child(sibling, func); perf_event_for_each_child(sibling, func);
} }
struct period_event { static void __perf_event_period(struct perf_event *event,
struct perf_event *event; struct perf_cpu_context *cpuctx,
u64 value; struct perf_event_context *ctx,
}; void *info)
static void ___perf_event_period(void *info)
{ {
struct period_event *pe = info; u64 value = *((u64 *)info);
struct perf_event *event = pe->event;
u64 value = pe->value;
if (event->attr.freq) {
event->attr.sample_freq = value;
} else {
event->attr.sample_period = value;
event->hw.sample_period = value;
}
local64_set(&event->hw.period_left, 0);
}
static int __perf_event_period(void *info)
{
struct period_event *pe = info;
struct perf_event *event = pe->event;
struct perf_event_context *ctx = event->ctx;
u64 value = pe->value;
bool active; bool active;
raw_spin_lock(&ctx->lock);
if (event->attr.freq) { if (event->attr.freq) {
event->attr.sample_freq = value; event->attr.sample_freq = value;
} else { } else {
@ -4144,14 +4117,10 @@ static int __perf_event_period(void *info)
event->pmu->start(event, PERF_EF_RELOAD); event->pmu->start(event, PERF_EF_RELOAD);
perf_pmu_enable(ctx->pmu); perf_pmu_enable(ctx->pmu);
} }
raw_spin_unlock(&ctx->lock);
return 0;
} }
static int perf_event_period(struct perf_event *event, u64 __user *arg) static int perf_event_period(struct perf_event *event, u64 __user *arg)
{ {
struct period_event pe = { .event = event, };
u64 value; u64 value;
if (!is_sampling_event(event)) if (!is_sampling_event(event))
@ -4166,10 +4135,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
if (event->attr.freq && value > sysctl_perf_event_sample_rate) if (event->attr.freq && value > sysctl_perf_event_sample_rate)
return -EINVAL; return -EINVAL;
pe.value = value; event_function_call(event, __perf_event_period, &value);
event_function_call(event, __perf_event_period,
___perf_event_period, &pe);
return 0; return 0;
} }
@ -4941,7 +4907,7 @@ static void perf_pending_event(struct irq_work *entry)
if (event->pending_disable) { if (event->pending_disable) {
event->pending_disable = 0; event->pending_disable = 0;
__perf_event_disable(event); perf_event_disable_local(event);
} }
if (event->pending_wakeup) { if (event->pending_wakeup) {
@ -9239,13 +9205,14 @@ static void perf_event_init_cpu(int cpu)
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info) static void __perf_event_exit_context(void *__info)
{ {
struct remove_event re = { .detach_group = true };
struct perf_event_context *ctx = __info; struct perf_event_context *ctx = __info;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event *event;
rcu_read_lock(); raw_spin_lock(&ctx->lock);
list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(&re); __perf_remove_from_context(event, cpuctx, ctx, (void *)(unsigned long)true);
rcu_read_unlock(); raw_spin_unlock(&ctx->lock);
} }
static void perf_event_exit_cpu_context(int cpu) static void perf_event_exit_cpu_context(int cpu)

View File

@ -444,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
* current task. * current task.
*/ */
if (irqs_disabled() && bp->ctx && bp->ctx->task == current) if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
__perf_event_disable(bp); perf_event_disable_local(bp);
else else
perf_event_disable(bp); perf_event_disable(bp);