diff --git a/include/linux/sched.h b/include/linux/sched.h index be7adb7588e5..f53cdf216cef 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1080,7 +1080,7 @@ struct sched_class { struct task_struct *task); #ifdef CONFIG_FAIR_GROUP_SCHED - void (*moved_group) (struct task_struct *p, int on_rq); + void (*task_move_group) (struct task_struct *p, int on_rq); #endif }; diff --git a/kernel/sched.c b/kernel/sched.c index d42992bccdfa..aa14a56f9d03 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8510,12 +8510,12 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - set_task_rq(tsk, task_cpu(tsk)); - #ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->moved_group) - tsk->sched_class->moved_group(tsk, on_rq); + if (tsk->sched_class->task_move_group) + tsk->sched_class->task_move_group(tsk, on_rq); + else #endif + set_task_rq(tsk, task_cpu(tsk)); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 933f3d1b62ea..f4f6a8326dd0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3869,13 +3869,26 @@ static void set_curr_task_fair(struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void moved_group_fair(struct task_struct *p, int on_rq) +static void task_move_group_fair(struct task_struct *p, int on_rq) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); - - update_curr(cfs_rq); + /* + * If the task was not on the rq at the time of this cgroup movement + * it must have been asleep, sleeping tasks keep their ->vruntime + * absolute on their old rq until wakeup (needed for the fair sleeper + * bonus in place_entity()). + * + * If it was on the rq, we've just 'preempted' it, which does convert + * ->vruntime to a relative base. + * + * Make sure both cases convert their relative position when migrating + * to another cgroup's rq. This does somewhat interfere with the + * fair sleeper stuff for the first placement, but who cares. + */ if (!on_rq) - place_entity(cfs_rq, &p->se, 1); + p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; + set_task_rq(p, task_cpu(p)); + if (!on_rq) + p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; } #endif @@ -3927,7 +3940,7 @@ static const struct sched_class fair_sched_class = { .get_rr_interval = get_rr_interval_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .moved_group = moved_group_fair, + .task_move_group = task_move_group_fair, #endif }; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 25c2f962f6fc..48ddf431db0e 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -157,15 +157,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) } /* - * Called when a process is dequeued from the active array and given - * the cpu. We should note that with the exception of interactive - * tasks, the expired queue will become the active queue after the active - * queue is empty, without explicitly dequeuing and requeuing tasks in the - * expired queue. (Interactive tasks may be requeued directly to the - * active queue, thus delaying tasks in the expired queue from running; - * see scheduler_tick()). - * - * Though we are interested in knowing how long it was from the *first* time a + * We are interested in knowing how long it was from the *first* time a * task was queued to the time that it finally hit a cpu, we call this routine * from dequeue_task() to account for possible rq->clock skew across cpus. The * delta taken on each cpu would annul the skew. @@ -203,16 +195,6 @@ static void sched_info_arrive(struct task_struct *t) } /* - * Called when a process is queued into either the active or expired - * array. The time is noted and later used to determine how long we - * had to wait for us to reach the cpu. Since the expired queue will - * become the active queue after active queue is empty, without dequeuing - * and requeuing any tasks, we are interested in queuing to either. It - * is unusual but not impossible for tasks to be dequeued and immediately - * requeued in the same or another array: this can happen in sched_yield(), - * set_user_nice(), and even load_balance() as it moves tasks from runqueue - * to runqueue. - * * This function is only called from enqueue_task(), but also only updates * the timestamp if it is already not set. It's assumed that * sched_info_dequeued() will clear that stamp when appropriate.