Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched, x86: Avoid unnecessary overflow in sched_clock sched: Fix buglet in return_cfs_rq_runtime() sched: Avoid SMT siblings in select_idle_sibling() if possible sched: Set the command name of the idle tasks in SMP kernels sched, rt: Provide means of disabling cross-cpu bandwidth sharing sched: Document wait_for_completion_*() return values sched_fair: Fix a typo in the comment describing update_sd_lb_stats sched: Add a comment to effective_load() since it's a pain
This commit is contained in:
commit
7125faceab
@ -32,6 +32,22 @@ extern int no_timer_check;
|
||||
* (mathieu.desnoyers@polymtl.ca)
|
||||
*
|
||||
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
|
||||
*
|
||||
* In:
|
||||
*
|
||||
* ns = cycles * cyc2ns_scale / SC
|
||||
*
|
||||
* Although we may still have enough bits to store the value of ns,
|
||||
* in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
|
||||
* leading to an incorrect result.
|
||||
*
|
||||
* To avoid this, we can decompose 'cycles' into quotient and remainder
|
||||
* of division by SC. Then,
|
||||
*
|
||||
* ns = (quot * SC + rem) * cyc2ns_scale / SC
|
||||
* = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
|
||||
*
|
||||
* - sqazi@google.com
|
||||
*/
|
||||
|
||||
DECLARE_PER_CPU(unsigned long, cyc2ns);
|
||||
@ -41,9 +57,14 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
|
||||
|
||||
static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
|
||||
{
|
||||
unsigned long long quot;
|
||||
unsigned long long rem;
|
||||
int cpu = smp_processor_id();
|
||||
unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
|
||||
ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR;
|
||||
quot = (cyc >> CYC2NS_SCALE_FACTOR);
|
||||
rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1);
|
||||
ns += quot * per_cpu(cyc2ns, cpu) +
|
||||
((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR);
|
||||
return ns;
|
||||
}
|
||||
|
||||
|
@ -126,6 +126,8 @@ extern struct cred init_cred;
|
||||
# define INIT_PERF_EVENTS(tsk)
|
||||
#endif
|
||||
|
||||
#define INIT_TASK_COMM "swapper"
|
||||
|
||||
/*
|
||||
* INIT_TASK is used to set up the first task table, touch at
|
||||
* your own risk! Base=0, limit=0x1fffff (=2MB)
|
||||
@ -162,7 +164,7 @@ extern struct cred init_cred;
|
||||
.group_leader = &tsk, \
|
||||
RCU_INIT_POINTER(.real_cred, &init_cred), \
|
||||
RCU_INIT_POINTER(.cred, &init_cred), \
|
||||
.comm = "swapper", \
|
||||
.comm = INIT_TASK_COMM, \
|
||||
.thread = INIT_THREAD, \
|
||||
.fs = &init_fs, \
|
||||
.files = &init_files, \
|
||||
|
@ -71,6 +71,7 @@
|
||||
#include <linux/ctype.h>
|
||||
#include <linux/ftrace.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/init_task.h>
|
||||
|
||||
#include <asm/tlb.h>
|
||||
#include <asm/irq_regs.h>
|
||||
@ -4810,6 +4811,9 @@ EXPORT_SYMBOL(wait_for_completion);
|
||||
* This waits for either a completion of a specific task to be signaled or for a
|
||||
* specified timeout to expire. The timeout is in jiffies. It is not
|
||||
* interruptible.
|
||||
*
|
||||
* The return value is 0 if timed out, and positive (at least 1, or number of
|
||||
* jiffies left till timeout) if completed.
|
||||
*/
|
||||
unsigned long __sched
|
||||
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
|
||||
@ -4824,6 +4828,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
|
||||
*
|
||||
* This waits for completion of a specific task to be signaled. It is
|
||||
* interruptible.
|
||||
*
|
||||
* The return value is -ERESTARTSYS if interrupted, 0 if completed.
|
||||
*/
|
||||
int __sched wait_for_completion_interruptible(struct completion *x)
|
||||
{
|
||||
@ -4841,6 +4847,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
|
||||
*
|
||||
* This waits for either a completion of a specific task to be signaled or for a
|
||||
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
|
||||
*
|
||||
* The return value is -ERESTARTSYS if interrupted, 0 if timed out,
|
||||
* positive (at least 1, or number of jiffies left till timeout) if completed.
|
||||
*/
|
||||
long __sched
|
||||
wait_for_completion_interruptible_timeout(struct completion *x,
|
||||
@ -4856,6 +4865,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
|
||||
*
|
||||
* This waits to be signaled for completion of a specific task. It can be
|
||||
* interrupted by a kill signal.
|
||||
*
|
||||
* The return value is -ERESTARTSYS if interrupted, 0 if completed.
|
||||
*/
|
||||
int __sched wait_for_completion_killable(struct completion *x)
|
||||
{
|
||||
@ -4874,6 +4885,9 @@ EXPORT_SYMBOL(wait_for_completion_killable);
|
||||
* This waits for either a completion of a specific task to be
|
||||
* signaled or for a specified timeout to expire. It can be
|
||||
* interrupted by a kill signal. The timeout is in jiffies.
|
||||
*
|
||||
* The return value is -ERESTARTSYS if interrupted, 0 if timed out,
|
||||
* positive (at least 1, or number of jiffies left till timeout) if completed.
|
||||
*/
|
||||
long __sched
|
||||
wait_for_completion_killable_timeout(struct completion *x,
|
||||
@ -6099,6 +6113,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
|
||||
*/
|
||||
idle->sched_class = &idle_sched_class;
|
||||
ftrace_graph_init_idle_task(idle, cpu);
|
||||
#if defined(CONFIG_SMP)
|
||||
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -772,19 +772,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
|
||||
list_del_leaf_cfs_rq(cfs_rq);
|
||||
}
|
||||
|
||||
static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
|
||||
{
|
||||
long tg_weight;
|
||||
|
||||
/*
|
||||
* Use this CPU's actual weight instead of the last load_contribution
|
||||
* to gain a more accurate current total weight. See
|
||||
* update_cfs_rq_load_contribution().
|
||||
*/
|
||||
tg_weight = atomic_read(&tg->load_weight);
|
||||
tg_weight -= cfs_rq->load_contribution;
|
||||
tg_weight += cfs_rq->load.weight;
|
||||
|
||||
return tg_weight;
|
||||
}
|
||||
|
||||
static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
|
||||
{
|
||||
long load_weight, load, shares;
|
||||
long tg_weight, load, shares;
|
||||
|
||||
tg_weight = calc_tg_weight(tg, cfs_rq);
|
||||
load = cfs_rq->load.weight;
|
||||
|
||||
load_weight = atomic_read(&tg->load_weight);
|
||||
load_weight += load;
|
||||
load_weight -= cfs_rq->load_contribution;
|
||||
|
||||
shares = (tg->shares * load);
|
||||
if (load_weight)
|
||||
shares /= load_weight;
|
||||
if (tg_weight)
|
||||
shares /= tg_weight;
|
||||
|
||||
if (shares < MIN_SHARES)
|
||||
shares = MIN_SHARES;
|
||||
@ -1743,7 +1756,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
|
||||
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
|
||||
if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
|
||||
return;
|
||||
|
||||
__return_cfs_rq_runtime(cfs_rq);
|
||||
@ -2036,36 +2049,100 @@ static void task_waking_fair(struct task_struct *p)
|
||||
* Adding load to a group doesn't make a group heavier, but can cause movement
|
||||
* of group shares between cpus. Assuming the shares were perfectly aligned one
|
||||
* can calculate the shift in shares.
|
||||
*
|
||||
* Calculate the effective load difference if @wl is added (subtracted) to @tg
|
||||
* on this @cpu and results in a total addition (subtraction) of @wg to the
|
||||
* total group weight.
|
||||
*
|
||||
* Given a runqueue weight distribution (rw_i) we can compute a shares
|
||||
* distribution (s_i) using:
|
||||
*
|
||||
* s_i = rw_i / \Sum rw_j (1)
|
||||
*
|
||||
* Suppose we have 4 CPUs and our @tg is a direct child of the root group and
|
||||
* has 7 equal weight tasks, distributed as below (rw_i), with the resulting
|
||||
* shares distribution (s_i):
|
||||
*
|
||||
* rw_i = { 2, 4, 1, 0 }
|
||||
* s_i = { 2/7, 4/7, 1/7, 0 }
|
||||
*
|
||||
* As per wake_affine() we're interested in the load of two CPUs (the CPU the
|
||||
* task used to run on and the CPU the waker is running on), we need to
|
||||
* compute the effect of waking a task on either CPU and, in case of a sync
|
||||
* wakeup, compute the effect of the current task going to sleep.
|
||||
*
|
||||
* So for a change of @wl to the local @cpu with an overall group weight change
|
||||
* of @wg we can compute the new shares distribution (s'_i) using:
|
||||
*
|
||||
* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
|
||||
*
|
||||
* Suppose we're interested in CPUs 0 and 1, and want to compute the load
|
||||
* differences in waking a task to CPU 0. The additional task changes the
|
||||
* weight and shares distributions like:
|
||||
*
|
||||
* rw'_i = { 3, 4, 1, 0 }
|
||||
* s'_i = { 3/8, 4/8, 1/8, 0 }
|
||||
*
|
||||
* We can then compute the difference in effective weight by using:
|
||||
*
|
||||
* dw_i = S * (s'_i - s_i) (3)
|
||||
*
|
||||
* Where 'S' is the group weight as seen by its parent.
|
||||
*
|
||||
* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
|
||||
* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
|
||||
* 4/7) times the weight of the group.
|
||||
*/
|
||||
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
|
||||
{
|
||||
struct sched_entity *se = tg->se[cpu];
|
||||
|
||||
if (!tg->parent)
|
||||
if (!tg->parent) /* the trivial, non-cgroup case */
|
||||
return wl;
|
||||
|
||||
for_each_sched_entity(se) {
|
||||
long lw, w;
|
||||
long w, W;
|
||||
|
||||
tg = se->my_q->tg;
|
||||
w = se->my_q->load.weight;
|
||||
|
||||
/* use this cpu's instantaneous contribution */
|
||||
lw = atomic_read(&tg->load_weight);
|
||||
lw -= se->my_q->load_contribution;
|
||||
lw += w + wg;
|
||||
/*
|
||||
* W = @wg + \Sum rw_j
|
||||
*/
|
||||
W = wg + calc_tg_weight(tg, se->my_q);
|
||||
|
||||
wl += w;
|
||||
/*
|
||||
* w = rw_i + @wl
|
||||
*/
|
||||
w = se->my_q->load.weight + wl;
|
||||
|
||||
if (lw > 0 && wl < lw)
|
||||
wl = (wl * tg->shares) / lw;
|
||||
/*
|
||||
* wl = S * s'_i; see (2)
|
||||
*/
|
||||
if (W > 0 && w < W)
|
||||
wl = (w * tg->shares) / W;
|
||||
else
|
||||
wl = tg->shares;
|
||||
|
||||
/* zero point is MIN_SHARES */
|
||||
/*
|
||||
* Per the above, wl is the new se->load.weight value; since
|
||||
* those are clipped to [MIN_SHARES, ...) do so now. See
|
||||
* calc_cfs_shares().
|
||||
*/
|
||||
if (wl < MIN_SHARES)
|
||||
wl = MIN_SHARES;
|
||||
|
||||
/*
|
||||
* wl = dw_i = S * (s'_i - s_i); see (3)
|
||||
*/
|
||||
wl -= se->load.weight;
|
||||
|
||||
/*
|
||||
* Recursively apply this logic to all parent groups to compute
|
||||
* the final effective load change on the root group. Since
|
||||
* only the @tg group gets extra weight, all parent groups can
|
||||
* only redistribute existing shares. @wl is the shift in shares
|
||||
* resulting from this level per the above.
|
||||
*/
|
||||
wg = 0;
|
||||
}
|
||||
|
||||
@ -2249,7 +2326,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
|
||||
int cpu = smp_processor_id();
|
||||
int prev_cpu = task_cpu(p);
|
||||
struct sched_domain *sd;
|
||||
int i;
|
||||
struct sched_group *sg;
|
||||
int i, smt = 0;
|
||||
|
||||
/*
|
||||
* If the task is going to be woken-up on this cpu and if it is
|
||||
@ -2269,25 +2347,38 @@ static int select_idle_sibling(struct task_struct *p, int target)
|
||||
* Otherwise, iterate the domains and find an eligible idle cpu.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
again:
|
||||
for_each_domain(target, sd) {
|
||||
if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
|
||||
break;
|
||||
if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
|
||||
continue;
|
||||
|
||||
for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) {
|
||||
if (idle_cpu(i)) {
|
||||
target = i;
|
||||
break;
|
||||
if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) {
|
||||
if (!smt) {
|
||||
smt = 1;
|
||||
goto again;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* Lets stop looking for an idle sibling when we reached
|
||||
* the domain that spans the current cpu and prev_cpu.
|
||||
*/
|
||||
if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
|
||||
cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
|
||||
break;
|
||||
sg = sd->groups;
|
||||
do {
|
||||
if (!cpumask_intersects(sched_group_cpus(sg),
|
||||
tsk_cpus_allowed(p)))
|
||||
goto next;
|
||||
|
||||
for_each_cpu(i, sched_group_cpus(sg)) {
|
||||
if (!idle_cpu(i))
|
||||
goto next;
|
||||
}
|
||||
|
||||
target = cpumask_first_and(sched_group_cpus(sg),
|
||||
tsk_cpus_allowed(p));
|
||||
goto done;
|
||||
next:
|
||||
sg = sg->next;
|
||||
} while (sg != sd->groups);
|
||||
}
|
||||
done:
|
||||
rcu_read_unlock();
|
||||
|
||||
return target;
|
||||
@ -3511,7 +3602,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
|
||||
}
|
||||
|
||||
/**
|
||||
* update_sd_lb_stats - Update sched_group's statistics for load balancing.
|
||||
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
|
||||
* @sd: sched_domain whose statistics are to be updated.
|
||||
* @this_cpu: Cpu for which load balance is currently performed.
|
||||
* @idle: Idle status of this_cpu
|
||||
|
@ -67,3 +67,4 @@ SCHED_FEAT(NONTASK_POWER, 1)
|
||||
SCHED_FEAT(TTWU_QUEUE, 1)
|
||||
|
||||
SCHED_FEAT(FORCE_SD_OVERLAP, 0)
|
||||
SCHED_FEAT(RT_RUNTIME_SHARE, 1)
|
||||
|
@ -560,6 +560,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
|
||||
{
|
||||
int more = 0;
|
||||
|
||||
if (!sched_feat(RT_RUNTIME_SHARE))
|
||||
return more;
|
||||
|
||||
if (rt_rq->rt_time > rt_rq->rt_runtime) {
|
||||
raw_spin_unlock(&rt_rq->rt_runtime_lock);
|
||||
more = do_balance_runtime(rt_rq);
|
||||
|
Loading…
Reference in New Issue
Block a user