
Merge tag 'sched_urgent_for_v6.7_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Borislav Petkov:

 - Fix virtual runtime calculation when recomputing a sched entity's
   weights

 - Fix wrongly rejected unprivileged poll requests to the cgroup psi
   pressure files

 - Make sure the load balancing is done by only one CPU

* tag 'sched_urgent_for_v6.7_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Fix the decision for load balance
  sched: psi: fix unprivileged polling against cgroups
  sched/eevdf: Fix vruntime adjustment on reweight
Linus Torvalds 2023-11-19 13:32:00 -08:00
commit 2a0adc4954
2 changed files with 135 additions and 38 deletions

kernel/cgroup/cgroup.c

@@ -3885,14 +3885,6 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
         return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
 }
 
-static int cgroup_pressure_open(struct kernfs_open_file *of)
-{
-        if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
-                return -EPERM;
-
-        return 0;
-}
-
 static void cgroup_pressure_release(struct kernfs_open_file *of)
 {
         struct cgroup_file_ctx *ctx = of->priv;
@@ -5299,7 +5291,6 @@ static struct cftype cgroup_psi_files[] = {
         {
                 .name = "io.pressure",
                 .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
-                .open = cgroup_pressure_open,
                 .seq_show = cgroup_io_pressure_show,
                 .write = cgroup_io_pressure_write,
                 .poll = cgroup_pressure_poll,
@@ -5308,7 +5299,6 @@ static struct cftype cgroup_psi_files[] = {
         {
                 .name = "memory.pressure",
                 .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
-                .open = cgroup_pressure_open,
                 .seq_show = cgroup_memory_pressure_show,
                 .write = cgroup_memory_pressure_write,
                 .poll = cgroup_pressure_poll,
@@ -5317,7 +5307,6 @@ static struct cftype cgroup_psi_files[] = {
         {
                 .name = "cpu.pressure",
                 .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
-                .open = cgroup_pressure_open,
                 .seq_show = cgroup_cpu_pressure_show,
                 .write = cgroup_cpu_pressure_write,
                 .poll = cgroup_pressure_poll,
@@ -5327,7 +5316,6 @@ static struct cftype cgroup_psi_files[] = {
         {
                 .name = "irq.pressure",
                 .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
-                .open = cgroup_pressure_open,
                 .seq_show = cgroup_irq_pressure_show,
                 .write = cgroup_irq_pressure_write,
                 .poll = cgroup_pressure_poll,
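
In userspace terms, dropping the open-time CAP_SYS_RESOURCE check means an unprivileged process can again register a PSI trigger on a cgroup pressure file and poll() it. The following is a minimal sketch of that flow, using the documented "some <stall usecs> <window usecs>" trigger format; the cgroup path is a made-up example, and unprivileged triggers remain subject to psi's own window-size limits, which are enforced elsewhere and are not part of this diff.

/* Minimal sketch: unprivileged PSI polling on a cgroup pressure file.
 * The cgroup path below is an example; adjust it to a cgroup you own. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* Notify when CPU "some" stall exceeds 100ms within a 2s window. */
        const char trig[] = "some 100000 2000000";
        struct pollfd fds;

        fds.fd = open("/sys/fs/cgroup/mygroup/cpu.pressure", O_RDWR | O_NONBLOCK);
        if (fds.fd < 0) {
                perror("open");
                return 1;
        }

        /* Registering the trigger needs a write-mode fd; before this fix the
         * cgroup open() handler rejected this without CAP_SYS_RESOURCE. */
        if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
                perror("write");
                return 1;
        }

        fds.events = POLLPRI;
        for (;;) {
                int n = poll(&fds, 1, -1);

                if (n < 0) {
                        perror("poll");
                        return 1;
                }
                if (fds.revents & POLLERR) {
                        fprintf(stderr, "pressure file went away\n");
                        return 1;
                }
                if (fds.revents & POLLPRI)
                        printf("CPU pressure event\n");
        }
}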

kernel/sched/fair.c

@@ -3666,41 +3666,140 @@ static inline void
 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 #endif
 
+static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
+                           unsigned long weight)
+{
+        unsigned long old_weight = se->load.weight;
+        u64 avruntime = avg_vruntime(cfs_rq);
+        s64 vlag, vslice;
+
+        /*
+         * VRUNTIME
+         * ========
+         *
+         * COROLLARY #1: The virtual runtime of the entity needs to be
+         * adjusted if re-weight at !0-lag point.
+         *
+         * Proof: For contradiction assume this is not true, so we can
+         * re-weight without changing vruntime at !0-lag point.
+         *
+         *             Weight   VRuntime   Avg-VRuntime
+         *     before     w         v           V
+         *      after     w'        v'          V'
+         *
+         * Since lag needs to be preserved through re-weight:
+         *
+         *      lag = (V - v)*w = (V'- v')*w', where v = v'
+         *      ==> V' = (V - v)*w/w' + v                       (1)
+         *
+         * Let W be the total weight of the entities before reweight,
+         * since V' is the new weighted average of entities:
+         *
+         *      V' = (WV + w'v - wv) / (W + w' - w)             (2)
+         *
+         * by using (1) & (2) we obtain:
+         *
+         *      (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+         *      ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+         *      ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+         *      ==> (V - v)*W/(W + w' - w) = (V - v)*w/w'       (3)
+         *
+         * Since we are doing at !0-lag point which means V != v, we
+         * can simplify (3):
+         *
+         *      ==> W / (W + w' - w) = w / w'
+         *      ==> Ww' = Ww + ww' - ww
+         *      ==> W * (w' - w) = w * (w' - w)
+         *      ==> W = w       (re-weight indicates w' != w)
+         *
+         * So the cfs_rq contains only one entity, hence vruntime of
+         * the entity @v should always equal to the cfs_rq's weighted
+         * average vruntime @V, which means we will always re-weight
+         * at 0-lag point, thus breach assumption. Proof completed.
+         *
+         *
+         * COROLLARY #2: Re-weight does NOT affect weighted average
+         * vruntime of all the entities.
+         *
+         * Proof: According to corollary #1, Eq. (1) should be:
+         *
+         *      (V - v)*w = (V' - v')*w'
+         *      ==> v' = V' - (V - v)*w/w'                      (4)
+         *
+         * According to the weighted average formula, we have:
+         *
+         *      V' = (WV - wv + w'v') / (W - w + w')
+         *         = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+         *         = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+         *         = (WV + w'V' - Vw) / (W - w + w')
+         *
+         *      ==> V'*(W - w + w') = WV + w'V' - Vw
+         *      ==> V' * (W - w) = (W - w) * V                  (5)
+         *
+         * If the entity is the only one in the cfs_rq, then reweight
+         * always occurs at 0-lag point, so V won't change. Or else
+         * there are other entities, hence W != w, then Eq. (5) turns
+         * into V' = V. So V won't change in either case, proof done.
+         *
+         *
+         * So according to corollary #1 & #2, the effect of re-weight
+         * on vruntime should be:
+         *
+         *      v' = V' - (V - v) * w / w'                      (4)
+         *         = V  - (V - v) * w / w'
+         *         = V  - vl * w / w'
+         *         = V  - vl'
+         */
+        if (avruntime != se->vruntime) {
+                vlag = (s64)(avruntime - se->vruntime);
+                vlag = div_s64(vlag * old_weight, weight);
+                se->vruntime = avruntime - vlag;
+        }
+
+        /*
+         * DEADLINE
+         * ========
+         *
+         * When the weight changes, the virtual time slope changes and
+         * we should adjust the relative virtual deadline accordingly.
+         *
+         *      d' = v' + (d - v)*w/w'
+         *         = V' - (V - v)*w/w' + (d - v)*w/w'
+         *         = V  - (V - v)*w/w' + (d - v)*w/w'
+         *         = V  + (d - V)*w/w'
+         */
+        vslice = (s64)(se->deadline - avruntime);
+        vslice = div_s64(vslice * old_weight, weight);
+        se->deadline = avruntime + vslice;
+}
+
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                             unsigned long weight)
 {
-        unsigned long old_weight = se->load.weight;
+        bool curr = cfs_rq->curr == se;
 
         if (se->on_rq) {
                 /* commit outstanding execution time */
-                if (cfs_rq->curr == se)
+                if (curr)
                         update_curr(cfs_rq);
                 else
-                        avg_vruntime_sub(cfs_rq, se);
+                        __dequeue_entity(cfs_rq, se);
                 update_load_sub(&cfs_rq->load, se->load.weight);
         }
         dequeue_load_avg(cfs_rq, se);
 
-        update_load_set(&se->load, weight);
-
         if (!se->on_rq) {
                 /*
                  * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
                  * we need to scale se->vlag when w_i changes.
                  */
-                se->vlag = div_s64(se->vlag * old_weight, weight);
+                se->vlag = div_s64(se->vlag * se->load.weight, weight);
         } else {
-                s64 deadline = se->deadline - se->vruntime;
-                /*
-                 * When the weight changes, the virtual time slope changes and
-                 * we should adjust the relative virtual deadline accordingly.
-                 */
-                deadline = div_s64(deadline * old_weight, weight);
-                se->deadline = se->vruntime + deadline;
-                if (se != cfs_rq->curr)
-                        min_deadline_cb_propagate(&se->run_node, NULL);
+                reweight_eevdf(cfs_rq, se, weight);
         }
 
+        update_load_set(&se->load, weight);
+
 #ifdef CONFIG_SMP
         do {
                 u32 divider = get_pelt_divider(&se->avg);
@@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 
         enqueue_load_avg(cfs_rq, se);
         if (se->on_rq) {
                 update_load_add(&cfs_rq->load, se->load.weight);
-                if (cfs_rq->curr != se)
-                        avg_vruntime_add(cfs_rq, se);
+                if (!curr) {
+                        /*
+                         * The entity's vruntime has been adjusted, so let's check
+                         * whether the rq-wide min_vruntime needs updated too. Since
+                         * the calculations above require stable min_vruntime rather
+                         * than up-to-date one, we do the update at the end of the
+                         * reweight process.
+                         */
+                        __enqueue_entity(cfs_rq, se);
+                        update_min_vruntime(cfs_rq);
+                }
         }
 }
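
To make the arithmetic above concrete: with made-up numbers, and with the kernel's div_s64() replaced by plain 64-bit division, the reweight applies v' = V - (V - v)*w/w' and d' = V + (d - V)*w/w', preserving the weighted lag (V - v)*w across the weight change. A standalone sketch:

/* Standalone illustration of the reweight math above; the numbers are
 * invented and div_s64() is replaced by ordinary signed division. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        int64_t V = 1000;       /* avg_vruntime of the cfs_rq                */
        int64_t v = 700;        /* entity's vruntime (lag = V - v = 300)     */
        int64_t d = 1900;       /* entity's virtual deadline                 */
        int64_t w = 1024;       /* old weight                                */
        int64_t w_new = 2048;   /* new weight                                */

        /* v' = V - (V - v) * w / w': the lag shrinks as the weight grows,
         * so the weighted lag (V - v) * w is preserved. */
        int64_t v_new = V - (V - v) * w / w_new;

        /* d' = V + (d - V) * w / w': the relative deadline scales with the
         * new virtual time slope. */
        int64_t d_new = V + (d - V) * w / w_new;

        printf("v' = %lld, d' = %lld\n", (long long)v_new, (long long)d_new);
        /* prints v' = 850, d' = 1450: (V - v')*w' = 150*2048 = 300*1024 */
        return 0;
}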
@@ -3857,14 +3965,11 @@ static void update_cfs_group(struct sched_entity *se)
 
 #ifndef CONFIG_SMP
         shares = READ_ONCE(gcfs_rq->tg->shares);
-
-        if (likely(se->load.weight == shares))
-                return;
 #else
-        shares   = calc_group_shares(gcfs_rq);
+        shares = calc_group_shares(gcfs_rq);
 #endif
-
-        reweight_entity(cfs_rq_of(se), se, shares);
+        if (unlikely(se->load.weight != shares))
+                reweight_entity(cfs_rq_of(se), se, shares);
 }
 
 #else /* CONFIG_FAIR_GROUP_SCHED */
@@ -11079,12 +11184,16 @@ static int should_we_balance(struct lb_env *env)
                         continue;
                 }
 
-                /* Are we the first idle CPU? */
+                /*
+                 * Are we the first idle core in a non-SMT domain or higher,
+                 * or the first idle CPU in a SMT domain?
+                 */
                 return cpu == env->dst_cpu;
         }
 
-        if (idle_smt == env->dst_cpu)
-                return true;
+        /* Are we the first idle CPU with busy siblings? */
+        if (idle_smt != -1)
+                return idle_smt == env->dst_cpu;
 
         /* Are we the first CPU of this group ? */
         return group_balance_cpu(sg) == env->dst_cpu;
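
The intent of the should_we_balance() change is that exactly one CPU in the group concludes it should run the balance: the first idle CPU on a fully idle core, otherwise the first idle CPU with busy siblings, otherwise the group's designated balance CPU. The sketch below is a simplified user-space model of that priority order; the struct, helper name, and CPU data are invented for illustration, and it ignores details such as the env->cpus mask and SMT sibling skipping.

/* Simplified model of the fixed decision order in should_we_balance():
 * 1) first idle CPU whose whole core is idle,
 * 2) otherwise the first idle CPU (its siblings are busy),
 * 3) otherwise the group's designated balance CPU (first CPU here).
 * All names and data are hypothetical; this is not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct model_cpu {
        int id;
        bool idle;              /* this CPU is idle              */
        bool core_idle;         /* all SMT siblings idle as well */
};

static bool should_we_balance_model(const struct model_cpu *grp, int n, int dst)
{
        int idle_smt = -1;

        for (int i = 0; i < n; i++) {
                if (!grp[i].idle)
                        continue;

                if (!grp[i].core_idle) {
                        /* remember the first idle CPU with busy siblings */
                        if (idle_smt == -1)
                                idle_smt = grp[i].id;
                        continue;
                }

                /* first idle CPU on a fully idle core wins */
                return grp[i].id == dst;
        }

        /* no idle core: only the remembered idle-SMT CPU may balance */
        if (idle_smt != -1)
                return idle_smt == dst;

        /* everyone busy: fall back to the group's first CPU */
        return grp[0].id == dst;
}

int main(void)
{
        /* No fully idle core: CPU 0 is busy, its sibling CPU 1 is idle,
         * and the second core (CPUs 2 and 3) is entirely busy. */
        struct model_cpu grp[] = {
                { .id = 0, .idle = false, .core_idle = false },
                { .id = 1, .idle = true,  .core_idle = false },
                { .id = 2, .idle = false, .core_idle = false },
                { .id = 3, .idle = false, .core_idle = false },
        };

        for (int dst = 0; dst < 4; dst++)
                printf("dst=%d -> %s\n", dst,
                       should_we_balance_model(grp, 4, dst) ? "balance" : "skip");
        /* Only CPU 1 balances. With the pre-fix code, CPU 1 returned true via
         * the idle_smt check and CPU 0 also returned true via the
         * group_balance_cpu() fallback, so two CPUs could balance at once. */
        return 0;
}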