sched_ext: Fixes for v6.12-rc7

- The fair sched class currently has a bug where its balance() returns true
   telling the sched core that it has tasks to run but then NULL from
   pick_task(). This makes sched core call sched_ext's pick_task() without
   preceding balance() which can lead to stalls in partial mode. For now,
   work around by detecting the condition and forcing the CPU to go through
   another scheduling cycle.
 
 - Add a missing newline to an error message and fix drgn introspection tool
   which went out of sync.
 -----BEGIN PGP SIGNATURE-----
 
 iIQEABYKACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCZzI8sw4cdGpAa2VybmVs
 Lm9yZwAKCRCxYfJx3gVYGb5KAP40b/o6TyAFDG+Hn6GxyxQT7rcAUMXsdB2bcEpg
 /IjmzQEAwbHU5KP5vQXV6XHv+2V7Rs7u6ZqFtDnL88N0A9hf3wk=
 =7hL8
 -----END PGP SIGNATURE-----

Merge tag 'sched_ext-for-6.12-rc7-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

 - The fair sched class currently has a bug where its balance() returns
   true telling the sched core that it has tasks to run but then NULL
   from pick_task(). This makes sched core call sched_ext's pick_task()
   without preceding balance() which can lead to stalls in partial mode.

   For now, work around by detecting the condition and forcing the CPU
   to go through another scheduling cycle.

 - Add a missing newline to an error message and fix drgn introspection
   tool which went out of sync.

* tag 'sched_ext-for-6.12-rc7-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()
  sched_ext: Update scx_show_state.py to match scx_ops_bypass_depth's new type
  sched_ext: Add a missing newline at the end of an error message
This commit is contained in:
Linus Torvalds 2024-11-11 14:09:57 -08:00
commit 3022e9d00e
4 changed files with 44 additions and 22 deletions

View File

@ -5920,12 +5920,15 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
#ifdef CONFIG_SCHED_CLASS_EXT
/*
* SCX requires a balance() call before every pick_next_task() including
* when waking up from SCHED_IDLE. If @start_class is below SCX, start
* from SCX instead.
* SCX requires a balance() call before every pick_task() including when
* waking up from SCHED_IDLE. If @start_class is below SCX, start from
* SCX instead. Also, set a flag to detect missing balance() call.
*/
if (scx_enabled() && sched_class_above(&ext_sched_class, start_class))
start_class = &ext_sched_class;
if (scx_enabled()) {
rq->scx.flags |= SCX_RQ_BAL_PENDING;
if (sched_class_above(&ext_sched_class, start_class))
start_class = &ext_sched_class;
}
#endif
/*

View File

@ -2634,7 +2634,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
lockdep_assert_rq_held(rq);
rq->scx.flags |= SCX_RQ_IN_BALANCE;
rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
unlikely(rq->scx.cpu_released)) {
@ -2948,12 +2948,11 @@ static struct task_struct *pick_task_scx(struct rq *rq)
{
struct task_struct *prev = rq->curr;
struct task_struct *p;
bool prev_on_scx = prev->sched_class == &ext_sched_class;
bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
bool kick_idle = false;
/*
* If balance_scx() is telling us to keep running @prev, replenish slice
* if necessary and keep running @prev. Otherwise, pop the first one
* from the local DSQ.
*
* WORKAROUND:
*
* %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
@ -2962,22 +2961,41 @@ static struct task_struct *pick_task_scx(struct rq *rq)
* which then ends up calling pick_task_scx() without preceding
* balance_scx().
*
* For now, ignore cases where $prev is not on SCX. This isn't great and
* can theoretically lead to stalls. However, for switch_all cases, this
* happens only while a BPF scheduler is being loaded or unloaded, and,
* for partial cases, fair will likely keep triggering this CPU.
* Keep running @prev if possible and avoid stalling from entering idle
* without balancing.
*
* Once fair is fixed, restore WARN_ON_ONCE().
* Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
* if pick_task_scx() is called without preceding balance_scx().
*/
if ((rq->scx.flags & SCX_RQ_BAL_KEEP) &&
prev->sched_class == &ext_sched_class) {
if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
if (prev_on_scx) {
keep_prev = true;
} else {
keep_prev = false;
kick_idle = true;
}
} else if (unlikely(keep_prev && !prev_on_scx)) {
/* only allowed during transitions */
WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
keep_prev = false;
}
/*
* If balance_scx() is telling us to keep running @prev, replenish slice
* if necessary and keep running @prev. Otherwise, pop the first one
* from the local DSQ.
*/
if (keep_prev) {
p = prev;
if (!p->scx.slice)
p->scx.slice = SCX_SLICE_DFL;
} else {
p = first_local_task(rq);
if (!p)
if (!p) {
if (kick_idle)
scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
return NULL;
}
if (unlikely(!p->scx.slice)) {
if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
@ -4979,7 +4997,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
cpu_possible_mask)) {
pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation");
pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
return -EINVAL;
}

View File

@ -751,8 +751,9 @@ enum scx_rq_flags {
*/
SCX_RQ_ONLINE = 1 << 0,
SCX_RQ_CAN_STOP_TICK = 1 << 1,
SCX_RQ_BAL_KEEP = 1 << 2, /* balance decided to keep current */
SCX_RQ_BYPASSING = 1 << 3,
SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */
SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
SCX_RQ_BYPASSING = 1 << 4,
SCX_RQ_IN_WAKEUP = 1 << 16,
SCX_RQ_IN_BALANCE = 1 << 17,

View File

@ -35,6 +35,6 @@ print(f'enabled : {read_static_key("__scx_ops_enabled")}')
print(f'switching_all : {read_int("scx_switching_all")}')
print(f'switched_all : {read_static_key("__scx_switched_all")}')
print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})')
print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}')
print(f'bypass_depth : {prog["scx_ops_bypass_depth"].value_()}')
print(f'nr_rejected : {read_atomic("scx_nr_rejected")}')
print(f'enable_seq : {read_atomic("scx_enable_seq")}')