Scheduler updates:

- migrate_disable/enable() support which originates from the RT tree and
    is now a prerequisite for the new preemptible kmap_local() API which aims
    to replace kmap_atomic().
 
  - A fair amount of topology and NUMA related improvements
 
  - Improvements for the frequency invariant calculations
 
  - Enhanced robustness for the global CPU priority tracking and decision
    making
 
  - The usual small fixes and enhancements all over the place
 -----BEGIN PGP SIGNATURE-----
 
 iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAl/XwK4THHRnbHhAbGlu
 dXRyb25peC5kZQAKCRCmGPVMDXSYoX28D/9cVrvziSQGfBfuQWnUiw8iOIq1QBa2
 Me+Tvenhfrlt7xU6rbP9ciFu7eTN+fS06m5uQPGI+t22WuJmHzbmw1bJVXfkvYfI
 /QoU+Hg7DkDAn1p7ZKXh0dRkV0nI9ixxSHl0E+Zf1ATBxCUMV2SO85flg6z/4qJq
 3VWUye0dmR7/bhtkIjv5rwce9v2JB2g1AbgYXYTW9lHVoUdGoMSdiZAF4tGyHLnx
 sJ6DMqQ+k+dmPyYO0z5MTzjW/fXit4n9w2e3z9TvRH/uBu58WSW1RBmQYX6aHBAg
 dhT9F4lvTs6lJY23x5RSFWDOv6xAvKF5a0xfb8UZcyH5EoLYrPRvm42a0BbjdeRa
 u0z7LbwIlKA+RFdZzFZWz8UvvO0ljyMjmiuqZnZ5dY9Cd80LSBuxrWeQYG0qg6lR
 Y2povhhCepEG+q8AXIe2YjHKWKKC1s/l/VY3CNnCzcd21JPQjQ4Z5eWGmHif5IED
 CntaeFFhZadR3w02tkX35zFmY3w4soKKrbI4EKWrQwd+cIEQlOSY7dEPI/b5BbYj
 MWAb3P4EG9N77AWTNmbhK4nN0brEYb+rBbCA+5dtNBVhHTxAC7OTWElJOC2O66FI
 e06dREjvwYtOkRUkUguWwErbIai2gJ2MH0VILV3hHoh64oRk7jjM8PZYnjQkdptQ
 Gsq0rJW5iiu/OQ==
 =Oz1V
 -----END PGP SIGNATURE-----

Merge tag 'sched-core-2020-12-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Thomas Gleixner:

 - migrate_disable/enable() support which originates from the RT tree
   and is now a prerequisite for the new preemptible kmap_local() API
   which aims to replace kmap_atomic().

 - A fair amount of topology and NUMA related improvements

 - Improvements for the frequency invariant calculations

 - Enhanced robustness for the global CPU priority tracking and decision
   making

 - The usual small fixes and enhancements all over the place

* tag 'sched-core-2020-12-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (61 commits)
  sched/fair: Trivial correction of the newidle_balance() comment
  sched/fair: Clear SMT siblings after determining the core is not idle
  sched: Fix kernel-doc markup
  x86: Print ratio freq_max/freq_base used in frequency invariance calculations
  x86, sched: Use midpoint of max_boost and max_P for frequency invariance on AMD EPYC
  x86, sched: Calculate frequency invariance for AMD systems
  irq_work: Optimize irq_work_single()
  smp: Cleanup smp_call_function*()
  irq_work: Cleanup
  sched: Limit the amount of NUMA imbalance that can exist at fork time
  sched/numa: Allow a floating imbalance between NUMA nodes
  sched: Avoid unnecessary calculation of load imbalance at clone time
  sched/numa: Rename nr_running and break out the magic number
  sched: Make migrate_disable/enable() independent of RT
  sched/topology: Condition EAS enablement on FIE support
  arm64: Rebuild sched domains on invariance status changes
  sched/topology,schedutil: Wrap sched domains rebuild
  sched/uclamp: Allow to reset a task uclamp constraint value
  sched/core: Fix typos in comments
  Documentation: scheduler: fix information on arch SD flags, sched_domain and sched_debug
  ...
This commit is contained in:
Linus Torvalds 2020-12-14 18:29:11 -08:00
commit adb35e8dc9
59 changed files with 1929 additions and 642 deletions

View File

@ -65,21 +65,17 @@ of the SMP domain will span the entire machine, with each group having the
cpumask of a node. Or, you could do multi-level NUMA or Opteron, for example,
might have just one domain covering its one NUMA level.
The implementor should read comments in include/linux/sched.h:
struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of
the specifics and what to tune.
The implementor should read comments in include/linux/sched/sd_flags.h:
SD_* to get an idea of the specifics and what to tune for the SD flags
of a sched_domain.
Architectures may retain the regular override the default SD_*_INIT flags
while using the generic domain builder in kernel/sched/core.c if they wish to
retain the traditional SMT->SMP->NUMA topology (or some subset of that). This
can be done by #define'ing ARCH_HASH_SCHED_TUNE.
Alternatively, the architecture may completely override the generic domain
builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your
arch_init_sched_domains function. This function will attach domains to all
CPUs using cpu_attach_domain.
Architectures may override the generic domain builder and the default SD flags
for a given topology level by creating a sched_domain_topology_level array and
calling set_sched_topology() with this array as the parameter.
The sched-domains debugging infrastructure can be enabled by enabling
CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains
which should catch most possible errors (described above). It also prints out
the domain structure in a visual format.
CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. If you forgot to
tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug
knob. This enables an error checking parse of the sched domains which should
catch most possible errors (described above). It also prints out the domain
structure in a visual format.

View File

@ -223,6 +223,7 @@ static DEFINE_STATIC_KEY_FALSE(amu_fie_key);
static int __init init_amu_fie(void)
{
bool invariance_status = topology_scale_freq_invariant();
cpumask_var_t valid_cpus;
bool have_policy = false;
int ret = 0;
@ -269,6 +270,15 @@ static int __init init_amu_fie(void)
if (!topology_scale_freq_invariant())
static_branch_disable(&amu_fie_key);
/*
* Task scheduler behavior depends on frequency invariance support,
* either cpufreq or counter driven. If the support status changes as
* a result of counter initialisation and use, retrigger the build of
* scheduling domains to ensure the information is propagated properly.
*/
if (invariance_status != topology_scale_freq_invariant())
rebuild_sched_domains_energy();
free_valid_mask:
free_cpumask_var(valid_cpus);

View File

@ -702,7 +702,6 @@ unsigned long arch_align_stack(unsigned long sp)
return sp & ALMASK;
}
static DEFINE_PER_CPU(call_single_data_t, backtrace_csd);
static struct cpumask backtrace_csd_busy;
static void handle_backtrace(void *info)
@ -711,6 +710,9 @@ static void handle_backtrace(void *info)
cpumask_clear_cpu(smp_processor_id(), &backtrace_csd_busy);
}
static DEFINE_PER_CPU(call_single_data_t, backtrace_csd) =
CSD_INIT(handle_backtrace, NULL);
static void raise_backtrace(cpumask_t *mask)
{
call_single_data_t *csd;
@ -730,7 +732,6 @@ static void raise_backtrace(cpumask_t *mask)
}
csd = &per_cpu(backtrace_csd, cpu);
csd->func = handle_backtrace;
smp_call_function_single_async(cpu, csd);
}
}

View File

@ -687,7 +687,13 @@ EXPORT_SYMBOL(flush_tlb_one);
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd);
static void tick_broadcast_callee(void *info)
{
tick_receive_broadcast();
}
static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd) =
CSD_INIT(tick_broadcast_callee, NULL);
void tick_broadcast(const struct cpumask *mask)
{
@ -700,23 +706,4 @@ void tick_broadcast(const struct cpumask *mask)
}
}
static void tick_broadcast_callee(void *info)
{
tick_receive_broadcast();
}
static int __init tick_broadcast_init(void)
{
call_single_data_t *csd;
int cpu;
for (cpu = 0; cpu < NR_CPUS; cpu++) {
csd = &per_cpu(tick_broadcast_csd, cpu);
csd->func = tick_broadcast_callee;
}
return 0;
}
early_initcall(tick_broadcast_init);
#endif /* CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */

View File

@ -179,9 +179,7 @@ static void zpci_handle_fallback_irq(void)
if (atomic_inc_return(&cpu_data->scheduled) > 1)
continue;
cpu_data->csd.func = zpci_handle_remote_irq;
cpu_data->csd.info = &cpu_data->scheduled;
cpu_data->csd.flags = 0;
INIT_CSD(&cpu_data->csd, zpci_handle_remote_irq, &cpu_data->scheduled);
smp_call_function_single_async(cpu, &cpu_data->csd);
}
}

View File

@ -218,4 +218,9 @@ static inline void arch_set_max_freq_ratio(bool turbo_disabled)
}
#endif
#ifdef CONFIG_ACPI_CPPC_LIB
void init_freq_invariance_cppc(void);
#define init_freq_invariance_cppc init_freq_invariance_cppc
#endif
#endif /* _ASM_X86_TOPOLOGY_H */

View File

@ -74,10 +74,9 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
init_completion(&cmd.done);
for (; count; count -= 16) {
call_single_data_t csd = {
.func = cpuid_smp_cpuid,
.info = &cmd,
};
call_single_data_t csd;
INIT_CSD(&csd, cpuid_smp_cpuid, &cmd);
cmd.regs.eax = pos;
cmd.regs.ecx = pos >> 32;

View File

@ -82,6 +82,10 @@
#include <asm/hw_irq.h>
#include <asm/stackprotector.h>
#ifdef CONFIG_ACPI_CPPC_LIB
#include <acpi/cppc_acpi.h>
#endif
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
@ -148,7 +152,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
}
static void init_freq_invariance(bool secondary);
static void init_freq_invariance(bool secondary, bool cppc_ready);
/*
* Report back to the Boot Processor during boot time or to the caller processor
@ -186,7 +190,7 @@ static void smp_callin(void)
*/
set_cpu_sibling_map(raw_smp_processor_id());
init_freq_invariance(true);
init_freq_invariance(true, false);
/*
* Get our bogomips.
@ -1341,7 +1345,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_sched_topology(x86_topology);
set_cpu_sibling_map(0);
init_freq_invariance(false);
init_freq_invariance(false, false);
smp_sanity_check();
switch (apic_intr_mode) {
@ -2028,6 +2032,48 @@ out:
return true;
}
#ifdef CONFIG_ACPI_CPPC_LIB
/*
 * amd_set_max_freq_ratio - derive the turbo frequency ratio from CPPC data
 *
 * Reads the performance capabilities of CPU 0 through cppc_get_perf_caps()
 * and installs, in arch_turbo_freq_ratio, the midpoint between the
 * highest/nominal performance ratio (scaled by SCHED_CAPACITY_SCALE) and
 * SCHED_CAPACITY_SCALE itself, then calls arch_set_max_freq_ratio(false).
 *
 * Return: true when a ratio was computed and installed; false when the
 * capabilities could not be read, either value is zero, or the scaled
 * highest/nominal ratio truncates to zero.
 */
static bool amd_set_max_freq_ratio(void)
{
	struct cppc_perf_caps perf_caps;
	u64 highest_perf, nominal_perf;
	u64 perf_ratio;
	int rc;

	rc = cppc_get_perf_caps(0, &perf_caps);
	if (rc) {
		pr_debug("Could not retrieve perf counters (%d)\n", rc);
		return false;
	}

	highest_perf = perf_caps.highest_perf;
	nominal_perf = perf_caps.nominal_perf;

	if (!highest_perf || !nominal_perf) {
		pr_debug("Could not retrieve highest or nominal performance\n");
		return false;
	}

	perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
	/*
	 * Validate the raw ratio before averaging: once SCHED_CAPACITY_SCALE
	 * has been added in the midpoint step below the value can no longer
	 * be zero, so checking afterwards would never catch a bogus ratio.
	 */
	if (!perf_ratio) {
		pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n");
		return false;
	}

	/* midpoint between max_boost and max_P */
	perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;

	arch_turbo_freq_ratio = perf_ratio;
	arch_set_max_freq_ratio(false);

	return true;
}
#else
static bool amd_set_max_freq_ratio(void)
{
return false;
}
#endif
static void init_counter_refs(void)
{
u64 aperf, mperf;
@ -2039,7 +2085,7 @@ static void init_counter_refs(void)
this_cpu_write(arch_prev_mperf, mperf);
}
static void init_freq_invariance(bool secondary)
static void init_freq_invariance(bool secondary, bool cppc_ready)
{
bool ret = false;
@ -2055,15 +2101,38 @@ static void init_freq_invariance(bool secondary)
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
ret = intel_set_max_freq_ratio();
else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
if (!cppc_ready) {
return;
}
ret = amd_set_max_freq_ratio();
}
if (ret) {
init_counter_refs();
static_branch_enable(&arch_scale_freq_key);
pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
} else {
pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
}
}
#ifdef CONFIG_ACPI_CPPC_LIB
static DEFINE_MUTEX(freq_invariance_lock);
/*
 * Run init_freq_invariance() with cppc_ready set. Calls are serialized by
 * freq_invariance_lock; the very first call passes secondary == false and
 * every subsequent call passes secondary == true.
 */
void init_freq_invariance_cppc(void)
{
	static bool first_call_done;
	bool is_secondary;

	mutex_lock(&freq_invariance_lock);

	/* Snapshot and update the one-shot flag while holding the lock. */
	is_secondary = first_call_done;
	first_call_done = true;

	init_freq_invariance(is_secondary, true);

	mutex_unlock(&freq_invariance_lock);
}
#endif
static void disable_freq_invariance_workfn(struct work_struct *work)
{
static_branch_disable(&arch_scale_freq_key);
@ -2113,7 +2182,7 @@ error:
schedule_work(&disable_freq_invariance_work);
}
#else
static inline void init_freq_invariance(bool secondary)
static inline void init_freq_invariance(bool secondary, bool cppc_ready)
{
}
#endif /* CONFIG_X86_64 */

View File

@ -169,12 +169,11 @@ static void __wrmsr_safe_on_cpu(void *info)
int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
struct msr_info_completion rv;
call_single_data_t csd = {
.func = __rdmsr_safe_on_cpu,
.info = &rv,
};
call_single_data_t csd;
int err;
INIT_CSD(&csd, __rdmsr_safe_on_cpu, &rv);
memset(&rv, 0, sizeof(rv));
init_completion(&rv.done);
rv.msr.msr_no = msr_no;

View File

@ -671,9 +671,7 @@ bool blk_mq_complete_request_remote(struct request *rq)
return false;
if (blk_mq_complete_need_ipi(rq)) {
rq->csd.func = __blk_mq_complete_request_remote;
rq->csd.info = rq;
rq->csd.flags = 0;
INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
} else {
if (rq->q->nr_hw_queues > 1)

View File

@ -39,6 +39,7 @@
#include <linux/ktime.h>
#include <linux/rwsem.h>
#include <linux/wait.h>
#include <linux/topology.h>
#include <acpi/cppc_acpi.h>
@ -688,6 +689,10 @@ static bool is_cppc_supported(int revision, int num_ent)
* }
*/
#ifndef init_freq_invariance_cppc
static inline void init_freq_invariance_cppc(void) { }
#endif
/**
* acpi_cppc_processor_probe - Search for per CPU _CPC objects.
* @pr: Ptr to acpi_processor containing this CPU's logical ID.
@ -850,6 +855,8 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr)
goto out_free;
}
init_freq_invariance_cppc();
kfree(output.pointer);
return 0;

View File

@ -674,8 +674,7 @@ have_coupled:
coupled->refcnt++;
csd = &per_cpu(cpuidle_coupled_poke_cb, dev->cpu);
csd->func = cpuidle_coupled_handle_poke;
csd->info = (void *)(unsigned long)dev->cpu;
INIT_CSD(csd, cpuidle_coupled_handle_poke, (void *)(unsigned long)dev->cpu);
return 0;
}

View File

@ -197,7 +197,7 @@ __notify_execute_cb(struct i915_request *rq, bool (*fn)(struct irq_work *wrk))
llist_for_each_entry_safe(cb, cn,
llist_del_all(&rq->execute_cb),
work.llnode)
work.node.llist)
fn(&cb->work);
}
@ -460,7 +460,7 @@ __await_execution(struct i915_request *rq,
* callback first, then checking the ACTIVE bit, we serialise with
* the completed/retired request.
*/
if (llist_add(&cb->work.llnode, &signal->execute_cb)) {
if (llist_add(&cb->work.node.llist, &signal->execute_cb)) {
if (i915_request_is_active(signal) ||
__request_in_flight(signal))
__notify_execute_cb_imm(signal);

View File

@ -729,13 +729,8 @@ static void liquidio_napi_drv_callback(void *arg)
droq->cpu_id == this_cpu) {
napi_schedule_irqoff(&droq->napi);
} else {
call_single_data_t *csd = &droq->csd;
csd->func = napi_schedule_wrapper;
csd->info = &droq->napi;
csd->flags = 0;
smp_call_function_single_async(droq->cpu_id, csd);
INIT_CSD(&droq->csd, napi_schedule_wrapper, &droq->napi);
smp_call_function_single_async(droq->cpu_id, &droq->csd);
}
}

View File

@ -383,9 +383,9 @@ static inline void task_context_switch_counts(struct seq_file *m,
static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
{
seq_printf(m, "Cpus_allowed:\t%*pb\n",
cpumask_pr_args(task->cpus_ptr));
cpumask_pr_args(&task->cpus_mask));
seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
cpumask_pr_args(task->cpus_ptr));
cpumask_pr_args(&task->cpus_mask));
}
static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)

View File

@ -152,6 +152,7 @@ enum cpuhp_state {
CPUHP_AP_ONLINE,
CPUHP_TEARDOWN_CPU,
CPUHP_AP_ONLINE_IDLE,
CPUHP_AP_SCHED_WAIT_EMPTY,
CPUHP_AP_SMPBOOT_THREADS,
CPUHP_AP_X86_VDSO_VMA_ONLINE,
CPUHP_AP_IRQ_AFFINITY_ONLINE,

View File

@ -199,6 +199,11 @@ static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
return cpumask_next_and(-1, src1p, src2p);
}
static inline int cpumask_any_distribute(const struct cpumask *srcp)
{
return cpumask_first(srcp);
}
#define for_each_cpu(cpu, mask) \
for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
#define for_each_cpu_not(cpu, mask) \
@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
unsigned int cpumask_local_spread(unsigned int i, int node);
int cpumask_any_and_distribute(const struct cpumask *src1p,
const struct cpumask *src2p);
int cpumask_any_distribute(const struct cpumask *srcp);
/**
* for_each_cpu - iterate over every cpu in a mask

View File

@ -14,28 +14,37 @@
*/
struct irq_work {
union {
struct __call_single_node node;
struct {
struct llist_node llnode;
atomic_t flags;
};
};
void (*func)(struct irq_work *);
};
#define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \
.node = { .u_flags = (_flags), }, \
.func = (_func), \
}
#define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0)
#define IRQ_WORK_INIT_LAZY(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_LAZY)
#define IRQ_WORK_INIT_HARD(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_HARD_IRQ)
#define DEFINE_IRQ_WORK(name, _f) \
struct irq_work name = IRQ_WORK_INIT(_f)
static inline
void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
{
atomic_set(&work->flags, 0);
work->func = func;
*work = IRQ_WORK_INIT(func);
}
#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { \
.flags = ATOMIC_INIT(0), \
.func = (_f) \
/* Return true if IRQ_WORK_PENDING is set in the flags word of @work. */
static inline bool irq_work_is_pending(struct irq_work *work)
{
	int cur_flags = atomic_read(&work->node.a_flags);

	return cur_flags & IRQ_WORK_PENDING;
}
/* Return true if IRQ_WORK_BUSY is set in the flags word of @work. */
static inline bool irq_work_is_busy(struct irq_work *work)
{
	int cur_flags = atomic_read(&work->node.a_flags);

	return cur_flags & IRQ_WORK_BUSY;
}
bool irq_work_queue(struct irq_work *work);
bool irq_work_queue_on(struct irq_work *work, int cpu);

View File

@ -107,14 +107,14 @@ do { \
current->irq_config = 0; \
} while (0)
# define lockdep_irq_work_enter(__work) \
# define lockdep_irq_work_enter(_flags) \
do { \
if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\
if (!((_flags) & IRQ_WORK_HARD_IRQ)) \
current->irq_config = 1; \
} while (0)
# define lockdep_irq_work_exit(__work) \
# define lockdep_irq_work_exit(_flags) \
do { \
if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\
if (!((_flags) & IRQ_WORK_HARD_IRQ)) \
current->irq_config = 0; \
} while (0)

View File

@ -204,6 +204,7 @@ extern int _cond_resched(void);
extern void ___might_sleep(const char *file, int line, int preempt_offset);
extern void __might_sleep(const char *file, int line, int preempt_offset);
extern void __cant_sleep(const char *file, int line, int preempt_offset);
extern void __cant_migrate(const char *file, int line);
/**
* might_sleep - annotation for functions that can sleep
@ -227,6 +228,18 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
# define cant_sleep() \
do { __cant_sleep(__FILE__, __LINE__, 0); } while (0)
# define sched_annotate_sleep() (current->task_state_change = 0)
/**
* cant_migrate - annotation for functions that cannot migrate
*
* Will print a stack trace if executed in code which is migratable
*/
# define cant_migrate() \
do { \
if (IS_ENABLED(CONFIG_SMP)) \
__cant_migrate(__FILE__, __LINE__); \
} while (0)
/**
* non_block_start - annotate the start of section where sleeping is prohibited
*
@ -251,6 +264,7 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
int preempt_offset) { }
# define might_sleep() do { might_resched(); } while (0)
# define cant_sleep() do { } while (0)
# define cant_migrate() do { } while (0)
# define sched_annotate_sleep() do { } while (0)
# define non_block_start() do { } while (0)
# define non_block_end() do { } while (0)
@ -258,13 +272,6 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
#define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
#ifndef CONFIG_PREEMPT_RT
# define cant_migrate() cant_sleep()
#else
/* Placeholder for now */
# define cant_migrate() do { } while (0)
#endif
/**
* abs - return absolute value of an argument
* @x: the value. If it is unsigned type, it is converted to signed type first.

View File

@ -322,34 +322,71 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
#endif
/**
* migrate_disable - Prevent migration of the current task
*
* Maps to preempt_disable() which also disables preemption. Use
* migrate_disable() to annotate that the intent is to prevent migration,
* but not necessarily preemption.
*
* Can be invoked nested like preempt_disable() and needs the corresponding
* number of migrate_enable() invocations.
*/
static __always_inline void migrate_disable(void)
{
preempt_disable();
}
#ifdef CONFIG_SMP
/**
* migrate_enable - Allow migration of the current task
/*
* Migrate-Disable and why it is undesired.
*
* Counterpart to migrate_disable().
* When a preempted task becomes eligible to run under the ideal model (IOW it
* becomes one of the M highest priority tasks), it might still have to wait
* for the preemptee's migrate_disable() section to complete. Thereby suffering
* a reduction in bandwidth in the exact duration of the migrate_disable()
* section.
*
* As migrate_disable() can be invoked nested, only the outermost invocation
* reenables migration.
* Per this argument, the change from preempt_disable() to migrate_disable()
* gets us:
*
* - a higher priority task gains reduced wake-up latency; with preempt_disable()
* it would have had to wait for the lower priority task.
*
* - a lower priority task, which under preempt_disable() could've instantly
* migrated away when another CPU becomes available, is now constrained
* by the ability to push the higher priority task away, which might itself be
* in a migrate_disable() section, reducing its available bandwidth.
*
* IOW it trades latency / moves the interference term, but it stays in the
* system, and as long as it remains unbounded, the system is not fully
* deterministic.
*
*
* The reason we have it anyway.
*
* PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
* number of primitives into becoming preemptible, they would also allow
* migration. This turns out to break a bunch of per-cpu usage. To this end,
* all these primitives employ migrate_disable() to restore this implicit
* assumption.
*
* This is a 'temporary' work-around at best. The correct solution is getting
* rid of the above assumptions and reworking the code to employ explicit
* per-cpu locking or short preempt-disable regions.
*
* The end goal must be to get rid of migrate_disable(), alternatively we need
* a schedulability theory that does not depend on arbitrary migration.
*
*
* Notes on the implementation.
*
* The implementation is particularly tricky since existing code patterns
* dictate neither migrate_disable() nor migrate_enable() is allowed to block.
* This means that it cannot use cpus_read_lock() to serialize against hotplug,
* nor can it easily migrate itself into a pending affinity mask change on
* migrate_enable().
*
*
* Note: even non-work-conserving schedulers like semi-partitioned depend on
* migration, so migrate_disable() is not only a problem for
* work-conserving schedulers.
*
* Currently mapped to preempt_enable().
*/
static __always_inline void migrate_enable(void)
{
preempt_enable();
}
extern void migrate_disable(void);
extern void migrate_enable(void);
#else
static inline void migrate_disable(void) { }
static inline void migrate_enable(void) { }
#endif /* CONFIG_SMP */
#endif /* __LINUX_PREEMPT_H */

View File

@ -723,6 +723,11 @@ struct task_struct {
int nr_cpus_allowed;
const cpumask_t *cpus_ptr;
cpumask_t cpus_mask;
void *migration_pending;
#ifdef CONFIG_SMP
unsigned short migration_disabled;
#endif
unsigned short migration_flags;
#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;

View File

@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned int cpu);
extern int sched_cpu_deactivate(unsigned int cpu);
#ifdef CONFIG_HOTPLUG_CPU
extern int sched_cpu_wait_empty(unsigned int cpu);
extern int sched_cpu_dying(unsigned int cpu);
#else
# define sched_cpu_wait_empty NULL
# define sched_cpu_dying NULL
#endif

View File

@ -347,6 +347,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
extern void membarrier_exec_mmap(struct mm_struct *mm);
extern void membarrier_update_current_mm(struct mm_struct *next_mm);
#else
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
@ -361,6 +363,9 @@ static inline void membarrier_exec_mmap(struct mm_struct *mm)
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
}
static inline void membarrier_update_current_mm(struct mm_struct *next_mm)
{
}
#endif
#endif /* _LINUX_SCHED_MM_H */

View File

@ -225,6 +225,14 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
#endif /* !CONFIG_SMP */
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
extern void rebuild_sched_domains_energy(void);
#else
static inline void rebuild_sched_domains_energy(void)
{
}
#endif
#ifndef arch_scale_cpu_capacity
/**
* arch_scale_cpu_capacity - get the capacity scale factor of a given CPU.

View File

@ -21,24 +21,23 @@ typedef bool (*smp_cond_func_t)(int cpu, void *info);
* structure shares (partial) layout with struct irq_work
*/
struct __call_single_data {
union {
struct __call_single_node node;
struct {
struct llist_node llist;
unsigned int flags;
#ifdef CONFIG_64BIT
u16 src, dst;
#endif
};
};
smp_call_func_t func;
void *info;
};
#define CSD_INIT(_func, _info) \
(struct __call_single_data){ .func = (_func), .info = (_info), }
/* Use __aligned() to avoid using 2 cache lines for 1 csd */
typedef struct __call_single_data call_single_data_t
__aligned(sizeof(struct __call_single_data));
#define INIT_CSD(_csd, _func, _info) \
do { \
*(_csd) = CSD_INIT((_func), (_info)); \
} while (0)
/*
* Enqueue a llist_node on the call_single_queue; be very careful, read
* flush_smp_call_function_queue() in detail.

View File

@ -24,6 +24,7 @@ typedef int (*cpu_stop_fn_t)(void *arg);
struct cpu_stop_work {
struct list_head list; /* cpu_stopper->works */
cpu_stop_fn_t fn;
unsigned long caller;
void *arg;
struct cpu_stop_done *done;
};
@ -36,6 +37,8 @@ void stop_machine_park(int cpu);
void stop_machine_unpark(int cpu);
void stop_machine_yield(const struct cpumask *cpumask);
extern void print_stop_info(const char *log_lvl, struct task_struct *task);
#else /* CONFIG_SMP */
#include <linux/workqueue.h>
@ -80,6 +83,8 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu,
return false;
}
static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { }
#endif /* CONFIG_SMP */
/*

View File

@ -96,6 +96,8 @@ struct sched_param {
* on a CPU with a capacity big enough to fit the specified value.
* A task with a max utilization value smaller than 1024 is more likely
* scheduled on a CPU with no more capacity than the specified value.
*
* A task utilization boundary can be reset by setting the attribute to -1.
*/
struct sched_attr {
__u32 size;

View File

@ -298,7 +298,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
if (irqs_disabled()) {
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
work = this_cpu_ptr(&up_read_work);
if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) {
if (irq_work_is_busy(&work->irq_work)) {
/* cannot queue more up_read, fallback */
irq_work_busy = true;
}

View File

@ -983,25 +983,48 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
*/
static void rebuild_sched_domains_locked(void)
{
struct cgroup_subsys_state *pos_css;
struct sched_domain_attr *attr;
cpumask_var_t *doms;
struct cpuset *cs;
int ndoms;
lockdep_assert_cpus_held();
percpu_rwsem_assert_held(&cpuset_rwsem);
/*
* We have raced with CPU hotplug. Don't do anything to avoid
* If we have raced with CPU hotplug, return early to avoid
* passing doms with offlined cpu to partition_sched_domains().
* Anyways, hotplug work item will rebuild sched domains.
* Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
*
* With no CPUs in any subpartitions, top_cpuset's effective CPUs
* should be the same as the active CPUs, so checking only top_cpuset
* is enough to detect racing CPU offlines.
*/
if (!top_cpuset.nr_subparts_cpus &&
!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
return;
if (top_cpuset.nr_subparts_cpus &&
!cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
/*
* With subpartition CPUs, however, the effective CPUs of a partition
* root should be only a subset of the active CPUs. Since a CPU in any
* partition root could be offlined, all must be checked.
*/
if (top_cpuset.nr_subparts_cpus) {
rcu_read_lock();
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
if (!is_partition_root(cs)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
if (!cpumask_subset(cs->effective_cpus,
cpu_active_mask)) {
rcu_read_unlock();
return;
}
}
rcu_read_unlock();
}
/* Generate domain masks and attrs */
ndoms = generate_sched_domains(&doms, &attr);

View File

@ -1606,7 +1606,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.name = "ap:online",
},
/*
* Handled on controll processor until the plugged processor manages
* Handled on control processor until the plugged processor manages
* this itself.
*/
[CPUHP_TEARDOWN_CPU] = {
@ -1615,6 +1615,13 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.teardown.single = takedown_cpu,
.cant_stop = true,
},
[CPUHP_AP_SCHED_WAIT_EMPTY] = {
.name = "sched:waitempty",
.startup.single = NULL,
.teardown.single = sched_cpu_wait_empty,
},
/* Handle smpboot threads park/unpark */
[CPUHP_AP_SMPBOOT_THREADS] = {
.name = "smpboot/threads:online",

View File

@ -225,8 +225,6 @@ NOKPROBE_SYMBOL(kgdb_skipexception);
* Default (weak) implementation for kgdb_roundup_cpus
*/
static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd);
void __weak kgdb_call_nmi_hook(void *ignored)
{
/*
@ -241,6 +239,9 @@ void __weak kgdb_call_nmi_hook(void *ignored)
}
NOKPROBE_SYMBOL(kgdb_call_nmi_hook);
static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) =
CSD_INIT(kgdb_call_nmi_hook, NULL);
void __weak kgdb_roundup_cpus(void)
{
call_single_data_t *csd;
@ -267,7 +268,6 @@ void __weak kgdb_roundup_cpus(void)
continue;
kgdb_info[cpu].rounding_up = true;
csd->func = kgdb_call_nmi_hook;
ret = smp_call_function_single_async(cpu, csd);
if (ret)
kgdb_info[cpu].rounding_up = false;

View File

@ -478,10 +478,24 @@ static void exit_mm(void)
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
/*
* When a thread stops operating on an address space, the loop
* in membarrier_private_expedited() may not observe that
* tsk->mm, and the loop in membarrier_global_expedited() may
* not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
* rq->membarrier_state, so those would not issue an IPI.
* Membarrier requires a memory barrier after accessing
* user-space memory, before clearing tsk->mm or the
* rq->membarrier_state.
*/
smp_mb__after_spinlock();
local_irq_disable();
current->mm = NULL;
mmap_read_unlock(mm);
membarrier_update_current_mm(NULL);
enter_lazy_tlb(mm, current);
local_irq_enable();
task_unlock(current);
mmap_read_unlock(mm);
mm_update_next_owner(mm);
mmput(mm);
if (test_thread_flag(TIF_MEMDIE))

View File

@ -31,10 +31,10 @@ static bool irq_work_claim(struct irq_work *work)
{
int oflags;
oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags);
oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags);
/*
* If the work is already pending, no need to raise the IPI.
* The pairing atomic_fetch_andnot() in irq_work_run() makes sure
* The pairing smp_mb() in irq_work_single() makes sure
* everything we did before is visible.
*/
if (oflags & IRQ_WORK_PENDING)
@ -53,12 +53,12 @@ void __weak arch_irq_work_raise(void)
static void __irq_work_queue_local(struct irq_work *work)
{
/* If the work is "lazy", handle it from next tick if any */
if (atomic_read(&work->flags) & IRQ_WORK_LAZY) {
if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) {
if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) &&
tick_nohz_tick_stopped())
arch_irq_work_raise();
} else {
if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list)))
arch_irq_work_raise();
}
}
@ -102,7 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (cpu != smp_processor_id()) {
/* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi());
__smp_call_single_queue(cpu, &work->llnode);
__smp_call_single_queue(cpu, &work->node.llist);
} else {
__irq_work_queue_local(work);
}
@ -136,23 +136,28 @@ void irq_work_single(void *arg)
int flags;
/*
* Clear the PENDING bit, after this point the @work
* can be re-used.
* Make it immediately visible so that other CPUs trying
* to claim that work don't rely on us to handle their data
* while we are in the middle of the func.
*/
flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
lockdep_irq_work_enter(work);
work->func(work);
lockdep_irq_work_exit(work);
/*
* Clear the BUSY bit and return to the free state if
* no-one else claimed it meanwhile.
* Clear the PENDING bit, after this point the @work can be re-used.
* The PENDING bit acts as a lock, and we own it, so we can clear it
* without atomic ops.
*/
flags = atomic_read(&work->node.a_flags);
flags &= ~IRQ_WORK_PENDING;
(void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
atomic_set(&work->node.a_flags, flags);
/*
* See irq_work_claim().
*/
smp_mb();
lockdep_irq_work_enter(flags);
work->func(work);
lockdep_irq_work_exit(flags);
/*
* Clear the BUSY bit, if set, and return to the free state if no-one
* else claimed it meanwhile.
*/
(void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY);
}
static void irq_work_run_list(struct llist_head *list)
@ -166,7 +171,7 @@ static void irq_work_run_list(struct llist_head *list)
return;
llnode = llist_del_all(list);
llist_for_each_entry_safe(work, tmp, llnode, llnode)
llist_for_each_entry_safe(work, tmp, llnode, node.llist)
irq_work_single(work);
}
@ -198,7 +203,7 @@ void irq_work_sync(struct irq_work *work)
{
lockdep_assert_irqs_enabled();
while (atomic_read(&work->flags) & IRQ_WORK_BUSY)
while (irq_work_is_busy(work))
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);

View File

@ -1249,6 +1249,7 @@ void kthread_use_mm(struct mm_struct *mm)
tsk->active_mm = mm;
}
tsk->mm = mm;
membarrier_update_current_mm(mm);
switch_mm_irqs_off(active_mm, mm, tsk);
local_irq_enable();
task_unlock(tsk);
@ -1256,8 +1257,19 @@ void kthread_use_mm(struct mm_struct *mm)
finish_arch_post_lock_switch();
#endif
/*
* When a kthread starts operating on an address space, the loop
* in membarrier_{private,global}_expedited() may not observe
* that tsk->mm, and not issue an IPI. Membarrier requires a
* memory barrier after storing to tsk->mm, before accessing
* user-space memory. A full memory barrier for membarrier
* {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
* mmdrop(), or explicitly with smp_mb().
*/
if (active_mm != mm)
mmdrop(active_mm);
else
smp_mb();
to_kthread(tsk)->oldfs = force_uaccess_begin();
}
@ -1277,9 +1289,18 @@ void kthread_unuse_mm(struct mm_struct *mm)
force_uaccess_end(to_kthread(tsk)->oldfs);
task_lock(tsk);
/*
* When a kthread stops operating on an address space, the loop
* in membarrier_{private,global}_expedited() may not observe
* that tsk->mm, and not issue an IPI. Membarrier requires a
* memory barrier after accessing user-space memory, before
* clearing tsk->mm.
*/
smp_mb__after_spinlock();
sync_mm_rss(mm);
local_irq_disable();
tsk->mm = NULL;
membarrier_update_current_mm(NULL);
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
local_irq_enable();

View File

@ -3025,10 +3025,8 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
wake_up_interruptible(&log_wait);
}
static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
.func = wake_up_klogd_work_func,
.flags = ATOMIC_INIT(IRQ_WORK_LAZY),
};
static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func);
void wake_up_klogd(void)
{

View File

@ -1322,8 +1322,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (IS_ENABLED(CONFIG_IRQ_WORK) &&
!rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
(rnp->ffmask & rdp->grpmask)) {
init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ);
rdp->rcu_iw_pending = true;
rdp->rcu_iw_gp_seq = rnp->gp_seq;
irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
@ -4023,6 +4021,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->cpu_no_qs.b.norm = true;
rdp->core_needs_qs = false;
rdp->rcu_iw_pending = false;
rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler);
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);

File diff suppressed because it is too large Load Diff

View File

@ -120,7 +120,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) {
unsigned long cap, max_cap = 0;
int cpu, max_cpu = -1;
@ -151,7 +151,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
if (cpumask_test_cpu(best_cpu, &p->cpus_mask) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
if (later_mask)
cpumask_set_cpu(best_cpu, later_mask);

View File

@ -899,16 +899,9 @@ struct cpufreq_governor *cpufreq_default_governor(void)
cpufreq_governor_init(schedutil_gov);
#ifdef CONFIG_ENERGY_MODEL
extern bool sched_energy_update;
extern struct mutex sched_energy_mutex;
static void rebuild_sd_workfn(struct work_struct *work)
{
mutex_lock(&sched_energy_mutex);
sched_energy_update = true;
rebuild_sched_domains();
sched_energy_update = false;
mutex_unlock(&sched_energy_mutex);
rebuild_sched_domains_energy();
}
static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);

View File

@ -11,7 +11,7 @@
* This code tracks the priority of each CPU so that global migration
* decisions are easy to calculate. Each CPU can be in a state as follows:
*
* (INVALID), IDLE, NORMAL, RT1, ... RT99
* (INVALID), NORMAL, RT1, ... RT99, HIGHER
*
* going from the lowest priority to the highest. CPUs in the INVALID state
* are not eligible for routing. The system maintains this state with
@ -19,24 +19,48 @@
* in that class). Therefore a typical application without affinity
* restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
* searches). For tasks with affinity restrictions, the algorithm has a
* worst case complexity of O(min(102, nr_domcpus)), though the scenario that
* worst case complexity of O(min(101, nr_domcpus)), though the scenario that
* yields the worst case search is fairly contrived.
*/
#include "sched.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */
/*
* p->rt_priority p->prio newpri cpupri
*
* -1 -1 (CPUPRI_INVALID)
*
* 99 0 (CPUPRI_NORMAL)
*
* 1 98 98 1
* ...
* 49 50 50 49
* 50 49 49 50
* ...
* 99 0 0 99
*
* 100 100 (CPUPRI_HIGHER)
*/
static int convert_prio(int prio)
{
int cpupri;
if (prio == CPUPRI_INVALID)
cpupri = CPUPRI_INVALID;
else if (prio == MAX_PRIO)
cpupri = CPUPRI_IDLE;
else if (prio >= MAX_RT_PRIO)
cpupri = CPUPRI_NORMAL;
else
cpupri = MAX_RT_PRIO - prio + 1;
switch (prio) {
case CPUPRI_INVALID:
cpupri = CPUPRI_INVALID; /* -1 */
break;
case 0 ... 98:
cpupri = MAX_RT_PRIO-1 - prio; /* 1 ... 99 */
break;
case MAX_RT_PRIO-1:
cpupri = CPUPRI_NORMAL; /* 0 */
break;
case MAX_RT_PRIO:
cpupri = CPUPRI_HIGHER; /* 100 */
break;
}
return cpupri;
}
@ -73,11 +97,11 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
if (skip)
return 0;
if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
return 0;
if (lowest_mask) {
cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
/*
* We have to ensure that we have at least one bit
@ -177,7 +201,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
* cpupri_set - update the CPU priority setting
* @cp: The cpupri context
* @cpu: The target CPU
* @newpri: The priority (INVALID-RT99) to assign to this CPU
* @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU
*
* Note: Assumes cpu_rq(cpu)->lock is locked
*

View File

@ -1,11 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */
#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1)
#define CPUPRI_INVALID -1
#define CPUPRI_IDLE 0
#define CPUPRI_NORMAL 1
/* values 2-101 are RT priorities 0-99 */
#define CPUPRI_NORMAL 0
/* values 1-99 are for RT1-RT99 priorities */
#define CPUPRI_HIGHER 100
struct cpupri_vec {
atomic_t count;

View File

@ -119,6 +119,17 @@ static inline unsigned long dl_bw_capacity(int i)
return __dl_bw_capacity(i);
}
}
/*
 * dl_bw_visited - check-and-mark helper for walking root domains by CPU.
 * @cpu: CPU whose runqueue's root domain is examined
 * @gen: generation number identifying the current walk
 *
 * Returns true if the root domain reached through @cpu already carries
 * @gen in ->visit_gen. Otherwise stamps the root domain with @gen and
 * returns false.
 */
static inline bool dl_bw_visited(int cpu, u64 gen)
{
	struct root_domain *rd = cpu_rq(cpu)->rd;
	bool seen = rd->visit_gen == gen;

	/* First time this generation reaches the root domain: mark it. */
	if (!seen)
		rd->visit_gen = gen;

	return seen;
}
#else
static inline struct dl_bw *dl_bw_of(int i)
{
@ -134,6 +145,11 @@ static inline unsigned long dl_bw_capacity(int i)
{
return SCHED_CAPACITY_SCALE;
}
/*
 * Variant on the #else side of the preprocessor conditional: no visit
 * tracking is done here, so every CPU is reported as not yet visited
 * and @cpu / @gen are ignored.
 */
static inline bool dl_bw_visited(int cpu, u64 gen)
{
return false;
}
#endif
static inline
@ -565,7 +581,7 @@ static int push_dl_task(struct rq *rq);
static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
{
return dl_task(prev);
return rq->online && dl_task(prev);
}
static DEFINE_PER_CPU(struct callback_head, dl_push_head);
@ -1397,6 +1413,8 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
if (dl_rq->earliest_dl.curr == 0 ||
dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
if (dl_rq->earliest_dl.curr == 0)
cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER);
dl_rq->earliest_dl.curr = deadline;
cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
}
@ -1414,6 +1432,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
dl_rq->earliest_dl.curr = 0;
dl_rq->earliest_dl.next = 0;
cpudl_clear(&rq->rd->cpudl, rq->cpu);
cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
} else {
struct rb_node *leftmost = dl_rq->root.rb_leftmost;
struct sched_dl_entity *entry;
@ -1670,13 +1689,13 @@ static void yield_task_dl(struct rq *rq)
static int find_later_rq(struct task_struct *task);
static int
select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
select_task_rq_dl(struct task_struct *p, int cpu, int flags)
{
struct task_struct *curr;
bool select_rq;
struct rq *rq;
if (sd_flag != SD_BALANCE_WAKE)
if (!(flags & WF_TTWU))
goto out;
rq = cpu_rq(cpu);
@ -1918,7 +1937,7 @@ static void task_fork_dl(struct task_struct *p)
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
cpumask_test_cpu(cpu, p->cpus_ptr))
cpumask_test_cpu(cpu, &p->cpus_mask))
return 1;
return 0;
}
@ -2008,7 +2027,7 @@ static int find_later_rq(struct task_struct *task)
return this_cpu;
}
best_cpu = cpumask_first_and(later_mask,
best_cpu = cpumask_any_and_distribute(later_mask,
sched_domain_span(sd));
/*
* Last chance: if a CPU being in both later_mask
@ -2031,7 +2050,7 @@ static int find_later_rq(struct task_struct *task)
if (this_cpu != -1)
return this_cpu;
cpu = cpumask_any(later_mask);
cpu = cpumask_any_distribute(later_mask);
if (cpu < nr_cpu_ids)
return cpu;
@ -2068,7 +2087,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
!cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
task_running(rq, task) ||
!dl_task(task) ||
!task_on_rq_queued(task))) {
@ -2135,6 +2154,9 @@ static int push_dl_task(struct rq *rq)
return 0;
retry:
if (is_migration_disabled(next_task))
return 0;
if (WARN_ON(next_task == rq->curr))
return 0;
@ -2212,7 +2234,7 @@ static void push_dl_tasks(struct rq *rq)
static void pull_dl_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, cpu;
struct task_struct *p;
struct task_struct *p, *push_task;
bool resched = false;
struct rq *src_rq;
u64 dmin = LONG_MAX;
@ -2242,6 +2264,7 @@ static void pull_dl_task(struct rq *this_rq)
continue;
/* Might drop this_rq->lock */
push_task = NULL;
double_lock_balance(this_rq, src_rq);
/*
@ -2273,17 +2296,27 @@ static void pull_dl_task(struct rq *this_rq)
src_rq->curr->dl.deadline))
goto skip;
resched = true;
if (is_migration_disabled(p)) {
push_task = get_push_task(src_rq);
} else {
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
resched = true;
}
/* Is there any other task even earlier? */
}
skip:
double_unlock_balance(this_rq, src_rq);
if (push_task) {
raw_spin_unlock(&this_rq->lock);
stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
push_task, &src_rq->push_work);
raw_spin_lock(&this_rq->lock);
}
}
if (resched)
@ -2307,7 +2340,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
}
static void set_cpus_allowed_dl(struct task_struct *p,
const struct cpumask *new_mask)
const struct cpumask *new_mask,
u32 flags)
{
struct root_domain *src_rd;
struct rq *rq;
@ -2336,7 +2370,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
raw_spin_unlock(&src_dl_b->lock);
}
set_cpus_allowed_common(p, new_mask);
set_cpus_allowed_common(p, new_mask, flags);
}
/* Assumes rq->lock is held */
@ -2509,8 +2543,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
}
}
const struct sched_class dl_sched_class
__section("__dl_sched_class") = {
DEFINE_SCHED_CLASS(dl) = {
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
@ -2529,6 +2563,7 @@ const struct sched_class dl_sched_class
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
.task_woken = task_woken_dl,
.find_lock_rq = find_lock_later_rq,
#endif
.task_tick = task_tick_dl,
@ -2541,33 +2576,39 @@ const struct sched_class dl_sched_class
.update_curr = update_curr_dl,
};
/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */
static u64 dl_generation;
int sched_dl_global_validate(void)
{
u64 runtime = global_rt_runtime();
u64 period = global_rt_period();
u64 new_bw = to_ratio(period, runtime);
u64 gen = ++dl_generation;
struct dl_bw *dl_b;
int cpu, ret = 0;
int cpu, cpus, ret = 0;
unsigned long flags;
/*
* Here we want to check the bandwidth not being set to some
* value smaller than the currently allocated bandwidth in
* any of the root_domains.
*
* FIXME: Cycling on all the CPUs is overdoing, but simpler than
* cycling on root_domains... Discussion on different/better
* solutions is welcome!
*/
for_each_possible_cpu(cpu) {
rcu_read_lock_sched();
if (dl_bw_visited(cpu, gen))
goto next;
dl_b = dl_bw_of(cpu);
cpus = dl_bw_cpus(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
if (new_bw < dl_b->total_bw)
if (new_bw * cpus < dl_b->total_bw)
ret = -EBUSY;
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
next:
rcu_read_unlock_sched();
if (ret)
@ -2593,6 +2634,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
void sched_dl_do_global(void)
{
u64 new_bw = -1;
u64 gen = ++dl_generation;
struct dl_bw *dl_b;
int cpu;
unsigned long flags;
@ -2603,11 +2645,14 @@ void sched_dl_do_global(void)
if (global_rt_runtime() != RUNTIME_INF)
new_bw = to_ratio(global_rt_period(), global_rt_runtime());
/*
* FIXME: As above...
*/
for_each_possible_cpu(cpu) {
rcu_read_lock_sched();
if (dl_bw_visited(cpu, gen)) {
rcu_read_unlock_sched();
continue;
}
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);

View File

@ -906,6 +906,15 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (!schedstat_enabled())
return;
/*
* When sched_schedstat changes from 0 to 1, some sched entities
* may already be on the runqueue with se->statistics.wait_start
* still 0, which would make the delta wrong. We need to avoid this
* scenario.
*/
if (unlikely(!schedstat_val(se->statistics.wait_start)))
return;
delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
if (entity_is_task(se)) {
@ -1550,7 +1559,8 @@ struct task_numa_env {
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
static unsigned long cpu_util(int cpu);
static inline long adjust_numa_imbalance(int imbalance, int nr_running);
static inline long adjust_numa_imbalance(int imbalance,
int dst_running, int dst_weight);
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
@ -1930,7 +1940,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
src_running = env->src_stats.nr_running - 1;
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
imbalance = adjust_numa_imbalance(imbalance, dst_running);
imbalance = adjust_numa_imbalance(imbalance, dst_running,
env->dst_stats.weight);
/* Use idle CPU if there is no imbalance */
if (!imbalance) {
@ -4779,25 +4790,37 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
break;
goto done;
if (dequeue) {
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
} else {
update_load_avg(qcfs_rq, se, 0);
se_update_runnable(se);
}
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
if (qcfs_rq->load.weight)
dequeue = 0;
if (qcfs_rq->load.weight) {
/* Avoid re-evaluating load for this entity: */
se = parent_entity(se);
break;
}
}
if (!se)
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
goto done;
update_load_avg(qcfs_rq, se, 0);
se_update_runnable(se);
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
}
/* At this point se is NULL and we are at root level */
sub_nr_running(rq, task_delta);
done:
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
@ -5105,9 +5128,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
return;
distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
/*
@ -5805,6 +5825,9 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
if (sync && cpu_rq(this_cpu)->nr_running == 1)
return this_cpu;
if (available_idle_cpu(prev_cpu))
return prev_cpu;
return nr_cpumask_bits;
}
@ -6063,10 +6086,11 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
break;
}
}
cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
if (idle)
return core;
cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
}
/*
@ -6307,7 +6331,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
}
/**
* Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
* cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks.
* @cpu: the CPU to get the utilization of
*
* The unit of the return value must be the one of capacity so we can compare
@ -6683,7 +6707,7 @@ fail:
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
* that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
*
* Balances load by selecting the idlest CPU in the idlest group, or under
@ -6694,15 +6718,17 @@ fail:
* preempt must be disabled.
*/
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
{
int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
struct sched_domain *tmp, *sd = NULL;
int cpu = smp_processor_id();
int new_cpu = prev_cpu;
int want_affine = 0;
int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
/* SD_flags and WF_flags share the first nibble */
int sd_flag = wake_flags & 0xF;
if (sd_flag & SD_BALANCE_WAKE) {
if (wake_flags & WF_TTWU) {
record_wakee(p);
if (sched_energy_enabled()) {
@ -6739,9 +6765,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (unlikely(sd)) {
/* Slow path */
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
} else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
} else if (wake_flags & WF_TTWU) { /* XXX always ? */
/* Fast path */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
if (want_affine)
@ -8757,6 +8782,16 @@ static bool update_pick_idlest(struct sched_group *idlest,
return true;
}
/*
 * Tolerate a NUMA imbalance while the number of running tasks stays
 * below 25% of the domain weight. The task count is only an
 * approximation of the number of busy CPUs, because sched_setaffinity
 * can decouple the two.
 */
static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
{
	/* A quarter of the domain's weight, rounded down by the shift. */
	int quarter_weight = dst_weight >> 2;

	return dst_running < quarter_weight;
}
/*
* find_idlest_group() finds and returns the least busy CPU group within the
* domain.
@ -8775,9 +8810,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
.group_type = group_overloaded,
};
imbalance = scale_load_down(NICE_0_LOAD) *
(sd->imbalance_pct-100) / 100;
do {
int local_group;
@ -8831,6 +8863,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
switch (local_sgs.group_type) {
case group_overloaded:
case group_fully_busy:
/* Calculate allowed imbalance based on load */
imbalance = scale_load_down(NICE_0_LOAD) *
(sd->imbalance_pct-100) / 100;
/*
* When comparing groups across NUMA domains, it's possible for
* the local domain to be very lightly loaded relative to the
@ -8887,7 +8924,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
* a real need of migration, periodic load balance will
* take care of it.
*/
if (local_sgs.idle_cpus)
if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
return NULL;
}
@ -8989,16 +9026,19 @@ next_group:
}
}
static inline long adjust_numa_imbalance(int imbalance, int nr_running)
#define NUMA_IMBALANCE_MIN 2
static inline long adjust_numa_imbalance(int imbalance,
int dst_running, int dst_weight)
{
unsigned int imbalance_min;
if (!allow_numa_imbalance(dst_running, dst_weight))
return imbalance;
/*
* Allow a small imbalance based on a simple pair of communicating
* tasks that remain local when the source domain is almost idle.
* tasks that remain local when the destination is lightly loaded.
*/
imbalance_min = 2;
if (nr_running <= imbalance_min)
if (imbalance <= NUMA_IMBALANCE_MIN)
return 0;
return imbalance;
@ -9101,9 +9141,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
}
/* Consider allowing a small imbalance between NUMA groups */
if (env->sd->flags & SD_NUMA)
if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance,
busiest->sum_nr_running);
busiest->sum_nr_running, busiest->group_weight);
}
return;
}
@ -10068,6 +10109,10 @@ static inline int find_new_ilb(void)
for_each_cpu_and(ilb, nohz.idle_cpus_mask,
housekeeping_cpumask(HK_FLAG_MISC)) {
if (ilb == smp_processor_id())
continue;
if (idle_cpu(ilb))
return ilb;
}
@ -10505,7 +10550,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
#endif /* CONFIG_NO_HZ_COMMON */
/*
* idle_balance is called by schedule() if this_cpu is about to become
* newidle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*
* Returns:
@ -11179,8 +11224,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
/*
* All the scheduling class methods:
*/
const struct sched_class fair_sched_class
__section("__fair_sched_class") = {
DEFINE_SCHED_CLASS(fair) = {
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,

View File

@ -364,6 +364,7 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns)
WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
WARN_ON_ONCE(!duration_ns);
WARN_ON_ONCE(current->mm);
rcu_sleep_check();
preempt_disable();
@ -401,7 +402,7 @@ void cpu_startup_entry(enum cpuhp_state state)
#ifdef CONFIG_SMP
static int
select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
select_task_rq_idle(struct task_struct *p, int cpu, int flags)
{
return task_cpu(p); /* IDLE tasks are never migrated */
}
@ -483,8 +484,8 @@ static void update_curr_idle(struct rq *rq)
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
const struct sched_class idle_sched_class
__section("__idle_sched_class") = {
DEFINE_SCHED_CLASS(idle) = {
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */

View File

@ -6,6 +6,134 @@
*/
#include "sched.h"
/*
* For documentation purposes, here are some membarrier ordering
* scenarios to keep in mind:
*
* A) Userspace thread execution after IPI vs membarrier's memory
* barrier before sending the IPI
*
* Userspace variables:
*
* int x = 0, y = 0;
*
* The memory barrier at the start of membarrier() on CPU0 is necessary in
* order to enforce the guarantee that any writes occurring on CPU0 before
* the membarrier() is executed will be visible to any code executing on
* CPU1 after the IPI-induced memory barrier:
*
* CPU0 CPU1
*
* x = 1
* membarrier():
* a: smp_mb()
* b: send IPI IPI-induced mb
* c: smp_mb()
* r2 = y
* y = 1
* barrier()
* r1 = x
*
* BUG_ON(r1 == 0 && r2 == 0)
*
* The write to y and load from x by CPU1 are unordered by the hardware,
* so it's possible to have "r1 = x" reordered before "y = 1" at any
* point after (b). If the memory barrier at (a) is omitted, then "x = 1"
* can be reordered after (a) (although not after (c)), so we get r1 == 0
* and r2 == 0. This violates the guarantee that membarrier() is
* supposed to provide.
*
* The timing of the memory barrier at (a) has to ensure that it executes
* before the IPI-induced memory barrier on CPU1.
*
* B) Userspace thread execution before IPI vs membarrier's memory
* barrier after completing the IPI
*
* Userspace variables:
*
* int x = 0, y = 0;
*
* The memory barrier at the end of membarrier() on CPU0 is necessary in
* order to enforce the guarantee that any writes occurring on CPU1 before
* the membarrier() is executed will be visible to any code executing on
* CPU0 after the membarrier():
*
* CPU0 CPU1
*
* x = 1
* barrier()
* y = 1
* r2 = y
* membarrier():
* a: smp_mb()
* b: send IPI IPI-induced mb
* c: smp_mb()
* r1 = x
* BUG_ON(r1 == 0 && r2 == 1)
*
* The writes to x and y are unordered by the hardware, so it's possible to
* have "r2 = 1" even though the write to x doesn't execute until (b). If
* the memory barrier at (c) is omitted then "r1 = x" can be reordered
* before (b) (although not before (a)), so we get "r1 = 0". This violates
* the guarantee that membarrier() is supposed to provide.
*
* The timing of the memory barrier at (c) has to ensure that it executes
* after the IPI-induced memory barrier on CPU1.
*
* C) Scheduling userspace thread -> kthread -> userspace thread vs membarrier
*
* CPU0 CPU1
*
* membarrier():
* a: smp_mb()
* d: switch to kthread (includes mb)
* b: read rq->curr->mm == NULL
* e: switch to user (includes mb)
* c: smp_mb()
*
* Using the scenario from (A), we can show that (a) needs to be paired
* with (e). Using the scenario from (B), we can show that (c) needs to
* be paired with (d).
*
* D) exit_mm vs membarrier
*
* Two thread groups are created, A and B. Thread group B is created by
* issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD.
* Let's assume we have a single thread within each thread group (Thread A
* and Thread B). Thread A runs on CPU0, Thread B runs on CPU1.
*
* CPU0 CPU1
*
* membarrier():
* a: smp_mb()
* exit_mm():
* d: smp_mb()
* e: current->mm = NULL
* b: read rq->curr->mm == NULL
* c: smp_mb()
*
* Using scenario (B), we can show that (c) needs to be paired with (d).
*
* E) kthread_{use,unuse}_mm vs membarrier
*
* CPU0 CPU1
*
* membarrier():
* a: smp_mb()
* kthread_unuse_mm()
* d: smp_mb()
* e: current->mm = NULL
* b: read rq->curr->mm == NULL
* kthread_use_mm()
* f: current->mm = mm
* g: smp_mb()
* c: smp_mb()
*
* Using the scenario from (A), we can show that (a) needs to be paired
* with (g). Using the scenario from (B), we can show that (c) needs to
* be paired with (d).
*/
/*
* Bitmask made from a "or" of all commands within enum membarrier_cmd,
* except MEMBARRIER_CMD_QUERY.
@ -101,6 +229,18 @@ void membarrier_exec_mmap(struct mm_struct *mm)
this_cpu_write(runqueues.membarrier_state, 0);
}
/*
 * membarrier_update_current_mm - refresh this CPU's cached membarrier state.
 * @next_mm: mm whose membarrier_state should be mirrored into the
 *           runqueue, or NULL to reset the cached state to 0.
 *
 * The runqueue field is only written when the value actually changes.
 */
void membarrier_update_current_mm(struct mm_struct *next_mm)
{
	struct rq *rq = this_rq();
	int new_state = next_mm ? atomic_read(&next_mm->membarrier_state) : 0;

	/* Skip the store when the cached value is already up to date. */
	if (READ_ONCE(rq->membarrier_state) != new_state)
		WRITE_ONCE(rq->membarrier_state, new_state);
}
static int membarrier_global_expedited(void)
{
int cpu;
@ -139,12 +279,11 @@ static int membarrier_global_expedited(void)
continue;
/*
* Skip the CPU if it runs a kernel thread. The scheduler
* leaves the prior task mm in place as an optimization when
* scheduling a kthread.
* Skip the CPU if it runs a kernel thread which is not using
* a task mm.
*/
p = rcu_dereference(cpu_rq(cpu)->curr);
if (p->flags & PF_KTHREAD)
if (!p->mm)
continue;
__cpumask_set_cpu(cpu, tmpmask);

View File

@ -89,8 +89,8 @@ void init_rt_rq(struct rt_rq *rt_rq)
__set_bit(MAX_RT_PRIO, array->bitmap);
#if defined CONFIG_SMP
rt_rq->highest_prio.curr = MAX_RT_PRIO;
rt_rq->highest_prio.next = MAX_RT_PRIO;
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->highest_prio.next = MAX_RT_PRIO-1;
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
@ -161,7 +161,7 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
{
struct rq *rq = cpu_rq(cpu);
rt_rq->highest_prio.curr = MAX_RT_PRIO;
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->rt_nr_boosted = 0;
rt_rq->rq = rq;
rt_rq->tg = tg;
@ -265,7 +265,7 @@ static void pull_rt_task(struct rq *this_rq);
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
/* Try to pull RT tasks here if we lower this rq's prio */
return rq->rt.highest_prio.curr > prev->prio;
return rq->online && rq->rt.highest_prio.curr > prev->prio;
}
static inline int rt_overloaded(struct rq *rq)
@ -393,8 +393,9 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
p = plist_first_entry(&rq->rt.pushable_tasks,
struct task_struct, pushable_tasks);
rq->rt.highest_prio.next = p->prio;
} else
rq->rt.highest_prio.next = MAX_RT_PRIO;
} else {
rq->rt.highest_prio.next = MAX_RT_PRIO-1;
}
}
#else
@ -1147,8 +1148,9 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
sched_find_first_bit(array->bitmap);
}
} else
rt_rq->highest_prio.curr = MAX_RT_PRIO;
} else {
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
}
dec_rt_prio_smp(rt_rq, prio, prev_prio);
}
@ -1428,14 +1430,14 @@ static void yield_task_rt(struct rq *rq)
static int find_lowest_rq(struct task_struct *task);
static int
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
select_task_rq_rt(struct task_struct *p, int cpu, int flags)
{
struct task_struct *curr;
struct rq *rq;
bool test;
/* For anything but wake ups, just return the task_cpu */
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
if (!(flags & (WF_TTWU | WF_FORK)))
goto out;
rq = cpu_rq(cpu);
@ -1658,7 +1660,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
cpumask_test_cpu(cpu, p->cpus_ptr))
cpumask_test_cpu(cpu, &p->cpus_mask))
return 1;
return 0;
@ -1752,7 +1754,7 @@ static int find_lowest_rq(struct task_struct *task)
return this_cpu;
}
best_cpu = cpumask_first_and(lowest_mask,
best_cpu = cpumask_any_and_distribute(lowest_mask,
sched_domain_span(sd));
if (best_cpu < nr_cpu_ids) {
rcu_read_unlock();
@ -1770,7 +1772,7 @@ static int find_lowest_rq(struct task_struct *task)
if (this_cpu != -1)
return this_cpu;
cpu = cpumask_any(lowest_mask);
cpu = cpumask_any_distribute(lowest_mask);
if (cpu < nr_cpu_ids)
return cpu;
@ -1811,7 +1813,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
* Also make sure that it wasn't scheduled on its rq.
*/
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
task_running(rq, task) ||
!rt_task(task) ||
!task_on_rq_queued(task))) {
@ -1859,7 +1861,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
* running task can migrate over to a CPU that is running a task
* of lesser priority.
*/
static int push_rt_task(struct rq *rq)
static int push_rt_task(struct rq *rq, bool pull)
{
struct task_struct *next_task;
struct rq *lowest_rq;
@ -1873,6 +1875,34 @@ static int push_rt_task(struct rq *rq)
return 0;
retry:
if (is_migration_disabled(next_task)) {
struct task_struct *push_task = NULL;
int cpu;
if (!pull || rq->push_busy)
return 0;
cpu = find_lowest_rq(rq->curr);
if (cpu == -1 || cpu == rq->cpu)
return 0;
/*
* We found a CPU with lower priority than @next_task, so @next_task
* should be running. However, we cannot migrate it to that other
* CPU; instead, attempt to push the task currently running on this
* CPU away.
*/
push_task = get_push_task(rq);
if (push_task) {
raw_spin_unlock(&rq->lock);
stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
push_task, &rq->push_work);
raw_spin_lock(&rq->lock);
}
return 0;
}
if (WARN_ON(next_task == rq->curr))
return 0;
@ -1927,12 +1957,10 @@ retry:
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, lowest_rq->cpu);
activate_task(lowest_rq, next_task, 0);
resched_curr(lowest_rq);
ret = 1;
resched_curr(lowest_rq);
double_unlock_balance(rq, lowest_rq);
out:
put_task_struct(next_task);
@ -1942,7 +1970,7 @@ out:
static void push_rt_tasks(struct rq *rq)
{
/* push_rt_task will return true if it moved an RT */
while (push_rt_task(rq))
while (push_rt_task(rq, false))
;
}
@ -2095,7 +2123,8 @@ void rto_push_irq_work_func(struct irq_work *work)
*/
if (has_pushable_tasks(rq)) {
raw_spin_lock(&rq->lock);
push_rt_tasks(rq);
while (push_rt_task(rq, true))
;
raw_spin_unlock(&rq->lock);
}
@ -2120,7 +2149,7 @@ static void pull_rt_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, cpu;
bool resched = false;
struct task_struct *p;
struct task_struct *p, *push_task;
struct rq *src_rq;
int rt_overload_count = rt_overloaded(this_rq);
@ -2167,6 +2196,7 @@ static void pull_rt_task(struct rq *this_rq)
* double_lock_balance, and another CPU could
* alter this_rq
*/
push_task = NULL;
double_lock_balance(this_rq, src_rq);
/*
@ -2194,11 +2224,14 @@ static void pull_rt_task(struct rq *this_rq)
if (p->prio < src_rq->curr->prio)
goto skip;
resched = true;
if (is_migration_disabled(p)) {
push_task = get_push_task(src_rq);
} else {
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
resched = true;
}
/*
* We continue with the search, just in
* case there's an even higher prio task
@ -2208,6 +2241,13 @@ static void pull_rt_task(struct rq *this_rq)
}
skip:
double_unlock_balance(this_rq, src_rq);
if (push_task) {
raw_spin_unlock(&this_rq->lock);
stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
push_task, &src_rq->push_work);
raw_spin_lock(&this_rq->lock);
}
}
if (resched)
@ -2429,8 +2469,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
return 0;
}
const struct sched_class rt_sched_class
__section("__rt_sched_class") = {
DEFINE_SCHED_CLASS(rt) = {
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
@ -2449,6 +2489,7 @@ const struct sched_class rt_sched_class
.rq_offline = rq_offline_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
.find_lock_rq = find_lock_lowest_rq,
#endif
.task_tick = task_tick_rt,

View File

@ -67,7 +67,6 @@
#include <linux/tsacct_kern.h>
#include <asm/tlb.h>
#include <asm-generic/vmlinux.lds.h>
#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
@ -257,30 +256,6 @@ struct rt_bandwidth {
void __dl_clear_params(struct task_struct *p);
/*
* To keep the bandwidth of -deadline tasks and groups under control
* we need some place where:
* - store the maximum -deadline bandwidth of the system (the group);
* - cache the fraction of that bandwidth that is currently allocated.
*
* This is all done in the data structure below. It is similar to the
* one used for RT-throttling (rt_bandwidth), with the main difference
* that, since here we are only interested in admission control, we
* do not decrease any runtime while the group "executes", neither we
* need a timer to replenish it.
*
* With respect to SMP, the bandwidth is given on a per-CPU basis,
* meaning that:
* - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
* - dl_total_bw array contains, in the i-eth element, the currently
* allocated bandwidth on the i-eth CPU.
* Moreover, groups consume bandwidth on each CPU, while tasks only
* consume bandwidth on the CPU they're running on.
* Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
* that will be shown the next time the proc or cgroup controls will
* be red. It on its turn can be changed by writing on its own
* control.
*/
struct dl_bandwidth {
raw_spinlock_t dl_runtime_lock;
u64 dl_runtime;
@ -292,6 +267,24 @@ static inline int dl_bandwidth_enabled(void)
return sysctl_sched_rt_runtime >= 0;
}
/*
* To keep the bandwidth of -deadline tasks under control
* we need some place where:
* - store the maximum -deadline bandwidth of each cpu;
* - cache the fraction of bandwidth that is currently allocated in
* each root domain;
*
* This is all done in the data structure below. It is similar to the
* one used for RT-throttling (rt_bandwidth), with the main difference
* that, since here we are only interested in admission control, we
* do not decrease any runtime while the group "executes", neither we
* need a timer to replenish it.
*
* With respect to SMP, bandwidth is given on a per root domain basis,
* meaning that:
* - bw (< 100%) is the deadline bandwidth of each CPU;
* - total_bw is the currently allocated bandwidth in each root domain;
*/
struct dl_bw {
raw_spinlock_t lock;
u64 bw;
@ -801,6 +794,15 @@ struct root_domain {
struct dl_bw dl_bw;
struct cpudl cpudl;
/*
* Indicates whether a root_domain's dl_bw has been checked or
* updated. It is a monotonically increasing value.
*
* Corner cases such as wrap-around would be dangerous, but given
* that u64 is 'big enough', that shouldn't be a concern.
*/
u64 visit_gen;
#ifdef HAVE_RT_PUSH_IPI
/*
* For IPI pull requests, loop across the rto_mask.
@ -973,6 +975,7 @@ struct rq {
unsigned long cpu_capacity_orig;
struct callback_head *balance_callback;
unsigned char balance_flags;
unsigned char nohz_idle_balance;
unsigned char idle_balance;
@ -1003,6 +1006,10 @@ struct rq {
/* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost;
#ifdef CONFIG_HOTPLUG_CPU
struct rcuwait hotplug_wait;
#endif
#endif /* CONFIG_SMP */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@ -1048,6 +1055,12 @@ struct rq {
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
#endif
#ifdef CONFIG_SMP
unsigned int nr_pinned;
#endif
unsigned int push_busy;
struct cpu_stop_work push_work;
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@ -1075,6 +1088,16 @@ static inline int cpu_of(struct rq *rq)
#endif
}
#define MDF_PUSH 0x01
/*
 * Report whether migration is currently disabled for @p.
 *
 * Without CONFIG_SMP this is always false; with CONFIG_SMP the answer is
 * taken from p->migration_disabled.
 */
static inline bool is_migration_disabled(struct task_struct *p)
{
#ifndef CONFIG_SMP
	return false;
#else
	return p->migration_disabled;
#endif
}
#ifdef CONFIG_SCHED_SMT
extern void __update_idle_core(struct rq *rq);
@ -1221,6 +1244,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
#endif
#ifdef CONFIG_SMP
SCHED_WARN_ON(rq->balance_callback);
#endif
}
static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
@ -1382,6 +1408,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
#ifdef CONFIG_SMP
#define BALANCE_WORK 0x01
#define BALANCE_PUSH 0x02
static inline void
queue_balance_callback(struct rq *rq,
struct callback_head *head,
@ -1389,12 +1418,13 @@ queue_balance_callback(struct rq *rq,
{
lockdep_assert_held(&rq->lock);
if (unlikely(head->next))
if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH)))
return;
head->func = (void (*)(struct callback_head *))func;
head->next = rq->balance_callback;
rq->balance_callback = head;
rq->balance_flags |= BALANCE_WORK;
}
#define rcu_dereference_check_sched_domain(p) \
@ -1714,13 +1744,20 @@ static inline int task_on_rq_migrating(struct task_struct *p)
return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
}
/*
* wake flags
*/
#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
#define WF_FORK 0x02 /* Child wakeup after fork */
#define WF_MIGRATED 0x04 /* Internal use, task got migrated */
#define WF_ON_CPU 0x08 /* Wakee is on_cpu */
/* Wake flags. The first three directly map to some SD flag value */
#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */
#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
#define WF_ON_CPU 0x40 /* Wakee is on_cpu */
#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
static_assert(WF_FORK == SD_BALANCE_FORK);
static_assert(WF_TTWU == SD_BALANCE_WAKE);
#endif
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@ -1796,16 +1833,19 @@ struct sched_class {
#ifdef CONFIG_SMP
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
void (*set_cpus_allowed)(struct task_struct *p,
const struct cpumask *newmask);
const struct cpumask *newmask,
u32 flags);
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
#endif
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
@ -1833,7 +1873,7 @@ struct sched_class {
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group)(struct task_struct *p, int type);
#endif
} __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */
};
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
@ -1847,6 +1887,20 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
next->sched_class->set_next_task(rq, next, false);
}
/*
* Helper to define a sched_class instance; each one is placed in a separate
* section which is ordered by the linker script:
*
* include/asm-generic/vmlinux.lds.h
*
* Also enforce alignment on the instance, not the type, to guarantee layout.
*/
#define DEFINE_SCHED_CLASS(name) \
const struct sched_class name##_sched_class \
__aligned(__alignof__(struct sched_class)) \
__section("__" #name "_sched_class")
/* Defined in include/asm-generic/vmlinux.lds.h */
extern struct sched_class __begin_sched_classes[];
extern struct sched_class __end_sched_classes[];
@ -1889,13 +1943,35 @@ static inline bool sched_fair_runnable(struct rq *rq)
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
extern struct task_struct *pick_next_task_idle(struct rq *rq);
#define SCA_CHECK 0x01
#define SCA_MIGRATE_DISABLE 0x02
#define SCA_MIGRATE_ENABLE 0x04
#ifdef CONFIG_SMP
extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
/*
 * Claim the task currently running on @rq as the target of a push.
 *
 * Must be called with rq->lock held (checked via lockdep).
 *
 * Returns NULL when rq->push_busy is already set or when the current task
 * is allowed on exactly one CPU. Otherwise marks the runqueue as push-busy
 * and returns the result of get_task_struct() on the current task.
 */
static inline struct task_struct *get_push_task(struct rq *rq)
{
	struct task_struct *curr = rq->curr;

	lockdep_assert_held(&rq->lock);

	/* Nothing to hand out: a push is pending, or the task cannot move. */
	if (rq->push_busy || curr->nr_cpus_allowed == 1)
		return NULL;

	rq->push_busy = true;

	return get_task_struct(curr);
}
extern int push_cpu_stop(void *arg);
#endif

View File

@ -11,7 +11,7 @@
#ifdef CONFIG_SMP
static int
select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
select_task_rq_stop(struct task_struct *p, int cpu, int flags)
{
return task_cpu(p); /* stop tasks as never migrate */
}
@ -109,8 +109,7 @@ static void update_curr_stop(struct rq *rq)
/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
const struct sched_class stop_sched_class
__section("__stop_sched_class") = {
DEFINE_SCHED_CLASS(stop) = {
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,

View File

@ -211,6 +211,15 @@ unsigned int sysctl_sched_energy_aware = 1;
DEFINE_MUTEX(sched_energy_mutex);
bool sched_energy_update;
/*
 * Rebuild the scheduling domains with sched_energy_update raised for the
 * duration of the rebuild.
 *
 * The flag is set before and cleared after rebuild_sched_domains(), and the
 * whole sequence runs under sched_energy_mutex.
 */
void rebuild_sched_domains_energy(void)
{
mutex_lock(&sched_energy_mutex);
/* Flag is only true while the rebuild below is in progress. */
sched_energy_update = true;
rebuild_sched_domains();
sched_energy_update = false;
mutex_unlock(&sched_energy_mutex);
}
#ifdef CONFIG_PROC_SYSCTL
int sched_energy_aware_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
@ -223,13 +232,8 @@ int sched_energy_aware_handler(struct ctl_table *table, int write,
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
state = static_branch_unlikely(&sched_energy_present);
if (state != sysctl_sched_energy_aware) {
mutex_lock(&sched_energy_mutex);
sched_energy_update = 1;
rebuild_sched_domains();
sched_energy_update = 0;
mutex_unlock(&sched_energy_mutex);
}
if (state != sysctl_sched_energy_aware)
rebuild_sched_domains_energy();
}
return ret;
@ -324,6 +328,7 @@ static void sched_energy_set(bool has_eas)
* 3. no SMT is detected.
* 4. the EM complexity is low enough to keep scheduling overheads low;
* 5. schedutil is driving the frequency of all CPUs of the rd;
* 6. frequency invariance support is present;
*
* The complexity of the Energy Model is defined as:
*
@ -372,6 +377,14 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
goto free;
}
if (!arch_scale_freq_invariant()) {
if (sched_debug()) {
pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported",
cpumask_pr_args(cpu_map));
}
goto free;
}
for_each_cpu(i, cpu_map) {
/* Skip already covered CPUs. */
if (find_pd(pd, i))
@ -516,6 +529,7 @@ static int init_rootdomain(struct root_domain *rd)
init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
#endif
rd->visit_gen = 0;
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_rto_mask;
@ -674,6 +688,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
int numa_distance = 0;
/* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) {
@ -705,6 +720,38 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
sd->child = NULL;
}
for (tmp = sd; tmp; tmp = tmp->parent)
numa_distance += !!(tmp->flags & SD_NUMA);
/*
* FIXME: Diameter >=3 is misrepresented.
*
* Smallest diameter=3 topology is:
*
* node 0 1 2 3
* 0: 10 20 30 40
* 1: 20 10 20 30
* 2: 30 20 10 20
* 3: 40 30 20 10
*
* 0 --- 1 --- 2 --- 3
*
* NUMA-3 0-3 N/A N/A 0-3
* groups: {0-2},{1-3} {1-3},{0-2}
*
* NUMA-2 0-2 0-3 0-3 1-3
* groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2}
*
* NUMA-1 0-1 0-2 1-3 2-3
* groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2}
*
* NUMA-0 0 1 2 3
*
* The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
* group span isn't a subset of the domain span.
*/
WARN_ONCE(numa_distance > 2, "Shortest NUMA path spans too many nodes\n");
sched_domain_debug(sd, cpu);
rq_attach_root(rq, rd);

View File

@ -27,7 +27,7 @@
#include "smpboot.h"
#include "sched/smp.h"
#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK)
#define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK)
struct call_function_data {
call_single_data_t __percpu *csd;
@ -130,7 +130,7 @@ static __always_inline int csd_lock_wait_getcpu(call_single_data_t *csd)
csd_type = CSD_TYPE(csd);
if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC)
return csd->dst; /* Other CSD_TYPE_ values might not have ->dst. */
return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */
return -1;
}
@ -146,7 +146,7 @@ static __always_inline bool csd_lock_wait_toolong(call_single_data_t *csd, u64 t
bool firsttime;
u64 ts2, ts_delta;
call_single_data_t *cpu_cur_csd;
unsigned int flags = READ_ONCE(csd->flags);
unsigned int flags = READ_ONCE(csd->node.u_flags);
if (!(flags & CSD_FLAG_LOCK)) {
if (!unlikely(*bug_id))
@ -224,14 +224,14 @@ static void csd_lock_record(call_single_data_t *csd)
static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
#endif
static __always_inline void csd_lock(call_single_data_t *csd)
{
csd_lock_wait(csd);
csd->flags |= CSD_FLAG_LOCK;
csd->node.u_flags |= CSD_FLAG_LOCK;
/*
* prevent CPU from reordering the above assignment
@ -243,12 +243,12 @@ static __always_inline void csd_lock(call_single_data_t *csd)
static __always_inline void csd_unlock(call_single_data_t *csd)
{
WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
/*
* ensure we're all done before releasing data:
*/
smp_store_release(&csd->flags, 0);
smp_store_release(&csd->node.u_flags, 0);
}
static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
@ -300,7 +300,7 @@ static int generic_exec_single(int cpu, call_single_data_t *csd)
return -ENXIO;
}
__smp_call_single_queue(cpu, &csd->llist);
__smp_call_single_queue(cpu, &csd->node.llist);
return 0;
}
@ -353,7 +353,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
* We don't have to use the _safe() variant here
* because we are not invoking the IPI handlers yet.
*/
llist_for_each_entry(csd, entry, llist) {
llist_for_each_entry(csd, entry, node.llist) {
switch (CSD_TYPE(csd)) {
case CSD_TYPE_ASYNC:
case CSD_TYPE_SYNC:
@ -378,16 +378,16 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
* First; run all SYNC callbacks, people are waiting for us.
*/
prev = NULL;
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
/* Do we wait until *after* callback? */
if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
smp_call_func_t func = csd->func;
void *info = csd->info;
if (prev) {
prev->next = &csd_next->llist;
prev->next = &csd_next->node.llist;
} else {
entry = &csd_next->llist;
entry = &csd_next->node.llist;
}
csd_lock_record(csd);
@ -395,7 +395,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
csd_unlock(csd);
csd_lock_record(NULL);
} else {
prev = &csd->llist;
prev = &csd->node.llist;
}
}
@ -406,14 +406,14 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
* Second; run all !SYNC callbacks.
*/
prev = NULL;
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
int type = CSD_TYPE(csd);
if (type != CSD_TYPE_TTWU) {
if (prev) {
prev->next = &csd_next->llist;
prev->next = &csd_next->node.llist;
} else {
entry = &csd_next->llist;
entry = &csd_next->node.llist;
}
if (type == CSD_TYPE_ASYNC) {
@ -429,7 +429,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
}
} else {
prev = &csd->llist;
prev = &csd->node.llist;
}
}
@ -465,7 +465,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
{
call_single_data_t *csd;
call_single_data_t csd_stack = {
.flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
.node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, },
};
int this_cpu;
int err;
@ -502,8 +502,8 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
csd->func = func;
csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
csd->src = smp_processor_id();
csd->dst = cpu;
csd->node.src = smp_processor_id();
csd->node.dst = cpu;
#endif
err = generic_exec_single(cpu, csd);
@ -544,12 +544,12 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
preempt_disable();
if (csd->flags & CSD_FLAG_LOCK) {
if (csd->node.u_flags & CSD_FLAG_LOCK) {
err = -EBUSY;
goto out;
}
csd->flags = CSD_FLAG_LOCK;
csd->node.u_flags = CSD_FLAG_LOCK;
smp_wmb();
err = generic_exec_single(cpu, csd);
@ -667,14 +667,14 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
csd_lock(csd);
if (wait)
csd->flags |= CSD_TYPE_SYNC;
csd->node.u_flags |= CSD_TYPE_SYNC;
csd->func = func;
csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
csd->src = smp_processor_id();
csd->dst = cpu;
csd->node.src = smp_processor_id();
csd->node.dst = cpu;
#endif
if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu)))
__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
}

View File

@ -42,11 +42,27 @@ struct cpu_stopper {
struct list_head works; /* list of pending works */
struct cpu_stop_work stop_work; /* for stop_cpus */
unsigned long caller;
cpu_stop_fn_t fn;
};
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
static bool stop_machine_initialized = false;
/*
 * Print the stop function and its caller for @task, prefixed with @log_lvl,
 * but only when @task is the stopper thread of the CPU it is on. For any
 * other task nothing is printed.
 */
void print_stop_info(const char *log_lvl, struct task_struct *task)
{
	/*
	 * If @task is a stopper task, it cannot migrate and task_cpu() is
	 * stable.
	 */
	struct cpu_stopper *st = per_cpu_ptr(&cpu_stopper, task_cpu(task));

	if (task == st->thread)
		printk("%sStopper: %pS <- %pS\n", log_lvl, st->fn, (void *)st->caller);
}
/* static data for stop_cpus */
static DEFINE_MUTEX(stop_cpus_mutex);
static bool stop_cpus_in_progress;
@ -123,7 +139,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{
struct cpu_stop_done done;
struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ };
cpu_stop_init_done(&done, 1);
if (!cpu_stop_queue_work(cpu, &work))
@ -331,7 +347,8 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
work1 = work2 = (struct cpu_stop_work){
.fn = multi_cpu_stop,
.arg = &msdata,
.done = &done
.done = &done,
.caller = _RET_IP_,
};
cpu_stop_init_done(&done, 2);
@ -367,7 +384,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
struct cpu_stop_work *work_buf)
{
*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
return cpu_stop_queue_work(cpu, work_buf);
}
@ -487,6 +504,8 @@ repeat:
int ret;
/* cpu stop callbacks must not sleep, make in_atomic() == T */
stopper->caller = work->caller;
stopper->fn = fn;
preempt_count_inc();
ret = fn(arg);
if (done) {
@ -495,6 +514,8 @@ repeat:
cpu_stop_signal_done(done);
}
preempt_count_dec();
stopper->fn = NULL;
stopper->caller = 0;
WARN_ONCE(preempt_count(),
"cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);
goto repeat;

View File

@ -293,10 +293,8 @@ static void nohz_full_kick_func(struct irq_work *work)
/* Empty, the tick restart happens on tick_nohz_irq_exit() */
}
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
.func = nohz_full_kick_func,
.flags = ATOMIC_INIT(IRQ_WORK_HARD_IRQ),
};
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) =
IRQ_WORK_INIT_HARD(nohz_full_kick_func);
/*
* Kick this CPU if it's full dynticks in order to force it to

View File

@ -1096,7 +1096,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
return -EINVAL;
work = this_cpu_ptr(&send_signal_work);
if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY)
if (irq_work_is_busy(&work->irq_work))
return -EBUSY;
/* Add the current task, which is the target of sending signal,

View File

@ -4908,6 +4908,10 @@ static void unbind_workers(int cpu)
pool->flags |= POOL_DISASSOCIATED;
raw_spin_unlock_irq(&pool->lock);
for_each_pool_worker(worker, pool)
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);
mutex_unlock(&wq_pool_attach_mutex);
/*

View File

@ -267,3 +267,21 @@ int cpumask_any_and_distribute(const struct cpumask *src1p,
return next;
}
EXPORT_SYMBOL(cpumask_any_and_distribute);
/**
 * cpumask_any_distribute - pick a CPU from @srcp, advancing between calls
 * @srcp: the cpumask to select from
 *
 * Starts the search after the CPU remembered in the per-CPU variable
 * distribute_cpu_mask_prev and falls back to the first CPU in @srcp when
 * that search runs off the end. A successful pick is remembered for the
 * next call.
 *
 * Returns the selected CPU, or a value >= nr_cpu_ids when neither lookup
 * finds a CPU.
 */
int cpumask_any_distribute(const struct cpumask *srcp)
{
	int cpu;

	/* NOTE: our first selection will skip 0. */
	cpu = cpumask_next(__this_cpu_read(distribute_cpu_mask_prev), srcp);
	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(srcp);

	/* Nothing found: leave the remembered position untouched. */
	if (cpu >= nr_cpu_ids)
		return cpu;

	__this_cpu_write(distribute_cpu_mask_prev, cpu);
	return cpu;
}
EXPORT_SYMBOL(cpumask_any_distribute);

View File

@ -12,6 +12,7 @@
#include <linux/atomic.h>
#include <linux/kexec.h>
#include <linux/utsname.h>
#include <linux/stop_machine.h>
static char dump_stack_arch_desc_str[128];
@ -57,6 +58,7 @@ void dump_stack_print_info(const char *log_lvl)
log_lvl, dump_stack_arch_desc_str);
print_worker_info(log_lvl, current);
print_stop_info(log_lvl, current);
}
/**

View File

@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2)
if (current->nr_cpus_allowed == 1)
goto out;
#ifdef CONFIG_SMP
if (current->migration_disabled)
goto out;
#endif
/*
* It is valid to assume CPU-locality during early bootup:
*/

View File

@ -11179,8 +11179,7 @@ static int __init net_dev_init(void)
INIT_LIST_HEAD(&sd->poll_list);
sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
sd->csd.func = rps_trigger_softirq;
sd->csd.info = sd;
INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
sd->cpu = i;
#endif